123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478 |
- import json
- import os
- import re
- import time
- from bs4 import BeautifulSoup
- import pandas as pd
- import requests
def req_iata_info():
    """Enrich airports of Japan, Australia and New Zealand with scraped details.

    Reads ``airport_other_city.json`` and the ``city`` sheet of ``city.xlsx``.
    Every record whose ``country_name`` is one of the three supported
    countries and whose ``iata`` is non-empty is looked up on
    ``http://www.yuntisoft.com/airport/<iata>.html``; the ICAO code (only when
    the record has none), the airport names and the city names found in the
    ``ul.port_items`` list are copied into the record.

    The ``c`` field of a processed record is set to:

    * ``'+'`` when the scraped Chinese city name is not present in the city
      sheet for that country code,
    * the Chinese city name when it is present,
    * ``'?'`` when no city information was scraped (and ``c`` was not set).

    Processed records are written to ``<country_name>_iata.json``; all other
    records are written back to ``airport_other_city.json``.  Records whose
    processing raised are kept in ``airport_other_city.json`` so they are not
    lost and can be retried on a later run.
    """
    country_codes = {'日本': 900085, '澳大利亚': 900216, '新西兰': 900222}
    city_df = pd.read_excel('city.xlsx', sheet_name='city')
    with open("airport_other_city.json", "r", encoding="utf-8") as f:
        origin = json.load(f)

    by_country = {name: [] for name in country_codes}
    remaining = []
    for item in origin:
        try:
            country = item.get('country_name')
            if not country or not item.get('iata') or country not in country_codes:
                remaining.append(item)
                continue

            uri = 'http://www.yuntisoft.com/airport/{}.html'.format(item['iata'])
            # A timeout keeps one stalled connection from hanging the whole run.
            page = BeautifulSoup(requests.get(uri, timeout=30).text, features="html.parser")
            ul = page.find(name="ul", attrs={'class': 'port_items'})
            for li in (ul.find_all(name="li") if ul else []):
                # Each entry looks like "<title>:<value>".
                title, sep, value = li.text.partition(':')
                title = title.strip()
                value = value.strip()
                if not sep or not title or not value:
                    continue

                if title == 'ICAO(四字码)':
                    # Only fill the ICAO code in; never overwrite an existing one.
                    if not item.get('icao'):
                        item['icao'] = value
                elif title == '机场名':
                    # "<Chinese name> <English name>"
                    cn_name, found, en_name = value.partition(' ')
                    if found and cn_name:
                        item['airport_cn_name'] = cn_name
                        item['airport_en_name'] = en_name
                elif title == '所属城市':
                    cn_name, found, en_name = value.partition(' ')
                    if found and cn_name:
                        item['city_cn_name'] = cn_name
                        item['city_en_name'] = en_name
                        known = city_df.loc[
                            (city_df['CountryCode'] == country_codes[country])
                            & (city_df['ZHName'] == cn_name)
                        ]
                        item['c'] = '+' if known.empty else cn_name

            item.setdefault('c', '?')
            print('{}'.format(item))
            # Appended last so a failure above leaves the record in `remaining` only.
            by_country[country].append(item)
        except Exception as e:
            print(f"Exception in iata process: {e=}, {type(e)=}")
            remaining.append(item)

    for key, records in by_country.items():
        if records:
            with open('{}_iata.json'.format(key), 'w', encoding="utf-8") as f:
                f.write(json.dumps(records, ensure_ascii=False))
    if remaining:
        with open('airport_other_city.json', 'w', encoding="utf-8") as f:
            f.write(json.dumps(remaining, ensure_ascii=False))
def req_iata_for_city(pth='/Users/marion/Desktop/airport_change'):
    """Geocode the cities flagged with ``c == '+'`` in a directory of JSON files.

    Every ``*.json`` file directly inside ``pth`` is loaded as a list of
    records.  For each record whose ``c`` field equals ``'+'`` the address
    ``"<country_name> <city_cn_name>"`` is sent to the Google Geocoding API
    (``language=en``) and the first result, if any, is collected.  The
    collected entries are written to ``city_new_geo.json`` in the current
    working directory.

    Args:
        pth: Directory that holds the JSON files to scan.  Defaults to the
            path that was previously hard-coded.

    The API key is taken from the ``GOOGLE_MAPS_API_KEY`` environment variable
    when set, otherwise the key embedded below is used.
    """
    # NOTE(review): this key is committed in source; consider rotating it and
    # supplying it through the environment only.
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY', 'AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo')
    geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'

    geocoded = []
    for file in os.listdir(pth):
        file_path = os.path.join(pth, file)
        if not os.path.isfile(file_path) or os.path.splitext(file)[-1] != '.json':
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            origin = json.load(f)

        requested = 0
        for item in origin:
            try:
                if item.get('c') != '+':
                    continue
                address = '{} {}'.format(item['country_name'], item['city_cn_name'])
                # `params` URL-encodes the address (spaces, CJK, '&', '#', ...).
                resp = requests.get(
                    geocode_url,
                    params={'address': address, 'language': 'en', 'key': api_key},
                    timeout=30,
                )
                results = json.loads(resp.text).get('results')
                if results and results[0]:
                    new_item = {
                        'country_name': item['country_name'],
                        'ZHName': item['city_cn_name'],
                        'geo_en': results[0],
                    }
                    geocoded.append(new_item)
                    print('{}'.format(new_item))
                # Pause for a second after every fifth request.
                requested += 1
                if requested % 5 == 0:
                    time.sleep(1)
            except Exception as e:
                print(f"Exception in iata for city process: {e=}, {type(e)=}")

    if geocoded:
        with open('city_new_geo.json', 'w', encoding="utf-8") as f:
            f.write(json.dumps(geocoded, ensure_ascii=False))

def req_geocode_for_city():
    """Geocode every city listed in ``city.xlsx`` and save the results.

    The ``country``, ``province`` and ``city`` sheets of ``city.xlsx`` are
    read in a single pass.  Rows of the city sheet whose ``CountryCode`` or
    ``ProvinceCode`` equals 1 are skipped.  For rows with ``CountryCode == 10``
    the address is ``"中国 <province ZHName> <city ZHName>"`` (province looked up
    by ``ProvinceCode``); for all other rows it is
    ``"<country ZHName> <city ZHName>"`` (country looked up by
    ``CountryCode``).  Rows whose lookup yields nothing are skipped.

    The first Google Geocoding result (``language=en``) for each address is
    stored together with the row's ``CityCode`` and ``ZHName`` and the list is
    written to ``city_geo.json``.

    The API key is taken from the ``GOOGLE_MAPS_API_KEY`` environment variable
    when set, otherwise the key embedded below is used.
    """
    # NOTE(review): this key is committed in source; consider rotating it and
    # supplying it through the environment only.
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY', 'AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo')
    geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'

    # One read_excel call with a list of sheet names parses the workbook once
    # and returns a dict of DataFrames keyed by sheet name.
    sheets = pd.read_excel('city.xlsx', sheet_name=['country', 'province', 'city'])
    country_df = sheets['country']
    province_df = sheets['province']
    city_df = sheets['city']

    geocoded = []
    requested = 0
    for idx, row in city_df.iterrows():
        try:
            if row['CountryCode'] == 1 or row['ProvinceCode'] == 1:
                continue

            address = ''
            if row['CountryCode'] == 10:
                provinces = province_df.loc[
                    province_df['ProvinceCode'] == row['ProvinceCode'], ['ZHName']
                ]
                if not provinces.empty:
                    address = '中国 {} {}'.format(provinces.iloc[0]['ZHName'], row['ZHName'])
            else:
                countries = country_df.loc[
                    country_df['CountryCode'] == row['CountryCode'], ['ZHName']
                ]
                if not countries.empty:
                    address = '{} {}'.format(countries.iloc[0]['ZHName'], row['ZHName'])
            if address == '':
                continue

            # `params` URL-encodes the address (spaces, CJK, '&', '#', ...).
            resp = requests.get(
                geocode_url,
                params={'address': address, 'language': 'en', 'key': api_key},
                timeout=30,
            )
            results = json.loads(resp.text).get('results')
            if results and results[0]:
                new_item = {
                    'CityCode': row['CityCode'],
                    'ZHName': row['ZHName'],
                    'geo_en': results[0],
                }
                geocoded.append(new_item)
                print('{} {}'.format(idx, new_item))
            # Throttle on requests actually sent, not on the row index, so
            # skipped rows cannot cause the pause to be missed.
            requested += 1
            if requested % 5 == 0:
                time.sleep(1)
        except Exception as e:
            print(f"Exception in geocode for city process: {e=}, {type(e)=}")

    if geocoded:
        with open('city_geo.json', 'w', encoding="utf-8") as f:
            f.write(json.dumps(geocoded, ensure_ascii=False))
# Clean up erroneous data
def clean_geocode():
    """Repair misplaced fields in ``airport.json`` and drop bad geocode results.

    The file is loaded, every record is fixed in place, and the file is
    rewritten.  For each record:

    1. If the country-code field is not two uppercase letters, the first other
       string field (ignoring ``geo_*`` fields) that *is* two uppercase letters
       is swapped with it.
    2. If the ICAO field is non-empty, is not four uppercase letters and the
       English airport name is empty, the ICAO field's value is copied into
       the English airport name.
    3. For each of ``geo_en`` / ``geo_zh-CN``: when the first type of an
       address component is ``country`` and its ``short_name`` differs from
       the record's country code, the geocode result is replaced with ``{}``.
       Records without a country code skip this step.

    Missing keys and ``null`` values are treated as empty strings rather than
    raising.
    """
    country_key = '国家(地区)代码'
    icao_key = '机场四字码'
    name_en_key = '机场名称(英文)'

    with open("airport.json", "r", encoding="utf-8") as f:
        arr = json.load(f)

    langs = ['en', 'zh-CN']
    pattern3 = re.compile(r'^[A-Z]{2}$')
    pattern4 = re.compile(r'^[A-Z]{4}$')

    for item in arr:
        misplaced = item.get(country_key) or ''
        if not pattern3.match(misplaced):
            # Iterate over a snapshot: the country key may be added below.
            for k, v in list(item.items()):
                if k.startswith('geo_') or k == country_key or not isinstance(v, str):
                    continue
                if pattern3.match(v):
                    item[country_key] = v
                    item[k] = misplaced
                    # Stop at the first match; continuing would overwrite the
                    # code just recovered and duplicate `misplaced`.
                    break

        icao = item.get(icao_key) or ''
        if icao and not pattern4.match(icao) and not item.get(name_en_key):
            item[name_en_key] = icao

        code = item.get(country_key)
        if not code:
            continue

        for lang in langs:
            k = 'geo_{}'.format(lang)
            g = item.get(k)
            if not g or not isinstance(g, dict):
                continue
            for c in g.get('address_components') or []:
                types = c.get('types')
                if not types or 'short_name' not in c:
                    continue
                if types[0] == 'country' and c['short_name'] != code:
                    # Geocoder resolved to a different country: discard.
                    item[k] = {}
                    break

    with open('airport.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(arr, ensure_ascii=False))
# Check data
def test_geocode():
    """Print a quality report for the geocode data stored in ``airport.json``.

    Records without an IATA code are only counted.  For every other record
    the ``geo_en`` and ``geo_zh-CN`` fields are inspected and the record's
    IATA code is added to:

    * the "empty" set when neither field has ``geometry.location.lat``/``lng``,
    * the "country mismatch" set when the first type of an address component
      is ``country`` and its ``short_name`` differs from the record's country
      code,
    * the "fetch failed" set when every ``geo_*`` field is absent.

    Nothing is returned; the counts and the collected codes are printed.
    """
    with open("airport.json", "r", encoding="utf-8") as f:
        arr = json.load(f)

    langs = ['en', 'zh-CN']
    count_empty_code = 0
    # Dicts (code -> 1) are kept so the printed output format is unchanged.
    flag_empty = {}
    flag_cc = {}
    flag_fail = {}

    for item in arr:
        code = item.get('机场三字码')
        if not code:
            count_empty_code += 1
            continue

        has_location = False
        missing = 0
        for lang in langs:
            k = 'geo_{}'.format(lang)
            if k not in item:
                missing += 1
                continue
            g = item[k]
            if not g:
                continue

            location = (g.get('geometry') or {}).get('location') or {}
            if 'lat' in location and 'lng' in location:
                has_location = True

            for c in g.get('address_components') or []:
                if not c.get('types') or 'short_name' not in c:
                    continue
                # .get(): a record without a country code is reported as a
                # mismatch instead of aborting the whole report.
                if c['types'][0] == 'country' and c['short_name'] != item.get('国家(地区)代码'):
                    flag_cc[code] = 1
                    break

        if not has_location:
            flag_empty[code] = 1
        if missing >= len(langs):
            flag_fail[code] = 1

    print("无三字码的数据数量: {}".format(count_empty_code))
    print("Google地图信息为空的数据数量: {}".format(len(flag_empty)))
    if flag_empty:
        print(flag_empty)
    print("Google地图信息国家不一致的数据数量: {}".format(len(flag_cc)))
    if flag_cc:
        print(flag_cc)
    print("无Google地图信息(拉数据失败)的数据数量: {}".format(len(flag_fail)))
    if flag_fail:
        print(flag_fail)
# Re-fetch data from the Google Geocoding API for records keyed by airport IATA code
def req_geocode(is_skip_empty=False):
    """Fill in missing geocode results for the airports in ``airport.json``.

    Records without an IATA code are skipped.  A record is considered done
    (and skipped) when any of its ``geo_en`` / ``geo_zh-CN`` fields contains
    ``geometry.location.lat`` and ``lng``.  For every other record the address
    ``"<country name or code> <English airport name or IATA code> Airport,<English city name>"``
    is geocoded once per language and the first result (or ``{}`` when there
    is none) is stored in ``geo_<lang>``.  The whole list is written back to
    ``airport.json`` at the end.

    Args:
        is_skip_empty: When true, a record whose ``geo_*`` field is an empty
            dict is treated as done and is not fetched again.  Defaults to
            ``False`` (the value that was previously hard-coded).

    The API key is taken from the ``GOOGLE_MAPS_API_KEY`` environment variable
    when set, otherwise the key embedded below is used.
    """
    # NOTE(review): this key is committed in source; consider rotating it and
    # supplying it through the environment only.
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY', 'AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo')
    geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'

    with open("airport.json", "r", encoding="utf-8") as f:
        arr = json.load(f)

    langs = ['en', 'zh-CN']
    fetched = 0
    for item in arr:
        try:
            if not item.get('机场三字码'):
                continue

            done = False
            for lang in langs:
                k = 'geo_{}'.format(lang)
                if k not in item:
                    continue
                g = item[k]
                if not g:
                    done = done or is_skip_empty
                    continue
                location = (g.get('geometry') or {}).get('location') or {}
                done = done or ('lat' in location and 'lng' in location)
            if done:
                continue

            address = '{} {} Airport,{}'.format(
                item.get('国家(地区)名称') or item.get('国家(地区)代码') or '',
                item.get('机场名称(英文)') or item['机场三字码'],
                item.get('城市名(英文)') or '',
            )
            for lang in langs:
                # `params` URL-encodes the address (spaces, CJK, '&', '#', ...).
                resp = requests.get(
                    geocode_url,
                    params={'address': address, 'language': lang, 'key': api_key},
                    timeout=30,
                )
                results = json.loads(resp.text).get('results')
                item['geo_{}'.format(lang)] = (results[0] if results else None) or {}

            print(item)
            # Pause for a second after every fifth geocoded record.
            fetched += 1
            if fetched % 5 == 0:
                time.sleep(1)
        except Exception as e:
            print(f"Exception in geocode process: {e=}, {type(e)=}")

    with open('airport.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(arr, ensure_ascii=False))
# Field names used by the scraped airport records.
_LAT_KEY = "纬度"
_LNG_KEY = "经度"
_COORD_KEYS = (_LAT_KEY, _LNG_KEY)
_IATA_KEY = "机场三字码"

# Optional minus sign, integer part, optional fractional part.
_NUMBER_RE = re.compile(r'^-?\d+(?:\.\d+)?$')
_IATA_RE = re.compile(r'^[A-Z]{3}$')
_COUNTRY_CODE_RE = re.compile(r'^[A-Z]{2}$')
_ICAO_RE = re.compile(r'^[A-Z]{4}$')
_LATIN_TEXT_RE = re.compile(r'^[A-Za-z\s,]+$')
_CJK_TEXT_RE = re.compile(r'^[\u4e00-\u9fa5]+$')


def _parse_airport_detail(table):
    """Turn the two-column detail table of an airport page into a dict."""
    item = {}
    for row in table.find_all(name="tr"):
        cells = row.find_all(name="td")
        # Rows without a label/value pair carry no data; skip them instead of
        # discarding the whole airport.
        if len(cells) < 2 or cells[0].string is None:
            continue
        value = cells[1].string
        item[str(cells[0].string)] = '' if value is None else str(value)
    return item


def _repair_airport_item(item):
    """Move values that landed in the wrong column back into place (in place)."""
    # A coordinate column holding a non-number is swapped with the first other
    # column that holds a number.  When both coordinates are bad, only the
    # longitude is handled.
    bad_coord = None
    for coord in _COORD_KEYS:
        if item[coord] != '' and not _NUMBER_RE.match(item[coord]):
            bad_coord = coord
    if bad_coord is not None:
        misplaced = item[bad_coord]
        for k, v in item.items():
            if k not in _COORD_KEYS and _NUMBER_RE.match(v):
                item[bad_coord] = v
                item[k] = misplaced
                break

    # An IATA column that is not three uppercase letters is swapped with the
    # first other column that is.
    iata = item[_IATA_KEY]
    if not _IATA_RE.match(iata):
        for k, v in item.items():
            if k != _IATA_KEY and _IATA_RE.match(v):
                item[_IATA_KEY] = v
                item[k] = iata
                break
    item[_IATA_KEY] = item[_IATA_KEY].upper()

    # Text left in a coordinate column is moved into the first empty column it
    # fits: (target column, pattern, columns that must already be filled).
    name_en, city_en, city = "机场名称(英文)", "城市名(英文)", "城市名"
    rules = (
        ("国家(地区)代码", _COUNTRY_CODE_RE, ()),
        ("机场四字码", _ICAO_RE, ()),
        (name_en, _LATIN_TEXT_RE, (city_en,)),
        (city_en, _LATIN_TEXT_RE, (name_en,)),
        (city, _LATIN_TEXT_RE, (name_en, city_en)),
        ("国家(地区)名称", _CJK_TEXT_RE, ()),
        (city, _CJK_TEXT_RE, ()),
    )
    for target, pattern, required in rules:
        for coord in _COORD_KEYS:
            if (
                item[target] == ''
                and all(item[key] != '' for key in required)
                and pattern.match(item[coord])
            ):
                item[target] = item[coord]
                item[coord] = ''


# Scrape airport IATA codes and related information
def get_info(n=290):
    """Scrape airport records from airportcode.bmcx.com into ``airport.json``.

    Listing pages ``1 .. n-1`` are fetched; every row of the second table on
    a listing page (after its first two rows) links to a detail page whose
    second table is parsed into one record.  Each record is repaired with
    ``_repair_airport_item`` and all records are written to ``airport.json``.

    The file is then read back and validated: coordinates that are non-empty
    but not numeric, and IATA codes that are non-empty but not three uppercase
    letters, are counted (the IATA code is printed for bad coordinates) and a
    summary line is printed when anything is wrong.

    Args:
        n: Exclusive upper bound of the listing page numbers.  Defaults to the
            previously hard-coded 290.

    Failures on a single detail page or listing page are printed and skipped.
    """
    result = []
    for page in range(1, n):
        try:
            listing = requests.get(
                "https://airportcode.bmcx.com/" + str(page) + "__airportcode/", timeout=30
            )
            html = BeautifulSoup(listing.text, features="html.parser")
            tabs = html.find_all(name="table")
            rows = tabs[1].find_all(name="tr")[2:] if len(tabs) > 1 else []
            for tr in rows:
                # A row without a link must not abort the rest of the page.
                link = tr.find(name="a")
                href = link.attrs.get("href") if link is not None else None
                if href is None:
                    continue
                try:
                    detail = requests.get("https://airportcode.bmcx.com" + href, timeout=30)
                    sub_html = BeautifulSoup(detail.text, features="html.parser")
                    sub_tabs = sub_html.find_all(name="table")
                    if len(sub_tabs) > 1:
                        item = _parse_airport_detail(sub_tabs[1])
                        _repair_airport_item(item)
                        result.append(item)
                except Exception as e:
                    print(f"Exception in sub process: {e=}, {type(e)=}")
        except Exception as ex:
            print(f"Exception in main loop process: {ex=}, {type(ex)=}")

    with open('airport.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(result, ensure_ascii=False))

    # Validate what was actually persisted.
    with open("airport.json", "r", encoding="utf-8") as f:
        arr = json.load(f)
    ct = 0
    for item in arr:
        for coord in _COORD_KEYS:
            if item[coord] != '' and not _NUMBER_RE.match(item[coord]):
                print(item[_IATA_KEY])
                ct += 1
        if item[_IATA_KEY] != '' and not _IATA_RE.match(item[_IATA_KEY]):
            ct += 1

    if ct > 0:
        # ct is an int; format it rather than concatenating str + int.
        print(f'Sth wrong {ct}')
|