"""Scrape airport / city records and enrich them with Google Geocoding data.

Every public function is a stand-alone batch step that reads and writes JSON
or Excel files in the current working directory and talks to remote services.
"""
import json
import os
import re
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests

# NOTE(review): this key used to be hard-coded inside every request URL. It is
# kept as the fallback so existing runs behave the same, but it should be
# rotated and supplied through the GOOGLE_MAPS_API_KEY environment variable.
GOOGLE_API_KEY = os.environ.get(
    'GOOGLE_MAPS_API_KEY', 'AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo')
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
# Seconds before an HTTP request is abandoned; without a timeout a stalled
# server would hang the whole batch.
REQUEST_TIMEOUT = 30

# Languages for which a `geo_<lang>` entry is stored on every airport record.
LANGS = ['en', 'zh-CN']

# NOTE(review): _RE_COORD needs at least two digits, so a bare "5" is not
# accepted as a coordinate. Kept unchanged to preserve existing results.
_RE_COORD = re.compile(r'^-*\d+\.*\d+$')
_RE_IATA = re.compile(r'^[A-Z]{3}$')
_RE_COUNTRY_CODE = re.compile(r'^[A-Z]{2}$')
_RE_ICAO = re.compile(r'^[A-Z]{4}$')
_RE_LATIN = re.compile(r'^[A-Za-z\s,]+$')
_RE_CJK = re.compile(r'^[\u4e00-\u9fa5]+$')

_COORD_KEYS = ("纬度", "经度")

# Rules for values that ended up in a coordinate field by mistake. Each rule is
# (target field, pattern the stray value must match, fields that must already
# be non-empty). Rules run in this order, latitude before longitude.
_MISPLACED_VALUE_RULES = (
    ("国家(地区)代码", _RE_COUNTRY_CODE, ()),
    ("机场四字码", _RE_ICAO, ()),
    ("机场名称(英文)", _RE_LATIN, ("城市名(英文)",)),
    ("城市名(英文)", _RE_LATIN, ("机场名称(英文)",)),
    ("城市名", _RE_LATIN, ("机场名称(英文)", "城市名(英文)")),
    ("国家(地区)名称", _RE_CJK, ()),
    ("城市名", _RE_CJK, ()),
)


def _read_json(path):
    """Return the parsed JSON document stored at *path* (read as UTF-8)."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _write_json(path, data):
    """Write *data* to *path* as UTF-8 JSON without escaping non-ASCII text."""
    # Serialize first so a serialization error cannot leave a truncated file.
    payload = json.dumps(data, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)


def _geocode(address, language='en'):
    """Return the first Geocoding API result for *address*, or None.

    The query is passed through ``params`` so that spaces, commas, ``&`` and
    non-ASCII text in *address* are URL-encoded correctly.
    """
    response = requests.get(
        GEOCODE_URL,
        params={'address': address, 'language': language, 'key': GOOGLE_API_KEY},
        timeout=REQUEST_TIMEOUT,
    )
    results = json.loads(response.text).get('results')
    if not results:
        return None
    return results[0] or None


def _has_location(geo):
    """Return True when *geo* carries both ``geometry.location.lat`` and ``lng``."""
    return ('geometry' in geo
            and 'location' in geo['geometry']
            and 'lat' in geo['geometry']['location']
            and 'lng' in geo['geometry']['location'])


def _has_country_mismatch(geo, country_code):
    """Return True when a country component of *geo* differs from *country_code*.

    Only components whose first type is ``'country'`` and that have a
    ``short_name`` are considered.
    """
    for component in geo.get('address_components') or []:
        types = component.get('types')
        if not types or 'short_name' not in component:
            continue
        if types[0] == 'country' and component['short_name'] != country_code:
            return True
    return False


def _cell_text(cell):
    """Return the text of a table cell as a plain ``str`` ('' when it has none)."""
    raw = cell.string
    # `.string` is None for an empty cell; the callers compare against ''.
    return '' if raw is None else str(raw)


def _enrich_airport_item(item, city_df, country_code):
    """Return a copy of *item* enriched with details scraped from yuntisoft.com.

    The input dict is left untouched so the caller can keep the original record
    when scraping fails part-way.
    """
    enriched = dict(item)
    uri = 'http://www.yuntisoft.com/airport/{}.html'.format(item['iata'])
    page = requests.get(uri, timeout=REQUEST_TIMEOUT).text
    html = BeautifulSoup(page, features="html.parser")
    ul = html.find(name="ul", attrs={'class': 'port_items'})
    entries = ul.find_all(name="li") if ul else []
    for li in entries:
        title, sep, value = li.text.partition(':')
        title = title.strip()
        value = value.strip()
        if not sep or not title or not value:
            continue
        if title == 'ICAO(四字码)':
            # Only fill in the ICAO code when the record does not have one yet.
            if not enriched.get('icao'):
                enriched['icao'] = value
        elif title == '机场名':
            cn_name, space, en_name = value.partition(' ')
            if space:
                enriched['airport_cn_name'] = cn_name
                enriched['airport_en_name'] = en_name
        elif title == '所属城市':
            cn_name, space, en_name = value.partition(' ')
            if space:
                enriched['city_cn_name'] = cn_name
                enriched['city_en_name'] = en_name
                matches = city_df.loc[
                    (city_df['CountryCode'] == country_code)
                    & (city_df['ZHName'] == cn_name)
                ]
                # '+' marks a city that is absent from city.xlsx.
                enriched['c'] = '+' if matches.empty else cn_name
    if 'c' not in enriched:
        enriched['c'] = '?'
    return enriched


def req_iata_info():
    """Split airport_other_city.json by country and enrich the selected airports.

    Records belonging to one of the countries in ``p_list`` are enriched from
    yuntisoft.com and written to ``<country>_iata.json``. All other records,
    and records whose enrichment failed, are written back to
    ``airport_other_city.json``.
    """
    city_df = pd.read_excel('city.xlsx', sheet_name='city')
    origin = _read_json("airport_other_city.json")
    p_list = {'日本': 900085, '澳大利亚': 900216, '新西兰': 900222}
    arrs = {'日本': [], '澳大利亚': [], '新西兰': []}
    new_origin = []
    for item in origin:
        try:
            if ('country_name' not in item or 'iata' not in item
                    or item['country_name'] == '' or item['iata'] == ''
                    or item['country_name'] not in p_list):
                new_origin.append(item)
                continue
            enriched = _enrich_airport_item(
                item, city_df, p_list[item['country_name']])
        except Exception as e:
            print(f"Exception in iata process: {e=}, {type(e)=}")
            # Keep the untouched record; otherwise it would vanish from every
            # output file once airport_other_city.json is rewritten below.
            new_origin.append(item)
        else:
            arrs[item['country_name']].append(enriched)
            print(enriched)
    for key, arr in arrs.items():
        if arr:
            _write_json('{}_iata.json'.format(key), arr)
    if new_origin:
        _write_json('airport_other_city.json', new_origin)


def req_iata_for_city():
    """Geocode the cities flagged ``c == '+'`` in the per-country JSON files.

    Reads every ``*.json`` file in the hard-coded directory and writes the
    collected results to ``city_new_geo.json``.
    """
    pth = '/Users/marion/Desktop/airport_change'
    arr = []
    for file in os.listdir(pth):
        file_path = os.path.join(pth, file)
        if not os.path.isfile(file_path) or os.path.splitext(file)[-1] != '.json':
            continue
        origin = _read_json(file_path)
        i = 0
        for item in origin:
            try:
                if item['c'] != '+':
                    continue
                address = '{} {}'.format(item['country_name'], item['city_cn_name'])
                geo = _geocode(address)
                if geo:
                    new_item = {
                        'country_name': item['country_name'],
                        'ZHName': item['city_cn_name'],
                        'geo_en': geo,
                    }
                    arr.append(new_item)
                    print(new_item)
                i += 1
                # Pause after every fifth request to throttle the API calls.
                if i % 5 == 0:
                    time.sleep(1)
            except Exception as e:
                print(f"Exception in iata for city process: {e=}, {type(e)=}")
    if arr:
        _write_json('city_new_geo.json', arr)


def req_geocode_for_city():
    """Geocode every city listed in city.xlsx and write ``city_geo.json``.

    Rows whose CountryCode or ProvinceCode equals 1 are skipped. Rows with
    CountryCode 10 are queried as "中国 <province> <city>", all others as
    "<country> <city>".
    """
    country_df = pd.read_excel('city.xlsx', sheet_name='country')
    province_df = pd.read_excel('city.xlsx', sheet_name='province')
    city_df = pd.read_excel('city.xlsx', sheet_name='city')
    arr = []
    for idx, row in city_df.iterrows():
        try:
            if row['CountryCode'] == 1 or row['ProvinceCode'] == 1:
                continue
            address = ''
            if row['CountryCode'] == 10:
                provinces = province_df.loc[
                    province_df['ProvinceCode'] == row['ProvinceCode'], ['ZHName']]
                if not provinces.empty:
                    province_name = provinces.iloc[0]['ZHName']
                    address = '中国 {} {}'.format(province_name, row['ZHName'])
            else:
                countries = country_df.loc[
                    country_df['CountryCode'] == row['CountryCode'], ['ZHName']]
                if not countries.empty:
                    country_name = countries.iloc[0]['ZHName']
                    address = '{} {}'.format(country_name, row['ZHName'])
            if address == '':
                continue
            geo = _geocode(address)
            if geo:
                new_item = {
                    'CityCode': row['CityCode'],
                    'ZHName': row['ZHName'],
                    'geo_en': geo,
                }
                arr.append(new_item)
                print('{} {}'.format(idx, new_item))
            # Throttle: pause whenever the row index is a multiple of five.
            if idx % 5 == 0:
                time.sleep(1)
        except Exception as e:
            print(f"Exception in geocode for city process: {e=}, {type(e)=}")
    if arr:
        _write_json('city_geo.json', arr)


# Clean up erroneous data
def clean_geocode():
    """Repair misplaced fields in airport.json and drop mismatching geo data.

    For every record: recover a two-letter country code from another field when
    the code field is malformed; copy a malformed ICAO value into an empty
    English airport name; and reset any ``geo_<lang>`` entry whose country
    component disagrees with the record's country code to ``{}``.
    """
    arr = _read_json("airport.json")
    for item in arr:
        code = item.get('国家(地区)代码') or ''
        if not _RE_COUNTRY_CODE.match(code):
            for k, v in item.items():
                if (k.startswith('geo_') or k == '国家(地区)代码'
                        or not isinstance(v, str)):
                    continue
                if _RE_COUNTRY_CODE.match(v):
                    item['国家(地区)代码'] = v
                    item[k] = code
                    # Stop at the first match: going on would overwrite the
                    # recovered code and lose the value just swapped out.
                    break
        icao = item.get('机场四字码') or ''
        if not _RE_ICAO.match(icao) and item.get('机场名称(英文)') == '':
            item['机场名称(英文)'] = icao
        country_code = item.get('国家(地区)代码') or ''
        if country_code == '':
            continue
        for lang in LANGS:
            k = 'geo_{}'.format(lang)
            geo = item.get(k)
            if not geo:
                continue
            if _has_country_mismatch(geo, country_code):
                item[k] = {}
    _write_json('airport.json', arr)


# Check data
def test_geocode():
    """Print a summary of missing or inconsistent geo data in airport.json."""
    arr = _read_json("airport.json")
    count_empty_code = 0
    flag_empty = {}
    flag_cc = {}
    flag_fail = {}
    for item in arr:
        if '机场三字码' not in item or item['机场三字码'] == '':
            count_empty_code += 1
            continue
        iata = item['机场三字码']
        has_location = False
        missing = 0
        for lang in LANGS:
            k = 'geo_{}'.format(lang)
            if k not in item:
                missing += 1
                continue
            geo = item[k]
            if not geo:
                continue
            has_location = has_location or _has_location(geo)
            if _has_country_mismatch(geo, item.get('国家(地区)代码')):
                flag_cc[iata] = 1
        if not has_location:
            flag_empty[iata] = 1
        # No geo_<lang> key at all means the fetch never succeeded.
        if missing >= 2:
            flag_fail[iata] = 1
    print("无三字码的数据数量: {}".format(count_empty_code))
    print("Google地图信息为空的数据数量: {}".format(len(flag_empty)))
    if len(flag_empty) > 0:
        print(flag_empty)
    print("Google地图信息国家不一致的数据数量: {}".format(len(flag_cc)))
    if len(flag_cc) > 0:
        print(flag_cc)
    print("无Google地图信息(拉数据失败)的数据数量: {}".format(len(flag_fail)))
    if len(flag_fail) > 0:
        print(flag_fail)


# Re-fetch data from the Google Geocoding API based on the airport three-letter code
def req_geocode(is_skip_empty=False):
    """Fetch ``geo_<lang>`` data for airports in airport.json that lack it.

    Args:
        is_skip_empty: when True, a record whose ``geo_<lang>`` value is an
            empty dict counts as already fetched and is not requested again.
    """
    arr = _read_json("airport.json")
    i = 0
    for item in arr:
        try:
            if '机场三字码' not in item or item['机场三字码'] == '':
                continue
            done = False
            for lang in LANGS:
                k = 'geo_{}'.format(lang)
                if k not in item:
                    continue
                geo = item[k]
                if len(geo) == 0:
                    done = done or is_skip_empty
                    continue
                done = done or _has_location(geo)
            if done:
                continue
            address = '{} {} Airport,{}'.format(
                item['国家(地区)名称'] or item['国家(地区)代码'],
                item['机场名称(英文)'] or item['机场三字码'],
                item['城市名(英文)'])
            for lang in LANGS:
                item['geo_{}'.format(lang)] = _geocode(address, lang) or {}
            print(item)
            i += 1
            # Pause after every fifth airport to throttle the API calls.
            if i % 5 == 0:
                time.sleep(1)
        except Exception as e:
            print(f"Exception in geocode process: {e=}, {type(e)=}")
    _write_json('airport.json', arr)


def _repair_airport_record(item):
    """Move values that landed in the wrong field of *item* back in place."""
    # A coordinate field holding a non-numeric value is swapped with the first
    # non-coordinate field that holds a number. If both coordinates are bad,
    # only the longitude is repaired.
    misplaced_key = None
    for key in _COORD_KEYS:
        if item[key] != '' and not _RE_COORD.match(item[key]):
            misplaced_key = key
    if misplaced_key is not None:
        stray = item[misplaced_key]
        for k, v in item.items():
            if k not in _COORD_KEYS and _RE_COORD.match(v):
                item[misplaced_key] = v
                item[k] = stray
                break

    # An empty or malformed three-letter code is swapped with the first other
    # field that looks like one.
    iata = item["机场三字码"]
    if iata == '' or not _RE_IATA.match(iata):
        for k, v in item.items():
            if k != "机场三字码" and _RE_IATA.match(v):
                item["机场三字码"] = v
                item[k] = iata
                break
    item["机场三字码"] = item["机场三字码"].upper()

    # Anything still sitting in a coordinate field is moved to the first empty
    # target field whose rule it satisfies.
    for target, pattern, required in _MISPLACED_VALUE_RULES:
        for coord in _COORD_KEYS:
            if (item[target] == ''
                    and all(item[field] != '' for field in required)
                    and pattern.match(item[coord])):
                item[target] = item[coord]
                item[coord] = ''


def _fetch_airport_detail(href):
    """Return the repaired record scraped from detail page *href*, or None."""
    sub_res = requests.get(
        "https://airportcode.bmcx.com" + href, timeout=REQUEST_TIMEOUT).text
    sub_html = BeautifulSoup(sub_res, features="html.parser")
    sub_tabs = sub_html.find_all(name="table")
    if len(sub_tabs) <= 1:
        return None
    item = {}
    for sub_tr in sub_tabs[1].find_all(name="tr"):
        cells = sub_tr.find_all(name="td")
        item[_cell_text(cells[0])] = _cell_text(cells[1])
    _repair_airport_record(item)
    return item


# Scrape airport three-letter codes and related info
def get_info():
    """Scrape airportcode.bmcx.com into airport.json and validate the result."""
    result = []
    n = 290
    for page in range(1, n):
        try:
            res = requests.get(
                "https://airportcode.bmcx.com/" + str(page) + "__airportcode/",
                timeout=REQUEST_TIMEOUT).text
            html = BeautifulSoup(res, features="html.parser")
            tabs = html.find_all(name="table")
            if len(tabs) <= 1:
                continue
            # Rows from the third one onward are processed.
            for tr in tabs[1].find_all(name="tr")[2:]:
                link = tr.find(name="a")
                # A row without a link is skipped rather than aborting the
                # remaining rows of the page.
                href = link.attrs.get("href") if link is not None else None
                if href is None:
                    continue
                try:
                    item = _fetch_airport_detail(href)
                    if item is not None:
                        result.append(item)
                except Exception as e:
                    print(f"Exception in sub process: {e=}, {type(e)=}")
        except Exception as ex:
            print(f"Exception in main loop process: {ex=}, {type(ex)=}")
    _write_json('airport.json', result)

    # Validate results
    arr = _read_json("airport.json")
    ct = 0
    for item in arr:
        for key in _COORD_KEYS:
            if item[key] != '' and not _RE_COORD.match(item[key]):
                print(item["机场三字码"])
                ct += 1
        if item["机场三字码"] != '' and not _RE_IATA.match(item["机场三字码"]):
            ct += 1
    if ct > 0:
        # `ct` is an int; concatenating it directly raised TypeError.
        print('Sth wrong ' + str(ct))