123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307 |
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup
# Clean up erroneous data
def _geocode_country_differs(geo, country_code):
    """Return True if *geo* reports a country other than *country_code*.

    *geo* is one stored geocoding result.  Only ``address_components``
    entries whose first ``types`` value is ``'country'`` are compared, using
    their ``short_name``.  Anything that is not a dict, or has no usable
    components, counts as "no mismatch".
    """
    if not isinstance(geo, dict):
        return False
    for component in geo.get('address_components') or ():
        types = component.get('types')
        if not types or 'short_name' not in component:
            continue
        if types[0] == 'country' and component['short_name'] != country_code:
            return True
    return False


def clean_geocode(path="out.json"):
    """Repair misplaced fields in the airport JSON file and drop bad geocodes.

    The file at *path* must hold a JSON list of dicts.  For every record:

    1. If the country-code field is not two upper-case letters, it is swapped
       with the first other string field (``geo_*`` excluded) that is.
    2. If the four-letter-code field holds something other than four
       upper-case letters and the English airport name is empty, the value is
       copied into the English airport name.
    3. Each ``geo_<lang>`` entry whose country component disagrees with the
       record's country code is reset to ``{}``.

    The file is then rewritten in place as UTF-8.

    Args:
        path: JSON file to clean.  Defaults to ``out.json``.
    """
    code_key = '国家(地区)代码'
    four_letter_key = '机场四字码'
    name_en_key = '机场名称(英文)'
    langs = ('en', 'zh-CN')
    two_upper = re.compile(r'^[A-Z]{2}$')
    four_upper = re.compile(r'^[A-Z]{4}$')

    # Explicit encoding: the data is written with ensure_ascii=False, so the
    # platform default codec must not be relied on.
    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)

    for item in records:
        # .get(): the key may be absent; indexing would raise KeyError here,
        # before the "missing key" guard further down is ever reached.
        misplaced = item.get(code_key, '')
        if not (isinstance(misplaced, str) and two_upper.match(misplaced)):
            # Iterate a snapshot because item[code_key] may be a new key.
            for key, value in list(item.items()):
                if key.startswith('geo_') or key == code_key or not isinstance(value, str):
                    continue
                if two_upper.match(value):
                    item[code_key] = value
                    item[key] = misplaced
                    # Stop at the first candidate; continuing would overwrite
                    # further fields with the misplaced value.
                    break

        four_letter = item.get(four_letter_key)
        if (isinstance(four_letter, str) and four_letter
                and not four_upper.match(four_letter)
                and item.get(name_en_key, '') == ''):
            # NOTE(review): the four-letter field itself is left unchanged;
            # confirm whether it should be cleared after the copy.
            item[name_en_key] = four_letter

        country = item.get(code_key)
        if not country:
            continue
        for lang in langs:
            geo_key = 'geo_{}'.format(lang)
            if _geocode_country_differs(item.get(geo_key), country):
                item[geo_key] = {}

    with open(path, 'w', encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False)
# Check the data
def test_geocode(path="out.json"):
    """Print a consistency report for the geocoded airport JSON file.

    Reads the JSON list at *path* and prints four figures, each followed by
    the affected three-letter codes when non-zero:

    * records with no three-letter code (these are skipped for the rest);
    * records where no language has a geocode with ``lat`` and ``lng``;
    * records where a geocode's country differs from the record's country;
    * records with no ``geo_<lang>`` key for any language.

    Nothing is modified and nothing is returned.

    Args:
        path: JSON file to inspect.  Defaults to ``out.json``.
    """
    three_letter_key = '机场三字码'
    code_key = '国家(地区)代码'
    langs = ('en', 'zh-CN')

    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)

    count_empty_code = 0
    # Dicts keyed by three-letter code: duplicates collapse, and the printed
    # representation stays what it has always been.
    flag_empty = {}
    flag_cc = {}
    flag_fail = {}
    for item in records:
        code3 = item.get(three_letter_key, '')
        if code3 == '':
            count_empty_code += 1
            continue

        has_location = False
        missing_langs = 0
        for lang in langs:
            geo_key = 'geo_{}'.format(lang)
            if geo_key not in item:
                missing_langs += 1
                continue
            geo = item[geo_key]
            if len(geo) == 0:
                continue
            if not has_location:
                location = geo.get('geometry', {}).get('location', {})
                has_location = 'lat' in location and 'lng' in location
            for component in geo.get('address_components') or ():
                types = component.get('types')
                if not types or 'short_name' not in component:
                    continue
                # .get(): a record without a country code is reported as a
                # mismatch instead of aborting the whole report.
                if types[0] == 'country' and component['short_name'] != item.get(code_key):
                    flag_cc[code3] = 1
                    break

        if not has_location:
            flag_empty[code3] = 1
        # "Fetch failed" means no language has a geo_ key at all.
        if missing_langs >= len(langs):
            flag_fail[code3] = 1

    print("无三字码的数据数量: {}".format(count_empty_code))
    print("Google地图信息为空的数据数量: {}".format(len(flag_empty)))
    if len(flag_empty) > 0:
        print(flag_empty)
    print("Google地图信息国家不一致的数据数量: {}".format(len(flag_cc)))
    if len(flag_cc) > 0:
        print(flag_cc)
    print("无Google地图信息(拉数据失败)的数据数量: {}".format(len(flag_fail)))
    if len(flag_fail) > 0:
        print(flag_fail)
# Re-fetch data from the Google Geocoding API, based on the airport three-letter code
def _geocode_is_settled(geo, skip_empty):
    """Return True if the stored geocode *geo* does not need re-fetching.

    An empty result is settled only when *skip_empty* is true; a non-empty
    one is settled when it carries ``geometry.location.lat`` and ``.lng``.
    """
    if len(geo) == 0:
        return skip_empty
    location = geo.get('geometry', {}).get('location', {})
    return 'lat' in location and 'lng' in location


def req_geocode(path="out.json", api_key=None, skip_empty=False, timeout=10):
    """Fill in missing ``geo_<lang>`` entries from the Google Geocoding API.

    Reads the JSON list at *path*.  A record is skipped when it has no
    three-letter code or when any of its existing ``geo_en`` / ``geo_zh-CN``
    entries is already settled (see ``_geocode_is_settled``).  Every other
    record is queried once per language and the first result (or ``{}``) is
    stored under ``geo_<lang>``.  Failures for one record are printed and do
    not stop the run.  The file is rewritten in place as UTF-8, also when the
    run is interrupted, so completed look-ups are not lost.

    Args:
        path: JSON file to update.  Defaults to ``out.json``.
        api_key: Geocoding API key.  When omitted, the
            ``GOOGLE_MAPS_API_KEY`` environment variable is used, then the
            key historically embedded in this script.
        skip_empty: If true, an entry that is an empty dict is treated as
            settled and not fetched again.
        timeout: Per-request timeout in seconds.
    """
    three_letter_key = '机场三字码'
    langs = ('en', 'zh-CN')
    geo_keys = ['geo_{}'.format(lang) for lang in langs]
    endpoint = 'https://maps.googleapis.com/maps/api/geocode/json'
    # SECURITY: the literal below is a credential committed to source control.
    # It is kept only so existing invocations keep working; rotate it and
    # supply the key via the argument or the environment instead.
    key = (api_key
           or os.environ.get('GOOGLE_MAPS_API_KEY')
           or 'AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo')

    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)

    fetched = 0
    try:
        for item in records:
            try:
                if item.get(three_letter_key, '') == '':
                    continue
                if any(_geocode_is_settled(item[k], skip_empty)
                       for k in geo_keys if k in item):
                    continue

                # "<country> <airport> Airport,<city>"; requests escapes it.
                address = '{} {} Airport,{}'.format(
                    item['国家(地区)名称'] or item['国家(地区)代码'],
                    item['机场名称(英文)'] or item['机场三字码'],
                    item['城市名(英文)'])
                for lang, geo_key in zip(langs, geo_keys):
                    response = requests.get(
                        endpoint,
                        params={'address': address, 'language': lang, 'key': key},
                        timeout=timeout)
                    results = json.loads(response.text).get('results')
                    item[geo_key] = results[0] if results and results[0] else {}

                print(item)
                fetched += 1
                # Crude rate limit: pause one second after every fifth record.
                if fetched % 5 == 0:
                    time.sleep(1)
            except Exception as e:
                # Best effort per record: report and move on to the next one.
                print(f"Exception in geocode process: {e=}, {type(e)=}")
    finally:
        with open(path, 'w', encoding="utf-8") as f:
            json.dump(records, f, ensure_ascii=False)
# Scrape airport three-letter codes and related information
# Optional minus signs, digits, optional dots, digits.
# NOTE(review): this needs at least two digits, so a bare "5" is rejected.
_NUMBER_RE = re.compile(r'^-*\d+\.*\d+$')
_THREE_UPPER_RE = re.compile(r'^[A-Z]{3}$')
_TWO_UPPER_RE = re.compile(r'^[A-Z]{2}$')
_FOUR_UPPER_RE = re.compile(r'^[A-Z]{4}$')
_LATIN_TEXT_RE = re.compile(r'^[A-Za-z\s,]+$')
_CJK_TEXT_RE = re.compile(r'^[\u4e00-\u9fa5]+$')

_COORDINATE_KEYS = ("纬度", "经度")
_THREE_LETTER_KEY = "机场三字码"
_REQUEST_TIMEOUT = 10  # seconds, per HTTP request

# (target field, pattern the stray value must match, fields that must already
# be non-empty).  Applied in this order, latitude before longitude; a stray
# value is moved only into a target that is still empty.
_RELOCATION_RULES = (
    ("国家(地区)代码", _TWO_UPPER_RE, ()),
    ("机场四字码", _FOUR_UPPER_RE, ()),
    ("机场名称(英文)", _LATIN_TEXT_RE, ("城市名(英文)",)),
    ("城市名(英文)", _LATIN_TEXT_RE, ("机场名称(英文)",)),
    ("城市名", _LATIN_TEXT_RE, ("机场名称(英文)", "城市名(英文)")),
    ("国家(地区)名称", _CJK_TEXT_RE, ()),
    ("城市名", _CJK_TEXT_RE, ()),
)


def _fetch_tables(url):
    """Download *url* and return every ``<table>`` element found in it."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, features="html.parser")
    return soup.find_all(name="table")


def _repair_airport_record(item):
    """Move values that were scraped into the wrong field, in place.

    Raises ``KeyError`` if *item* lacks one of the fields it inspects.
    """
    # 1. A non-numeric coordinate is swapped with the first numeric value
    #    found in another field.  If both are bad only the longitude is fixed.
    bad_coordinate = None
    for coord in _COORDINATE_KEYS:
        if item[coord] != '' and not _NUMBER_RE.match(item[coord]):
            bad_coordinate = coord
    if bad_coordinate is not None:
        stray = item[bad_coordinate]
        for key, value in item.items():
            if key not in _COORDINATE_KEYS and _NUMBER_RE.match(value):
                item[bad_coordinate] = value
                item[key] = stray
                break

    # 2. Same idea for the three-letter code, which is then upper-cased.
    stray = item[_THREE_LETTER_KEY]
    if stray == '' or not _THREE_UPPER_RE.match(stray):
        for key, value in item.items():
            if key != _THREE_LETTER_KEY and _THREE_UPPER_RE.match(value):
                item[_THREE_LETTER_KEY] = value
                item[key] = stray
                break
    item[_THREE_LETTER_KEY] = item[_THREE_LETTER_KEY].upper()

    # 3. Whatever still sits in a coordinate field but looks like another
    #    field's value is moved there, and the coordinate is blanked.
    for target, pattern, required in _RELOCATION_RULES:
        for coord in _COORDINATE_KEYS:
            if (item[target] == ''
                    and all(item[name] != '' for name in required)
                    and pattern.match(item[coord])):
                item[target] = item[coord]
                item[coord] = ''


def get_info():
    """Scrape airport records from airportcode.bmcx.com into ``origin.json``.

    Walks listing pages 1-289, follows each row's link to the detail page,
    reads the second table there as label/value pairs, repairs misplaced
    values (see ``_repair_airport_record``) and collects the record.  Errors
    are printed and skipped: a failing detail page drops that record, a
    failing listing page drops that page.

    The result is written to ``origin.json`` (UTF-8), then read back and
    validated: the three-letter code of every record with a non-numeric
    coordinate is printed, and a final ``Sth wrong <n>`` line gives the
    number of problems when there are any.
    """
    result = []
    for page in range(1, 290):
        try:
            tables = _fetch_tables(
                "https://airportcode.bmcx.com/" + str(page) + "__airportcode/")
            if len(tables) <= 1:
                continue
            # The first two rows of the listing table are not data rows.
            for row in tables[1].find_all(name="tr")[2:]:
                link = row.find(name="a")
                # A row without a link is skipped instead of raising and
                # thereby abandoning the rest of the page.
                href = link.attrs.get("href") if link is not None else None
                if href is None:
                    continue
                try:
                    detail_tables = _fetch_tables("https://airportcode.bmcx.com" + href)
                    if len(detail_tables) <= 1:
                        continue
                    item = {}
                    for detail_row in detail_tables[1].find_all(name="tr"):
                        cells = detail_row.find_all(name="td")
                        value = cells[1].string
                        # .string is None for an empty cell; every check
                        # below compares against '' and regex-matches strings.
                        item[cells[0].string] = '' if value is None else str(value)
                    _repair_airport_record(item)
                    result.append(item)
                except Exception as e:
                    print(f"Exception in sub process: {e=}, {type(e)=}")
        except Exception as ex:
            print(f"Exception in main loop process: {ex=}, {type(ex)=}")

    with open('origin.json', 'w', encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False)

    # Validate what was actually written.
    with open("origin.json", "r", encoding="utf-8") as f:
        saved = json.load(f)
    ct = 0
    for item in saved:
        for coord in _COORDINATE_KEYS:
            if item[coord] != '' and not _NUMBER_RE.match(item[coord]):
                print(item[_THREE_LETTER_KEY])
                ct += 1
        if item[_THREE_LETTER_KEY] != '' and not _THREE_UPPER_RE.match(item[_THREE_LETTER_KEY]):
            ct += 1

    if ct > 0:
        # ct is an int; concatenating it to a str raised TypeError.
        print(f'Sth wrong {ct}')
|