|
@@ -0,0 +1,184 @@
|
|
|
|
+import json
|
|
|
|
+import re
|
|
|
|
+import time
|
|
|
|
+
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
+import requests
|
|
|
|
+
|
|
|
|
+# Re-fetch geocoding data from the Google Geocoding API for each airport three-letter code
|
|
|
|
def req_geocode(src_path="json2.json", dst_path="json3.json",
                langs=("en", "zh"), api_key=None, timeout=30):
    """Enrich airport records with Google Geocoding address components.

    Loads a JSON list of airport dicts from ``src_path``. For every record
    whose '机场三字码' (airport three-letter code) is non-empty, it queries the
    Geocoding API once per language in ``langs`` and stores the first result's
    ``address_components`` under ``geo_<lang>``. The full list, including
    records that were skipped or failed, is then written to ``dst_path``.

    Args:
        src_path: JSON file to read the airport list from.
        dst_path: JSON file the enriched list is written to.
        langs: Language codes passed as the API's ``language`` parameter.
        api_key: Geocoding API key. When ``None``, the
            ``GOOGLE_GEOCODING_API_KEY`` environment variable is used, falling
            back to the key that was historically embedded in this function.
        timeout: Per-request timeout in seconds.

    Returns:
        None. Results are written to ``dst_path``; progress goes to stdout.
    """
    import os

    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    if api_key is None:
        # NOTE(review): this key is committed to source control. It is kept
        # only so existing invocations keep working; rotate it and supply the
        # replacement through the environment variable instead.
        api_key = os.environ.get(
            "GOOGLE_GEOCODING_API_KEY",
            "AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo",
        )

    # The data contains Chinese text, so never rely on the locale encoding.
    with open(src_path, "r", encoding="utf-8") as f:
        airports = json.load(f)

    processed = 0
    for item in airports:
        # Best effort per record: a failure is reported and the record is kept
        # (possibly partially enriched) in the output.
        try:
            if item['机场三字码'] == '':
                continue

            for lang in langs:
                address = "{} Airport,{},{}".format(
                    item['机场三字码'], item['城市名(英文)'], item['国家(地区)代码'])
                # Let requests percent-encode the query so characters such as
                # '&', '#' or non-ASCII text in a city name cannot corrupt it.
                response = requests.get(
                    endpoint,
                    params={"address": address, "language": lang, "key": api_key},
                    timeout=timeout,
                )
                results = json.loads(response.text).get('results')
                if results:
                    geo = results[0].get('address_components')
                    if geo is not None:
                        item['geo_{}'.format(lang)] = geo

            print(json.dumps(item, ensure_ascii=False))

            # Crude rate limiting: pause one second after every third record
            # that was processed without an exception.
            processed += 1
            if processed % 3 == 0:
                time.sleep(1)
        except Exception as e:
            print(f"Exception in geocode process: {e=}, {type(e)=}")

    with open(dst_path, 'w', encoding="utf-8") as f:
        f.write(json.dumps(airports, ensure_ascii=False))
|
|
|
|
+
|
|
|
|
+# Scrape airport three-letter codes and related information
|
|
|
|
def get_info(page_stop=290, out_path="json2.json", timeout=30):
    """Scrape airport records from airportcode.bmcx.com into a JSON file.

    Walks the listing pages ``1 .. page_stop - 1`` and follows every detail
    link found in the second table of each page. Each detail page's second
    table is turned into a ``{label: value}`` dict, values that landed in the
    wrong column are repaired, and the records are written to ``out_path``.
    The file is then read back and checked; the number of remaining
    malformed fields is printed if it is non-zero.

    Args:
        page_stop: Exclusive upper bound of the listing page numbers.
            NOTE(review): the historical value 290 means page 290 itself is
            never fetched - confirm that is intended.
        out_path: JSON file the scraped list is written to.
        timeout: Per-request timeout in seconds.

    Returns:
        None. Results are written to ``out_path``; diagnostics go to stdout.
    """
    result = []
    for page in range(1, page_stop):
        try:
            links = _list_detail_links(page, timeout)
        except Exception as ex:
            print(f"Exception in main loop process: {ex=}, {type(ex)=}")
            continue

        for href in links:
            # A record that cannot be fetched or repaired (for example because
            # an expected label is missing) is reported and left out.
            try:
                item = _fetch_airport_record(href, timeout)
                if item is not None:
                    result.append(_repair_airport_record(item))
            except Exception as e:
                print(f"Exception in sub process: {e=}, {type(e)=}")

    # The data contains Chinese text, so never rely on the locale encoding.
    with open(out_path, 'w', encoding="utf-8") as f:
        f.write(json.dumps(result, ensure_ascii=False))

    # Validate what was actually persisted.
    with open(out_path, "r", encoding="utf-8") as f:
        saved = json.load(f)

    problems = _count_invalid_fields(saved)
    if problems > 0:
        print(f'Sth wrong {problems}')


# Field labels as they appear on the scraped detail pages.
_LAT = "纬度"          # latitude
_LON = "经度"          # longitude
_CODE3 = "机场三字码"   # airport three-letter code

# Optional sign, digits, optional fractional part.
_NUMBER_RE = re.compile(r'^-?\d+(?:\.\d+)?$')
_CODE3_RE = re.compile(r'^[A-Z]{3}$')
_CODE2_RE = re.compile(r'^[A-Z]{2}$')
_CODE4_RE = re.compile(r'^[A-Z]{4}$')
_LATIN_TEXT_RE = re.compile(r'^[A-Za-z\s,]+$')
_CJK_TEXT_RE = re.compile(r'^[\u4e00-\u9fa5]+$')

# Ordered rules for moving a value out of a coordinate field into an empty
# text field: (target label, labels that must be non-empty, value pattern).
# Order matters because an earlier rule can fill a field a later rule tests.
_RELOCATION_RULES = (
    ("国家(地区)代码", (), _CODE2_RE),
    ("机场四字码", (), _CODE4_RE),
    ("机场名称(英文)", ("城市名(英文)",), _LATIN_TEXT_RE),
    ("城市名(英文)", ("机场名称(英文)",), _LATIN_TEXT_RE),
    ("城市名", ("机场名称(英文)", "城市名(英文)"), _LATIN_TEXT_RE),
    ("国家(地区)名称", (), _CJK_TEXT_RE),
    ("城市名", (), _CJK_TEXT_RE),
)


def _cell_text(cell):
    """Return a table cell's ``.string`` as a plain str ('' when it is None)."""
    text = cell.string
    return '' if text is None else str(text)


def _list_detail_links(page, timeout):
    """Return the detail-page hrefs found on listing page number ``page``."""
    response = requests.get(
        "https://airportcode.bmcx.com/" + str(page) + "__airportcode/",
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, features="html.parser")
    tables = soup.find_all(name="table")
    if len(tables) <= 1:
        return []

    links = []
    # The first two rows of the listing table are skipped.
    for row in tables[1].find_all(name="tr")[2:]:
        anchor = row.find(name="a")
        if anchor is None:
            # A row without a link must not abort the rest of the page.
            continue
        href = anchor.attrs.get("href")
        if href is not None:
            links.append(href)
    return links


def _fetch_airport_record(href, timeout):
    """Fetch one detail page and return its label/value dict, or None."""
    response = requests.get("https://airportcode.bmcx.com" + href, timeout=timeout)
    soup = BeautifulSoup(response.text, features="html.parser")
    tables = soup.find_all(name="table")
    if len(tables) <= 1:
        return None

    item = {}
    for row in tables[1].find_all(name="tr"):
        cells = row.find_all(name="td")
        if len(cells) < 2:
            # Not a label/value row (e.g. a header or spacer row).
            continue
        item[_cell_text(cells[0])] = _cell_text(cells[1])
    return item


def _swap_misplaced_coordinate(item):
    """Swap a non-numeric coordinate with the first numeric non-coordinate field."""
    misplaced = None
    for coord in (_LAT, _LON):
        if item[coord] != '' and not _NUMBER_RE.match(item[coord]):
            # NOTE(review): when both coordinates are invalid only the last
            # one (longitude) is repaired - historical behaviour, kept as is.
            misplaced = coord
    if misplaced is None:
        return

    stray = item[misplaced]
    for key, value in item.items():
        if key not in (_LAT, _LON) and _NUMBER_RE.match(value):
            item[misplaced] = value
            item[key] = stray
            break


def _swap_misplaced_code3(item):
    """Swap an invalid three-letter code with the first field holding a valid one."""
    current = item[_CODE3]
    if current == '' or not _CODE3_RE.match(current):
        for key, value in item.items():
            if key != _CODE3 and _CODE3_RE.match(value):
                item[_CODE3] = value
                item[key] = current
                break
    item[_CODE3] = item[_CODE3].upper()


def _repair_airport_record(item):
    """Move values that landed in the wrong column back where they belong."""
    _swap_misplaced_coordinate(item)
    _swap_misplaced_code3(item)
    for target, required, pattern in _RELOCATION_RULES:
        # Latitude is tried before longitude; conditions are re-evaluated each
        # time because a successful move fills the target.
        for coord in (_LAT, _LON):
            if (item[target] == ''
                    and all(item[label] != '' for label in required)
                    and pattern.match(item[coord])):
                item[target] = item[coord]
                item[coord] = ''
    return item


def _count_invalid_fields(records):
    """Count malformed coordinate / code fields, printing the code for bad coordinates."""
    count = 0
    for item in records:
        for coord in (_LAT, _LON):
            if item[coord] != '' and not _NUMBER_RE.match(item[coord]):
                print(item[_CODE3])
                count += 1
        if item[_CODE3] != '' and not _CODE3_RE.match(item[_CODE3]):
            count += 1
    return count
|