# get_info.py
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup
# Clean up erroneous data
  7. def clean_geocode():
  8. arr = []
  9. with open("out.json", "r") as f:
  10. arr = json.load(f)
  11. langs = ['en', 'zh-CN']
  12. pattern3 = re.compile(r'^[A-Z]{2}$')
  13. pattern4 = re.compile(r'^[A-Z]{4}$')
  14. for item in arr:
  15. if not pattern3.match(item['国家(地区)代码']):
  16. swap = item['国家(地区)代码']
  17. for k, v in item.items():
  18. if k.startswith('geo_') or k == '国家(地区)代码' or not isinstance(v, str):
  19. continue
  20. if pattern3.match(v):
  21. item['国家(地区)代码'] = v
  22. item[k] = swap
  23. if not pattern4.match(item['机场四字码']) and item['机场名称(英文)'] == '':
  24. item['机场名称(英文)'] = item['机场四字码']
  25. if not '国家(地区)代码' in item or item['国家(地区)代码'] == '':
  26. continue
  27. for lang in langs:
  28. k = 'geo_{}'.format(lang)
  29. if not k in item:
  30. continue
  31. g = item[k]
  32. if len(g) == 0 or not 'address_components' in g or len(g['address_components']) == 0:
  33. continue
  34. cc = g['address_components']
  35. for c in cc:
  36. if not 'types' in c or len(c['types']) == 0 or not 'short_name' in c:
  37. continue
  38. if c['types'][0] == 'country' and c['short_name'] != item['国家(地区)代码']:
  39. item[k] = {}
  40. break
  41. s = json.dumps(arr, ensure_ascii=False)
  42. with open('out.json', 'w') as f:
  43. f.write(s)
# Check the data
  45. def test_geocode():
  46. arr = []
  47. with open("out.json", "r") as f:
  48. arr = json.load(f)
  49. langs = ['en', 'zh-CN']
  50. count_empty_code = 0
  51. flag_empty = {}
  52. flag_cc = {}
  53. flag_fail = {}
  54. for item in arr:
  55. if not '机场三字码' in item or item['机场三字码'] == '':
  56. count_empty_code += 1
  57. continue
  58. flag = False
  59. flag_sub = 0
  60. for lang in langs:
  61. k = 'geo_{}'.format(lang)
  62. if not k in item:
  63. flag_sub += 1
  64. continue
  65. g = item[k]
  66. if len(g) == 0:
  67. flag = flag or False
  68. continue
  69. flag = flag or ('geometry' in g and 'location' in g['geometry'] and 'lat' in g['geometry']['location'] and 'lng' in g['geometry']['location'])
  70. if 'address_components' in g and len(g['address_components']) > 0:
  71. cc = g['address_components']
  72. for c in cc:
  73. if not 'types' in c or len(c['types']) == 0 or not 'short_name' in c:
  74. continue
  75. if c['types'][0] == 'country' and c['short_name'] != item['国家(地区)代码']:
  76. flag_cc[item['机场三字码']] = 1
  77. break
  78. if not flag:
  79. flag_empty[item['机场三字码']] = 1
  80. if flag_sub >= 2:
  81. flag_fail[item['机场三字码']] = 1
  82. print("无三字码的数据数量: {}".format(count_empty_code))
  83. print("Google地图信息为空的数据数量: {}".format(len(flag_empty)))
  84. if len(flag_empty) > 0:
  85. print(flag_empty)
  86. print("Google地图信息国家不一致的数据数量: {}".format(len(flag_cc)))
  87. if len(flag_cc) > 0:
  88. print(flag_cc)
  89. print("无Google地图信息(拉数据失败)的数据数量: {}".format(len(flag_fail)))
  90. if len(flag_fail) > 0:
  91. print(flag_fail)
# Re-fetch data from the Google Geocoding API, keyed by each airport's three-letter code
  93. def req_geocode():
  94. # arr = [{"机场三字码":"SZX"}]
  95. arr = []
  96. with open("out.json", "r") as f:
  97. arr = json.load(f)
  98. langs = ['en', 'zh-CN']
  99. is_skip_empty = False # geo_ 为空 dict 的,是否不重新拉数据
  100. i = 0
  101. for item in arr:
  102. try:
  103. if not '机场三字码' in item or item['机场三字码'] == '':
  104. continue
  105. flag = False
  106. for lang in langs:
  107. k = 'geo_{}'.format(lang)
  108. if not k in item:
  109. continue
  110. g = item[k]
  111. if len(g) == 0:
  112. flag = flag or is_skip_empty
  113. continue
  114. flag = flag or ('geometry' in g and 'location' in g['geometry'] and 'lat' in g['geometry']['location'] and 'lng' in g['geometry']['location'])
  115. if flag:
  116. continue
  117. for lang in langs:
  118. k = 'geo_{}'.format(lang)
  119. uri = 'https://maps.googleapis.com/maps/api/geocode/json?address={}%20{}%20Airport%2C{}&language={}&key=AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo'.format(
  120. item['国家(地区)名称'] if item['国家(地区)名称'] else item['国家(地区)代码'],
  121. item['机场名称(英文)'] if item['机场名称(英文)'] else item['机场三字码'],
  122. item['城市名(英文)'],
  123. lang)
  124. text = requests.get(uri).text
  125. res = json.loads(text).get('results')
  126. if res and len(res) > 0:
  127. geo = res[0]
  128. if geo:
  129. item[k] = geo
  130. else:
  131. item[k] = {}
  132. else:
  133. item[k] = {}
  134. s = json.dumps(item, ensure_ascii=False)
  135. print(item)
  136. i += 1
  137. if i % 5 == 0:
  138. time.sleep(1)
  139. except Exception as e:
  140. print(f"Exception in geocode process: {e=}, {type(e)=}")
  141. s = json.dumps(arr, ensure_ascii=False)
  142. # print(s)
  143. with open('out.json', 'w') as f:
  144. f.write(s)
# Scrape airport three-letter codes and related details
  146. def get_info():
  147. pattern1 = re.compile(r'^-*\d+\.*\d+$')
  148. pattern2 = re.compile(r'^[A-Z]{3}$')
  149. pattern3 = re.compile(r'^[A-Z]{2}$')
  150. pattern4 = re.compile(r'^[A-Z]{4}$')
  151. pattern5 = re.compile(r'^[A-Za-z\s,]+$')
  152. pattern6 = re.compile(r'^[\u4e00-\u9fa5]+$')
  153. result = []
  154. n = 290
  155. for i in range(1, n):
  156. page = i
  157. try:
  158. text = requests.get("https://airportcode.bmcx.com/" + str(page) + "__airportcode/")
  159. res = text.text
  160. html = BeautifulSoup(res, features="html.parser")
  161. tabs = html.find_all(name = "table")
  162. if len(tabs) > 1:
  163. arr = tabs[1].find_all(name = "tr")
  164. if len(arr) > 2:
  165. for tr in arr[2:]:
  166. href = tr.find(name = "a").attrs.get("href")
  167. if href != None:
  168. try:
  169. sub_text = requests.get("https://airportcode.bmcx.com" + href)
  170. sub_res = sub_text.text
  171. sub_html = BeautifulSoup(sub_res, features="html.parser")
  172. sub_tabs = sub_html.find_all(name = "table")
  173. if len(sub_tabs) > 1:
  174. sub_arr = sub_tabs[1].find_all(name = "tr")
  175. item = {}
  176. for sub_tr in sub_arr:
  177. sub_t = sub_tr.find_all(name = "td")
  178. item[sub_t[0].string] = sub_t[1].string
  179. flag = False
  180. name = ""
  181. swap = ""
  182. if item["纬度"] != '' and not pattern1.match(item["纬度"]):
  183. name = "纬度"
  184. swap = item["纬度"]
  185. flag = True
  186. if item["经度"] != '' and not pattern1.match(item["经度"]):
  187. name = "经度"
  188. swap = item["经度"]
  189. flag = True
  190. if flag:
  191. for k, v in item.items():
  192. if k != "纬度" and k != "经度" and pattern1.match(v):
  193. item[name] = v
  194. item[k] = swap
  195. break
  196. flag = False
  197. swap = ""
  198. if item["机场三字码"] == '' or not pattern2.match(item["机场三字码"]):
  199. swap = item["机场三字码"]
  200. flag = True
  201. if flag:
  202. for k, v in item.items():
  203. if k != "机场三字码" and pattern2.match(v):
  204. item["机场三字码"] = v
  205. item[k] = swap
  206. break
  207. item["机场三字码"] = item["机场三字码"].upper()
  208. if item["国家(地区)代码"] == '' and pattern3.match(item["纬度"]):
  209. item["国家(地区)代码"] = item["纬度"]
  210. item["纬度"] = ''
  211. if item["国家(地区)代码"] == '' and pattern3.match(item["经度"]):
  212. item["国家(地区)代码"] = item["经度"]
  213. item["经度"] = ''
  214. if item["机场四字码"] == '' and pattern4.match(item["纬度"]):
  215. item["机场四字码"] = item["纬度"]
  216. item["纬度"] = ''
  217. if item["机场四字码"] == '' and pattern4.match(item["经度"]):
  218. item["机场四字码"] = item["经度"]
  219. item["经度"] = ''
  220. if item["机场名称(英文)"] == '' and item["城市名(英文)"] != '' and pattern5.match(item["纬度"]):
  221. item["机场名称(英文)"] = item["纬度"]
  222. item["纬度"] = ''
  223. if item["机场名称(英文)"] == '' and item["城市名(英文)"] != '' and pattern5.match(item["经度"]):
  224. item["机场名称(英文)"] = item["经度"]
  225. item["经度"] = ''
  226. if item["机场名称(英文)"] != '' and item["城市名(英文)"] == '' and pattern5.match(item["纬度"]):
  227. item["城市名(英文)"] = item["纬度"]
  228. item["纬度"] = ''
  229. if item["机场名称(英文)"] != '' and item["城市名(英文)"] == '' and pattern5.match(item["经度"]):
  230. item["城市名(英文)"] = item["经度"]
  231. item["经度"] = ''
  232. if item["机场名称(英文)"] != '' and item["城市名(英文)"] != '' and item["城市名"] == '' and pattern5.match(item["纬度"]):
  233. item["城市名"] = item["纬度"]
  234. item["纬度"] = ''
  235. if item["机场名称(英文)"] != '' and item["城市名(英文)"] != '' and item["城市名"] == '' and pattern5.match(item["经度"]):
  236. item["城市名"] = item["经度"]
  237. item["经度"] = ''
  238. if item["国家(地区)名称"] == '' and pattern6.match(item["纬度"]):
  239. item["国家(地区)名称"] = item["纬度"]
  240. item["纬度"] = ''
  241. if item["国家(地区)名称"] == '' and pattern6.match(item["经度"]):
  242. item["国家(地区)名称"] = item["经度"]
  243. item["经度"] = ''
  244. if item["城市名"] == '' and pattern6.match(item["纬度"]):
  245. item["城市名"] = item["纬度"]
  246. item["纬度"] = ''
  247. if item["城市名"] == '' and pattern6.match(item["经度"]):
  248. item["城市名"] = item["经度"]
  249. item["经度"] = ''
  250. result.append(item)
  251. except Exception as e:
  252. print(f"Exception in sub process: {e=}, {type(e)=}")
  253. except Exception as ex:
  254. print(f"Exception in main loop process: {ex=}, {type(ex)=}")
  255. s = json.dumps(result, ensure_ascii=False)
  256. # print(s)
  257. with open('origin.json', 'w') as f:
  258. f.write(s)
  259. # 结果校验
  260. arr = []
  261. with open("origin.json", "r") as f:
  262. arr = json.load(f)
  263. ct = 0
  264. for item in arr:
  265. swap = ""
  266. if item["纬度"] != '' and not pattern1.match(item["纬度"]):
  267. print(item["机场三字码"])
  268. ct += 1
  269. if item["经度"] != '' and not pattern1.match(item["经度"]):
  270. print(item["机场三字码"])
  271. ct += 1
  272. if item["机场三字码"] != '' and not pattern2.match(item["机场三字码"]):
  273. ct += 1
  274. if ct > 0:
  275. print('Sth wrong ' + ct)
  276. # s = json.dumps(arr, ensure_ascii=False)
  277. # with open('origin.json', 'w') as f:
  278. # f.write(s)