get_info.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. import json
  2. import os
  3. import re
  4. import time
  5. from bs4 import BeautifulSoup
  6. import pandas as pd
  7. import requests
  8. def req_iata_info():
  9. city_df = pd.read_excel('city.xlsx', sheet_name='city')
  10. origin = []
  11. with open("airport_other_city.json", "r") as f:
  12. origin = json.load(f)
  13. p_list = {'日本': 900085, '澳大利亚': 900216, '新西兰': 900222}
  14. arrs = {'日本': [], '澳大利亚': [], '新西兰': []}
  15. new_origin = []
  16. for item in origin:
  17. try:
  18. if 'country_name' not in item or 'iata' not in item or item['country_name'] == '' or item['iata'] == '' or item['country_name'] not in p_list:
  19. new_origin.append(item)
  20. continue
  21. uri = 'http://www.yuntisoft.com/airport/{}.html'.format(item['iata'])
  22. text = requests.get(uri).text
  23. html = BeautifulSoup(text, features="html.parser")
  24. ul = html.find(name = "ul", attrs={'class': 'port_items'})
  25. if ul:
  26. li_arr = ul.find_all(name = "li")
  27. if li_arr and len(li_arr) > 0:
  28. for li in li_arr:
  29. text = li.text
  30. idx = text.find(':')
  31. if idx <= 0:
  32. continue
  33. title = text[0:idx].strip()
  34. value = text[idx + 1:].strip()
  35. if len(title) == 0 or len(value) == 0:
  36. continue
  37. if len(item['icao']) == 0 and title == 'ICAO(四字码)':
  38. item['icao'] = value
  39. elif title == '机场名':
  40. idx = value.find(' ')
  41. if idx > 0:
  42. item['airport_cn_name'] = value[0:idx]
  43. item['airport_en_name'] = value[idx + 1:]
  44. elif title == '所属城市':
  45. idx = value.find(' ')
  46. if idx > 0:
  47. item['city_cn_name'] = value[0:idx]
  48. item['city_en_name'] = value[idx + 1:]
  49. if city_df.loc[(city_df['CountryCode'] == p_list[item['country_name']]) & (city_df['ZHName'] == item['city_cn_name'])].empty:
  50. item['c'] = '+'
  51. else:
  52. item['c'] = item['city_cn_name']
  53. if 'c' not in item:
  54. item['c'] = '?'
  55. arrs[item['country_name']].append(item)
  56. print('{}'.format(item))
  57. except Exception as e:
  58. print(f"Exception in iata process: {e=}, {type(e)=}")
  59. for key, arr in arrs.items():
  60. if arr and len(arr) > 0:
  61. s = json.dumps(arr, ensure_ascii=False)
  62. with open('{}_iata.json'.format(key), 'w') as f:
  63. f.write(s)
  64. if new_origin and len(new_origin) > 0:
  65. s = json.dumps(new_origin, ensure_ascii=False)
  66. with open('airport_other_city.json', 'w') as f:
  67. f.write(s)
  68. def req_iata_for_city():
  69. pth = '/Users/marion/Desktop/airport_change'
  70. files = os.listdir(pth)
  71. arr = []
  72. for file in files:
  73. file_path = os.path.join(pth, file)
  74. if not os.path.isfile(file_path) or not os.path.splitext(file)[-1] == '.json':
  75. continue
  76. origin = []
  77. with open(file_path, "r") as f:
  78. origin = json.load(f)
  79. i = 0;
  80. for item in origin:
  81. try:
  82. if not item['c'] == '+':
  83. continue
  84. str = '{} {}'.format(item['country_name'], item['city_cn_name'])
  85. uri = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&language=en&key=AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo'.format(str)
  86. text = requests.get(uri).text
  87. res = json.loads(text).get('results')
  88. if res and len(res) > 0:
  89. geo = res[0]
  90. if geo:
  91. new_item = {
  92. 'country_name': item['country_name'],
  93. 'ZHName': item['city_cn_name'],
  94. 'geo_en': geo
  95. }
  96. arr.append(new_item)
  97. print('{}'.format(new_item))
  98. i += 1
  99. if i % 5 == 0:
  100. time.sleep(1)
  101. except Exception as e:
  102. print(f"Exception in iata for city process: {e=}, {type(e)=}")
  103. if arr and len(arr) > 0:
  104. s = json.dumps(arr, ensure_ascii=False)
  105. # print(s)
  106. with open('city_new_geo.json', 'w') as f:
  107. f.write(s)
  108. def req_geocode_for_city():
  109. country_df = pd.read_excel('city.xlsx', sheet_name='country')
  110. province_df = pd.read_excel('city.xlsx', sheet_name='province')
  111. city_df = pd.read_excel('city.xlsx', sheet_name='city')
  112. arr = []
  113. for idx, row in city_df.iterrows():
  114. try:
  115. if row['CountryCode'] == 1 or row['ProvinceCode'] == 1:
  116. continue
  117. str = ''
  118. if row['CountryCode'] == 10:
  119. provinces = province_df.loc[province_df['ProvinceCode']==row['ProvinceCode'], ['ZHName']]
  120. if not provinces.empty:
  121. province_name = provinces.iloc[0]['ZHName']
  122. str = '中国 {} {}'.format(province_name, row['ZHName'])
  123. else:
  124. countries = country_df.loc[country_df['CountryCode']==row['CountryCode'], ['ZHName']]
  125. if not countries.empty:
  126. country_name = countries.iloc[0]['ZHName']
  127. str = '{} {}'.format(country_name, row['ZHName'])
  128. if str == '':
  129. continue
  130. uri = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&language=en&key=AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo'.format(str)
  131. text = requests.get(uri).text
  132. res = json.loads(text).get('results')
  133. if res and len(res) > 0:
  134. geo = res[0]
  135. if geo:
  136. new_item = {
  137. 'CityCode': row['CityCode'],
  138. 'ZHName': row['ZHName'],
  139. 'geo_en': geo
  140. }
  141. arr.append(new_item)
  142. print('{} {}'.format(idx, new_item))
  143. if idx % 5 == 0:
  144. time.sleep(1)
  145. except Exception as e:
  146. print(f"Exception in geocode for city process: {e=}, {type(e)=}")
  147. if arr and len(arr) > 0:
  148. s = json.dumps(arr, ensure_ascii=False)
  149. # print(s)
  150. with open('city_geo.json', 'w') as f:
  151. f.write(s)
  152. # 清理错误数据
  153. def clean_geocode():
  154. arr = []
  155. with open("airport.json", "r") as f:
  156. arr = json.load(f)
  157. langs = ['en', 'zh-CN']
  158. pattern3 = re.compile(r'^[A-Z]{2}$')
  159. pattern4 = re.compile(r'^[A-Z]{4}$')
  160. for item in arr:
  161. if not pattern3.match(item['国家(地区)代码']):
  162. swap = item['国家(地区)代码']
  163. for k, v in item.items():
  164. if k.startswith('geo_') or k == '国家(地区)代码' or not isinstance(v, str):
  165. continue
  166. if pattern3.match(v):
  167. item['国家(地区)代码'] = v
  168. item[k] = swap
  169. if not pattern4.match(item['机场四字码']) and item['机场名称(英文)'] == '':
  170. item['机场名称(英文)'] = item['机场四字码']
  171. if not '国家(地区)代码' in item or item['国家(地区)代码'] == '':
  172. continue
  173. for lang in langs:
  174. k = 'geo_{}'.format(lang)
  175. if not k in item:
  176. continue
  177. g = item[k]
  178. if len(g) == 0 or not 'address_components' in g or len(g['address_components']) == 0:
  179. continue
  180. cc = g['address_components']
  181. for c in cc:
  182. if not 'types' in c or len(c['types']) == 0 or not 'short_name' in c:
  183. continue
  184. if c['types'][0] == 'country' and c['short_name'] != item['国家(地区)代码']:
  185. item[k] = {}
  186. break
  187. s = json.dumps(arr, ensure_ascii=False)
  188. with open('airport.json', 'w') as f:
  189. f.write(s)
  190. # 检查数据
  191. def test_geocode():
  192. arr = []
  193. with open("airport.json", "r") as f:
  194. arr = json.load(f)
  195. langs = ['en', 'zh-CN']
  196. count_empty_code = 0
  197. flag_empty = {}
  198. flag_cc = {}
  199. flag_fail = {}
  200. for item in arr:
  201. if not '机场三字码' in item or item['机场三字码'] == '':
  202. count_empty_code += 1
  203. continue
  204. flag = False
  205. flag_sub = 0
  206. for lang in langs:
  207. k = 'geo_{}'.format(lang)
  208. if not k in item:
  209. flag_sub += 1
  210. continue
  211. g = item[k]
  212. if len(g) == 0:
  213. flag = flag or False
  214. continue
  215. flag = flag or ('geometry' in g and 'location' in g['geometry'] and 'lat' in g['geometry']['location'] and 'lng' in g['geometry']['location'])
  216. if 'address_components' in g and len(g['address_components']) > 0:
  217. cc = g['address_components']
  218. for c in cc:
  219. if not 'types' in c or len(c['types']) == 0 or not 'short_name' in c:
  220. continue
  221. if c['types'][0] == 'country' and c['short_name'] != item['国家(地区)代码']:
  222. flag_cc[item['机场三字码']] = 1
  223. break
  224. if not flag:
  225. flag_empty[item['机场三字码']] = 1
  226. if flag_sub >= 2:
  227. flag_fail[item['机场三字码']] = 1
  228. print("无三字码的数据数量: {}".format(count_empty_code))
  229. print("Google地图信息为空的数据数量: {}".format(len(flag_empty)))
  230. if len(flag_empty) > 0:
  231. print(flag_empty)
  232. print("Google地图信息国家不一致的数据数量: {}".format(len(flag_cc)))
  233. if len(flag_cc) > 0:
  234. print(flag_cc)
  235. print("无Google地图信息(拉数据失败)的数据数量: {}".format(len(flag_fail)))
  236. if len(flag_fail) > 0:
  237. print(flag_fail)
  238. # 根据机场三字码,重新从 Google Geocoding AP 刷数据
  239. def req_geocode():
  240. # arr = [{"机场三字码":"SZX"}]
  241. arr = []
  242. with open("airport.json", "r") as f:
  243. arr = json.load(f)
  244. langs = ['en', 'zh-CN']
  245. is_skip_empty = False # geo_ 为空 dict 的,是否不重新拉数据
  246. i = 0
  247. for item in arr:
  248. try:
  249. if not '机场三字码' in item or item['机场三字码'] == '':
  250. continue
  251. flag = False
  252. for lang in langs:
  253. k = 'geo_{}'.format(lang)
  254. if not k in item:
  255. continue
  256. g = item[k]
  257. if len(g) == 0:
  258. flag = flag or is_skip_empty
  259. continue
  260. flag = flag or ('geometry' in g and 'location' in g['geometry'] and 'lat' in g['geometry']['location'] and 'lng' in g['geometry']['location'])
  261. if flag:
  262. continue
  263. for lang in langs:
  264. k = 'geo_{}'.format(lang)
  265. uri = 'https://maps.googleapis.com/maps/api/geocode/json?address={}%20{}%20Airport%2C{}&language={}&key=AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo'.format(
  266. item['国家(地区)名称'] if item['国家(地区)名称'] else item['国家(地区)代码'],
  267. item['机场名称(英文)'] if item['机场名称(英文)'] else item['机场三字码'],
  268. item['城市名(英文)'],
  269. lang)
  270. text = requests.get(uri).text
  271. res = json.loads(text).get('results')
  272. if res and len(res) > 0:
  273. geo = res[0]
  274. if geo:
  275. item[k] = geo
  276. else:
  277. item[k] = {}
  278. else:
  279. item[k] = {}
  280. s = json.dumps(item, ensure_ascii=False)
  281. print(item)
  282. i += 1
  283. if i % 5 == 0:
  284. time.sleep(1)
  285. except Exception as e:
  286. print(f"Exception in geocode process: {e=}, {type(e)=}")
  287. s = json.dumps(arr, ensure_ascii=False)
  288. # print(s)
  289. with open('airport.json', 'w') as f:
  290. f.write(s)
  291. # 抓取机场三字码等信息
  292. def get_info():
  293. pattern1 = re.compile(r'^-*\d+\.*\d+$')
  294. pattern2 = re.compile(r'^[A-Z]{3}$')
  295. pattern3 = re.compile(r'^[A-Z]{2}$')
  296. pattern4 = re.compile(r'^[A-Z]{4}$')
  297. pattern5 = re.compile(r'^[A-Za-z\s,]+$')
  298. pattern6 = re.compile(r'^[\u4e00-\u9fa5]+$')
  299. result = []
  300. n = 290
  301. for i in range(1, n):
  302. page = i
  303. try:
  304. text = requests.get("https://airportcode.bmcx.com/" + str(page) + "__airportcode/")
  305. res = text.text
  306. html = BeautifulSoup(res, features="html.parser")
  307. tabs = html.find_all(name = "table")
  308. if len(tabs) > 1:
  309. arr = tabs[1].find_all(name = "tr")
  310. if len(arr) > 2:
  311. for tr in arr[2:]:
  312. href = tr.find(name = "a").attrs.get("href")
  313. if href != None:
  314. try:
  315. sub_text = requests.get("https://airportcode.bmcx.com" + href)
  316. sub_res = sub_text.text
  317. sub_html = BeautifulSoup(sub_res, features="html.parser")
  318. sub_tabs = sub_html.find_all(name = "table")
  319. if len(sub_tabs) > 1:
  320. sub_arr = sub_tabs[1].find_all(name = "tr")
  321. item = {}
  322. for sub_tr in sub_arr:
  323. sub_t = sub_tr.find_all(name = "td")
  324. item[sub_t[0].string] = sub_t[1].string
  325. flag = False
  326. name = ""
  327. swap = ""
  328. if item["纬度"] != '' and not pattern1.match(item["纬度"]):
  329. name = "纬度"
  330. swap = item["纬度"]
  331. flag = True
  332. if item["经度"] != '' and not pattern1.match(item["经度"]):
  333. name = "经度"
  334. swap = item["经度"]
  335. flag = True
  336. if flag:
  337. for k, v in item.items():
  338. if k != "纬度" and k != "经度" and pattern1.match(v):
  339. item[name] = v
  340. item[k] = swap
  341. break
  342. flag = False
  343. swap = ""
  344. if item["机场三字码"] == '' or not pattern2.match(item["机场三字码"]):
  345. swap = item["机场三字码"]
  346. flag = True
  347. if flag:
  348. for k, v in item.items():
  349. if k != "机场三字码" and pattern2.match(v):
  350. item["机场三字码"] = v
  351. item[k] = swap
  352. break
  353. item["机场三字码"] = item["机场三字码"].upper()
  354. if item["国家(地区)代码"] == '' and pattern3.match(item["纬度"]):
  355. item["国家(地区)代码"] = item["纬度"]
  356. item["纬度"] = ''
  357. if item["国家(地区)代码"] == '' and pattern3.match(item["经度"]):
  358. item["国家(地区)代码"] = item["经度"]
  359. item["经度"] = ''
  360. if item["机场四字码"] == '' and pattern4.match(item["纬度"]):
  361. item["机场四字码"] = item["纬度"]
  362. item["纬度"] = ''
  363. if item["机场四字码"] == '' and pattern4.match(item["经度"]):
  364. item["机场四字码"] = item["经度"]
  365. item["经度"] = ''
  366. if item["机场名称(英文)"] == '' and item["城市名(英文)"] != '' and pattern5.match(item["纬度"]):
  367. item["机场名称(英文)"] = item["纬度"]
  368. item["纬度"] = ''
  369. if item["机场名称(英文)"] == '' and item["城市名(英文)"] != '' and pattern5.match(item["经度"]):
  370. item["机场名称(英文)"] = item["经度"]
  371. item["经度"] = ''
  372. if item["机场名称(英文)"] != '' and item["城市名(英文)"] == '' and pattern5.match(item["纬度"]):
  373. item["城市名(英文)"] = item["纬度"]
  374. item["纬度"] = ''
  375. if item["机场名称(英文)"] != '' and item["城市名(英文)"] == '' and pattern5.match(item["经度"]):
  376. item["城市名(英文)"] = item["经度"]
  377. item["经度"] = ''
  378. if item["机场名称(英文)"] != '' and item["城市名(英文)"] != '' and item["城市名"] == '' and pattern5.match(item["纬度"]):
  379. item["城市名"] = item["纬度"]
  380. item["纬度"] = ''
  381. if item["机场名称(英文)"] != '' and item["城市名(英文)"] != '' and item["城市名"] == '' and pattern5.match(item["经度"]):
  382. item["城市名"] = item["经度"]
  383. item["经度"] = ''
  384. if item["国家(地区)名称"] == '' and pattern6.match(item["纬度"]):
  385. item["国家(地区)名称"] = item["纬度"]
  386. item["纬度"] = ''
  387. if item["国家(地区)名称"] == '' and pattern6.match(item["经度"]):
  388. item["国家(地区)名称"] = item["经度"]
  389. item["经度"] = ''
  390. if item["城市名"] == '' and pattern6.match(item["纬度"]):
  391. item["城市名"] = item["纬度"]
  392. item["纬度"] = ''
  393. if item["城市名"] == '' and pattern6.match(item["经度"]):
  394. item["城市名"] = item["经度"]
  395. item["经度"] = ''
  396. result.append(item)
  397. except Exception as e:
  398. print(f"Exception in sub process: {e=}, {type(e)=}")
  399. except Exception as ex:
  400. print(f"Exception in main loop process: {ex=}, {type(ex)=}")
  401. s = json.dumps(result, ensure_ascii=False)
  402. # print(s)
  403. with open('airport.json', 'w') as f:
  404. f.write(s)
  405. # 结果校验
  406. arr = []
  407. with open("airport.json", "r") as f:
  408. arr = json.load(f)
  409. ct = 0
  410. for item in arr:
  411. swap = ""
  412. if item["纬度"] != '' and not pattern1.match(item["纬度"]):
  413. print(item["机场三字码"])
  414. ct += 1
  415. if item["经度"] != '' and not pattern1.match(item["经度"]):
  416. print(item["机场三字码"])
  417. ct += 1
  418. if item["机场三字码"] != '' and not pattern2.match(item["机场三字码"]):
  419. ct += 1
  420. if ct > 0:
  421. print('Sth wrong ' + ct)
  422. # s = json.dumps(arr, ensure_ascii=False)
  423. # with open('airport.json', 'w') as f:
  424. # f.write(s)