# get_info.py
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup
  6. # 根据机场三字码,重新从 Google Geocoding AP 刷数据
  7. def req_geocode():
  8. # arr = [{"机场三字码":"SZX"}]
  9. arr = []
  10. with open("json2.json", "r") as f:
  11. arr = json.load(f)
  12. langs = ['en', 'zh']
  13. i = 0
  14. for item in arr:
  15. try:
  16. if item['机场三字码'] == '':
  17. continue
  18. for lang in langs:
  19. uri = 'https://maps.googleapis.com/maps/api/geocode/json?address={}%20Airport%2C{}%2C{}&language={}&key=AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo'.format(item['机场三字码'], item['城市名(英文)'], item['国家(地区)代码'], lang)
  20. text = requests.get(uri).text
  21. res = json.loads(text).get('results')
  22. if res != None and len(res) > 0:
  23. geo = res[0].get('address_components')
  24. if geo != None:
  25. item['geo_{}'.format(lang)] = geo
  26. s = json.dumps(item, ensure_ascii=False)
  27. print(item)
  28. i += 1
  29. if i % 3 == 0:
  30. time.sleep(1)
  31. except Exception as e:
  32. print(f"Exception in geocode process: {e=}, {type(e)=}")
  33. s = json.dumps(arr, ensure_ascii=False)
  34. # print(s)
  35. with open('json3.json', 'w') as f:
  36. f.write(s)
  37. # 抓取机场三字码等信息
  38. def get_info():
  39. pattern1 = re.compile(r'^-*\d+\.*\d+$')
  40. pattern2 = re.compile(r'^[A-Z]{3}$')
  41. pattern3 = re.compile(r'^[A-Z]{2}$')
  42. pattern4 = re.compile(r'^[A-Z]{4}$')
  43. pattern5 = re.compile(r'^[A-Za-z\s,]+$')
  44. pattern6 = re.compile(r'^[\u4e00-\u9fa5]+$')
  45. result = []
  46. n = 290
  47. for i in range(1, n):
  48. page = i
  49. try:
  50. text = requests.get("https://airportcode.bmcx.com/" + str(page) + "__airportcode/")
  51. res = text.text
  52. html = BeautifulSoup(res, features="html.parser")
  53. tabs = html.find_all(name = "table")
  54. if len(tabs) > 1:
  55. arr = tabs[1].find_all(name = "tr")
  56. if len(arr) > 2:
  57. for tr in arr[2:]:
  58. href = tr.find(name = "a").attrs.get("href")
  59. if href != None:
  60. try:
  61. sub_text = requests.get("https://airportcode.bmcx.com" + href)
  62. sub_res = sub_text.text
  63. sub_html = BeautifulSoup(sub_res, features="html.parser")
  64. sub_tabs = sub_html.find_all(name = "table")
  65. if len(sub_tabs) > 1:
  66. sub_arr = sub_tabs[1].find_all(name = "tr")
  67. item = {}
  68. for sub_tr in sub_arr:
  69. sub_t = sub_tr.find_all(name = "td")
  70. item[sub_t[0].string] = sub_t[1].string
  71. flag = False
  72. name = ""
  73. swap = ""
  74. if item["纬度"] != '' and not pattern1.match(item["纬度"]):
  75. name = "纬度"
  76. swap = item["纬度"]
  77. flag = True
  78. if item["经度"] != '' and not pattern1.match(item["经度"]):
  79. name = "经度"
  80. swap = item["经度"]
  81. flag = True
  82. if flag:
  83. for k, v in item.items():
  84. if k != "纬度" and k != "经度" and pattern1.match(v):
  85. item[name] = v
  86. item[k] = swap
  87. break
  88. flag = False
  89. swap = ""
  90. if item["机场三字码"] == '' or not pattern2.match(item["机场三字码"]):
  91. swap = item["机场三字码"]
  92. flag = True
  93. if flag:
  94. for k, v in item.items():
  95. if k != "机场三字码" and pattern2.match(v):
  96. item["机场三字码"] = v
  97. item[k] = swap
  98. break
  99. item["机场三字码"] = item["机场三字码"].upper()
  100. if item["国家(地区)代码"] == '' and pattern3.match(item["纬度"]):
  101. item["国家(地区)代码"] = item["纬度"]
  102. item["纬度"] = ''
  103. if item["国家(地区)代码"] == '' and pattern3.match(item["经度"]):
  104. item["国家(地区)代码"] = item["经度"]
  105. item["经度"] = ''
  106. if item["机场四字码"] == '' and pattern4.match(item["纬度"]):
  107. item["机场四字码"] = item["纬度"]
  108. item["纬度"] = ''
  109. if item["机场四字码"] == '' and pattern4.match(item["经度"]):
  110. item["机场四字码"] = item["经度"]
  111. item["经度"] = ''
  112. if item["机场名称(英文)"] == '' and item["城市名(英文)"] != '' and pattern5.match(item["纬度"]):
  113. item["机场名称(英文)"] = item["纬度"]
  114. item["纬度"] = ''
  115. if item["机场名称(英文)"] == '' and item["城市名(英文)"] != '' and pattern5.match(item["经度"]):
  116. item["机场名称(英文)"] = item["经度"]
  117. item["经度"] = ''
  118. if item["机场名称(英文)"] != '' and item["城市名(英文)"] == '' and pattern5.match(item["纬度"]):
  119. item["城市名(英文)"] = item["纬度"]
  120. item["纬度"] = ''
  121. if item["机场名称(英文)"] != '' and item["城市名(英文)"] == '' and pattern5.match(item["经度"]):
  122. item["城市名(英文)"] = item["经度"]
  123. item["经度"] = ''
  124. if item["机场名称(英文)"] != '' and item["城市名(英文)"] != '' and item["城市名"] == '' and pattern5.match(item["纬度"]):
  125. item["城市名"] = item["纬度"]
  126. item["纬度"] = ''
  127. if item["机场名称(英文)"] != '' and item["城市名(英文)"] != '' and item["城市名"] == '' and pattern5.match(item["经度"]):
  128. item["城市名"] = item["经度"]
  129. item["经度"] = ''
  130. if item["国家(地区)名称"] == '' and pattern6.match(item["纬度"]):
  131. item["国家(地区)名称"] = item["纬度"]
  132. item["纬度"] = ''
  133. if item["国家(地区)名称"] == '' and pattern6.match(item["经度"]):
  134. item["国家(地区)名称"] = item["经度"]
  135. item["经度"] = ''
  136. if item["城市名"] == '' and pattern6.match(item["纬度"]):
  137. item["城市名"] = item["纬度"]
  138. item["纬度"] = ''
  139. if item["城市名"] == '' and pattern6.match(item["经度"]):
  140. item["城市名"] = item["经度"]
  141. item["经度"] = ''
  142. result.append(item)
  143. except Exception as e:
  144. print(f"Exception in sub process: {e=}, {type(e)=}")
  145. except Exception as ex:
  146. print(f"Exception in main loop process: {ex=}, {type(ex)=}")
  147. s = json.dumps(result, ensure_ascii=False)
  148. # print(s)
  149. with open('json2.json', 'w') as f:
  150. f.write(s)
  151. # 结果校验
  152. arr = []
  153. with open("json2.json", "r") as f:
  154. arr = json.load(f)
  155. ct = 0
  156. for item in arr:
  157. swap = ""
  158. if item["纬度"] != '' and not pattern1.match(item["纬度"]):
  159. print(item["机场三字码"])
  160. ct += 1
  161. if item["经度"] != '' and not pattern1.match(item["经度"]):
  162. print(item["机场三字码"])
  163. ct += 1
  164. if item["机场三字码"] != '' and not pattern2.match(item["机场三字码"]):
  165. ct += 1
  166. if ct > 0:
  167. print('Sth wrong ' + ct)
  168. # s = json.dumps(arr, ensure_ascii=False)
  169. # with open('json2.json', 'w') as f:
  170. # f.write(s)