__main__.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. import re
  2. import requests
  3. import json
  4. from bs4 import BeautifulSoup
  5. def main():
  6. # print('[{}] Hello world!'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
  7. # str = '188888'
  8. # n = len(str)
  9. # new_str = '****' if len(str) <= 4 else str[0:int(n/3)] + '****' + str[int(n*2/3):]
  10. # print(new_str)
  11. # 抓取机场三字码等信息
  12. pattern1 = re.compile(r'^-*\d+\.*\d+$')
  13. pattern2 = re.compile(r'^[A-Z]{3}$')
  14. pattern3 = re.compile(r'^[A-Z]{2}$')
  15. pattern4 = re.compile(r'^[A-Z]{4}$')
  16. pattern5 = re.compile(r'^[A-Za-z\s,]+$')
  17. pattern6 = re.compile(r'^[\u4e00-\u9fa5]+$')
  18. result = []
  19. n = 290
  20. for i in range(1, n):
  21. page = i
  22. try:
  23. text = requests.get("https://airportcode.bmcx.com/" + str(page) + "__airportcode/")
  24. res = text.text
  25. html = BeautifulSoup(res, features="html.parser")
  26. tabs = html.find_all(name = "table")
  27. if len(tabs) > 1:
  28. arr = tabs[1].find_all(name = "tr")
  29. if len(arr) > 2:
  30. for tr in arr[2:]:
  31. href = tr.find(name = "a").attrs.get("href")
  32. if href != None:
  33. try:
  34. sub_text = requests.get("https://airportcode.bmcx.com" + href)
  35. sub_res = sub_text.text
  36. sub_html = BeautifulSoup(sub_res, features="html.parser")
  37. sub_tabs = sub_html.find_all(name = "table")
  38. if len(sub_tabs) > 1:
  39. sub_arr = sub_tabs[1].find_all(name = "tr")
  40. item = {}
  41. for sub_tr in sub_arr:
  42. sub_t = sub_tr.find_all(name = "td")
  43. item[sub_t[0].string] = sub_t[1].string
  44. flag = False
  45. name = ""
  46. swap = ""
  47. if item["纬度"] != '' and not pattern1.match(item["纬度"]):
  48. name = "纬度"
  49. swap = item["纬度"]
  50. flag = True
  51. if item["经度"] != '' and not pattern1.match(item["经度"]):
  52. name = "经度"
  53. swap = item["经度"]
  54. flag = True
  55. if flag:
  56. for k, v in item.items():
  57. if k != "纬度" and k != "经度" and pattern1.match(v):
  58. item[name] = v
  59. item[k] = swap
  60. break
  61. flag = False
  62. swap = ""
  63. if item["机场三字码"] == '' or not pattern2.match(item["机场三字码"]):
  64. swap = item["机场三字码"]
  65. flag = True
  66. if flag:
  67. for k, v in item.items():
  68. if k != "机场三字码" and pattern2.match(v):
  69. item["机场三字码"] = v
  70. item[k] = swap
  71. break
  72. item["机场三字码"] = item["机场三字码"].upper()
  73. if item["国家(地区)代码"] == '' and pattern3.match(item["纬度"]):
  74. item["国家(地区)代码"] = item["纬度"]
  75. item["纬度"] = ''
  76. if item["国家(地区)代码"] == '' and pattern3.match(item["经度"]):
  77. item["国家(地区)代码"] = item["经度"]
  78. item["经度"] = ''
  79. if item["机场四字码"] == '' and pattern4.match(item["纬度"]):
  80. item["机场四字码"] = item["纬度"]
  81. item["纬度"] = ''
  82. if item["机场四字码"] == '' and pattern4.match(item["经度"]):
  83. item["机场四字码"] = item["经度"]
  84. item["经度"] = ''
  85. if item["机场名称(英文)"] == '' and item["城市名(英文)"] != '' and pattern5.match(item["纬度"]):
  86. item["机场名称(英文)"] = item["纬度"]
  87. item["纬度"] = ''
  88. if item["机场名称(英文)"] == '' and item["城市名(英文)"] != '' and pattern5.match(item["经度"]):
  89. item["机场名称(英文)"] = item["经度"]
  90. item["经度"] = ''
  91. if item["机场名称(英文)"] != '' and item["城市名(英文)"] == '' and pattern5.match(item["纬度"]):
  92. item["城市名(英文)"] = item["纬度"]
  93. item["纬度"] = ''
  94. if item["机场名称(英文)"] != '' and item["城市名(英文)"] == '' and pattern5.match(item["经度"]):
  95. item["城市名(英文)"] = item["经度"]
  96. item["经度"] = ''
  97. if item["机场名称(英文)"] != '' and item["城市名(英文)"] != '' and item["城市名"] == '' and pattern5.match(item["纬度"]):
  98. item["城市名"] = item["纬度"]
  99. item["纬度"] = ''
  100. if item["机场名称(英文)"] != '' and item["城市名(英文)"] != '' and item["城市名"] == '' and pattern5.match(item["经度"]):
  101. item["城市名"] = item["经度"]
  102. item["经度"] = ''
  103. if item["国家(地区)名称"] == '' and pattern6.match(item["纬度"]):
  104. item["国家(地区)名称"] = item["纬度"]
  105. item["纬度"] = ''
  106. if item["国家(地区)名称"] == '' and pattern6.match(item["经度"]):
  107. item["国家(地区)名称"] = item["经度"]
  108. item["经度"] = ''
  109. if item["城市名"] == '' and pattern6.match(item["纬度"]):
  110. item["城市名"] = item["纬度"]
  111. item["纬度"] = ''
  112. if item["城市名"] == '' and pattern6.match(item["经度"]):
  113. item["城市名"] = item["经度"]
  114. item["经度"] = ''
  115. result.append(item)
  116. except Exception as e:
  117. print(f"Exception in sub process: {e=}, {type(e)=}")
  118. except Exception as ex:
  119. print(f"Exception in main loop process: {ex=}, {type(ex)=}")
  120. s = json.dumps(result, ensure_ascii=False)
  121. # print(s)
  122. with open('json2.json', 'w') as f:
  123. f.write(s)
  124. # 结果校验
  125. arr = []
  126. with open("json2.json", "r") as f:
  127. arr = json.load(f)
  128. ct = 0
  129. for item in arr:
  130. swap = ""
  131. if item["纬度"] != '' and not pattern1.match(item["纬度"]):
  132. print(item["机场三字码"])
  133. ct += 1
  134. if item["经度"] != '' and not pattern1.match(item["经度"]):
  135. print(item["机场三字码"])
  136. ct += 1
  137. if item["机场三字码"] != '' and not pattern2.match(item["机场三字码"]):
  138. ct += 1
  139. if ct > 0:
  140. print('Sth wrong ' + ct)
  141. # s = json.dumps(arr, ensure_ascii=False)
  142. # with open('json2.json', 'w') as f:
  143. # f.write(s)
  144. # 程序入口
  145. if __name__ == '__main__':
  146. main()