123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155 |
- import re
- import requests
- import json
- from bs4 import BeautifulSoup
# Root of the site whose paginated airport listing is scraped.
_SITE_ROOT = "https://airportcode.bmcx.com"
# Seconds to wait on a single HTTP request before giving up, so that one
# stalled connection cannot hang the whole run.
_REQUEST_TIMEOUT = 30

# Record keys as they appear on the scraped detail pages.
_LAT = "纬度"            # latitude
_LON = "经度"            # longitude
_CODE3 = "机场三字码"     # airport three-letter code

# Number with optional leading '-' and optional '.'.
# NOTE(review): as written this needs at least two digits, so a bare "5"
# does not match -- confirm that single-digit coordinates never occur.
_NUMBER_RE = re.compile(r'^-*\d+\.*\d+$')
_CODE3_RE = re.compile(r'^[A-Z]{3}$')
_CODE2_RE = re.compile(r'^[A-Z]{2}$')
_CODE4_RE = re.compile(r'^[A-Z]{4}$')
_LATIN_RE = re.compile(r'^[A-Za-z\s,]+$')
_HAN_RE = re.compile(r'^[\u4e00-\u9fa5]+$')

# Ordered rules for moving a stray value out of the latitude/longitude
# fields: (target key, pattern the stray value must match, keys that must
# already be non-empty).  The target itself must be empty.  Order matters:
# an earlier rule may consume the value before a later one sees it.
_RELOCATION_RULES = (
    ("国家(地区)代码", _CODE2_RE, ()),                                # country/region code
    ("机场四字码", _CODE4_RE, ()),                                    # airport four-letter code
    ("机场名称(英文)", _LATIN_RE, ("城市名(英文)",)),                  # airport name (English)
    ("城市名(英文)", _LATIN_RE, ("机场名称(英文)",)),                  # city name (English)
    ("城市名", _LATIN_RE, ("机场名称(英文)", "城市名(英文)")),          # city name
    ("国家(地区)名称", _HAN_RE, ()),                                  # country/region name
    ("城市名", _HAN_RE, ()),                                          # city name
)


def _fetch_soup(url):
    """Download *url* and return it parsed with the built-in HTML parser."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, features="html.parser")


def _cell_text(cell):
    """Return the cell's single text child as a plain ``str`` ('' if none).

    ``Tag.string`` is ``None`` for an empty cell or one with several
    children; mapping that to '' keeps the regex checks below from raising
    ``TypeError`` and matches the ``== ''`` comparisons used on the fields.
    """
    text = cell.string
    return "" if text is None else str(text)


def _repair_record(record):
    """Move values that landed in the wrong field back where they belong.

    Works in place on *record* (a dict of ``str`` -> ``str``).

    Raises:
        KeyError: if one of the expected fields is missing from *record*.
    """
    # 1. Latitude/longitude must be numeric.  If one is not, swap it with
    #    the first other field that holds a number.  When both are bad only
    #    the longitude is repaired (the last offender wins).
    offender = None
    for key in (_LAT, _LON):
        value = record[key]
        if value != "" and not _NUMBER_RE.match(value):
            offender = key
    if offender is not None:
        stray = record[offender]
        for key, value in record.items():
            if key not in (_LAT, _LON) and _NUMBER_RE.match(value):
                record[offender] = value
                record[key] = stray
                break

    # 2. The three-letter code must be three capitals.  Otherwise swap it
    #    with the first other field that looks like one.
    code = record[_CODE3]
    if not _CODE3_RE.match(code):
        for key, value in record.items():
            if key != _CODE3 and _CODE3_RE.match(value):
                record[_CODE3] = value
                record[key] = code
                break
    record[_CODE3] = record[_CODE3].upper()

    # 3. Anything still sitting in latitude/longitude that looks like a
    #    code or a name is moved to the first empty field it fits.
    for target, pattern, prerequisites in _RELOCATION_RULES:
        for source in (_LAT, _LON):
            if (
                record[target] == ""
                and all(record[key] != "" for key in prerequisites)
                and pattern.match(record[source])
            ):
                record[target] = record[source]
                record[source] = ""


def _scrape_detail(href):
    """Fetch one airport detail page and return its repaired record.

    Returns ``None`` when the page does not contain a second table.
    """
    tables = _fetch_soup(_SITE_ROOT + href).find_all(name="table")
    if len(tables) < 2:
        return None
    record = {}
    for row in tables[1].find_all(name="tr"):
        cells = row.find_all(name="td")
        if len(cells) < 2:
            # Not a key/value row; nothing to read from it.
            continue
        record[_cell_text(cells[0])] = _cell_text(cells[1])
    _repair_record(record)
    return record


def _iter_page_records(page):
    """Yield one repaired record per airport linked from listing page *page*.

    A failure on a single detail page is reported and skipped so the
    remaining airports on the page are still collected.
    """
    listing = _fetch_soup(_SITE_ROOT + "/" + str(page) + "__airportcode/")
    tables = listing.find_all(name="table")
    if len(tables) < 2:
        return
    # The first two rows of the listing table are skipped (presumably
    # header rows -- TODO confirm against the live page).
    for row in tables[1].find_all(name="tr")[2:]:
        anchor = row.find(name="a")
        href = anchor.attrs.get("href") if anchor is not None else None
        if href is None:
            # Row without a usable link: skip it, do not abort the page.
            continue
        try:
            record = _scrape_detail(href)
        except Exception as e:
            print(f"Exception in sub process: {e=}, {type(e)=}")
            continue
        if record is not None:
            yield record


def _count_invalid(records):
    """Return how many field checks fail across *records*.

    Prints the three-letter code of every record whose latitude or
    longitude is non-empty but not numeric.
    """
    problems = 0
    for record in records:
        for key in (_LAT, _LON):
            if record[key] != '' and not _NUMBER_RE.match(record[key]):
                print(record[_CODE3])
                problems += 1
        if record[_CODE3] != '' and not _CODE3_RE.match(record[_CODE3]):
            problems += 1
    return problems


def main(pages=None, output_path='json2.json'):
    """Scrape airport codes and related data, save them as JSON, then validate.

    Args:
        pages: Iterable of listing-page numbers to scrape.  Defaults to
            pages 1 through 289.
        output_path: File the JSON array of records is written to.

    Errors while scraping are printed and skipped (best effort).  After
    writing, the file is read back and a summary line is printed if any
    record still has a malformed latitude, longitude or three-letter code.
    """
    if pages is None:
        pages = range(1, 290)

    # Scrape airport three-letter codes and related information.
    records = []
    for page in pages:
        try:
            # Consuming the generator inside the try keeps records that were
            # already collected from a page that later fails.
            for record in _iter_page_records(page):
                records.append(record)
        except Exception as ex:
            print(f"Exception in main loop process: {ex=}, {type(ex)=}")

    # ensure_ascii=False emits raw Chinese text, so the encoding is pinned
    # and no longer depends on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(records, ensure_ascii=False))

    # Result validation: check what was actually persisted.
    with open(output_path, 'r', encoding='utf-8') as f:
        saved = json.load(f)
    problems = _count_invalid(saved)
    if problems > 0:
        # The original concatenated str and int here and raised TypeError.
        print('Sth wrong ' + str(problems))
# Program entry point: run the scraper only when executed as a script.
if __name__ == "__main__":
    main()
|