#!/usr/bin/env python # -*- coding: utf-8 -*- import json import math import numbers import re import time from urllib.parse import quote import requests from bs4 import BeautifulSoup import pandas as pd from sqlalchemy.ext.declarative import declarative_base from sqlalchemy import Column, String, Text, BigInteger, Integer, Float, Boolean, JSON, create_engine from sqlalchemy.orm import sessionmaker BRAND_PATTERN = re.compile(r'>\s*Brand\s*\s*\s*([^<>]+)\s*') HOTEL_LINK_PATTERN = re.compile(r'/venue/(\d+)/([^/]+)/') DISTANCE_PATTERN = re.compile(r'(\d+)\s*(\w+)\s*\((\d+)\s*\w+\s*\/\s*([^)]+)\)') ENCODING = 'utf-8' def test(): # get_hotels() # get_hotels_rooms() # get_hotels_rooms_ext() # insert_to_mysql() # get_loc() convert_geo() # insert_saas_to_mysql() pass def get_hotels(): items = [] for i in range(1, 241): try: resp = requests.get('https://www.1000meetings.com/en/area/1/world/p{}'.format(i)) resp.encoding = ENCODING doc = BeautifulSoup(resp.text, 'html.parser') for li in doc.select('#venuelist li'): item = {} a_arr = li.select('.venueheader .line1 .name a') if len(a_arr) == 0: continue a = a_arr[0] item['hotel_name'] = rep(a.string) item['link'] = quote(a['href'].strip()) g = HOTEL_LINK_PATTERN.findall(item['link']) if len(g) == 0 or len(g[0]) < 1: continue item['hotel_id'] = g[0][0] item['slug'] = quote(g[0][1].strip()) area_arr = li.select('.venueheader .line1 .area a') area_arr_len = len(area_arr) if area_arr_len == 1: item['country'] = area_arr[0].string.strip().strip('[').strip(']') item['city'] = item['country'] elif area_arr_len == 2: item['country'] = area_arr[0].string.strip().strip('[').strip(']') item['city'] = area_arr[1].string.strip().strip('[').strip(']') else: item['country'] = area_arr[0].string.strip().strip('[').strip(']') item['province'] = area_arr[1].string.strip().strip('[').strip(']') if item['province'] == 'Hong Kong': item['city'] = item['province'] item['district'] = area_arr[2].string.strip().strip('[').strip(']') else: item['city'] = area_arr[2].string.strip().strip('[').strip(']') items.append(item) except Exception as e: print('get_hotels() failed. ', e) # print(items) df = pd.DataFrame(items) df.to_csv('hotels.csv', encoding=ENCODING, index=False, header=True) def get_hotels_rooms(): hotels = pd.read_csv('hotels.csv', encoding=ENCODING).to_dict('records') items = [] for h in hotels: try: if h['link'] == '': continue resp = requests.get('https://www.1000meetings.com{}'.format(h['link'])) resp.encoding = ENCODING doc = BeautifulSoup(resp.text, 'html.parser') for li in doc.select('#frame-header #line2-left li'): text = li.get_text().strip() if text.startswith('Address:'): text = text.lstrip('Address:') h['address'] = rep(text) elif text.startswith('Telephone:'): text = text.lstrip('Telephone:').strip().replace('\n','') text = re.sub(r'\s+', ' ', re.sub(r'[\t\(\)())]+', ' ', text)).replace('-', '').replace('ext.', '-').replace('、', '/').replace(',', '').replace('中国', '86').replace('China', '86').strip() if ' ' in text: idx = text.index(' ') code = re.sub('\++', '+', text[0:idx]) number = text[idx:].replace(' ', '').replace('+', '').strip() text = '{} {}'.format(code, number).strip() if re.match(r'^0+$', text) or text == 'nan': text = '' h['phone'] = text for div in doc.select('.MainBaseInfo .contactInfo .Line'): text = div.get_text().strip() if text.startswith('Web:') and len(div.select('a')) > 0: h['site_url'] = div.select('a')[0]['href'].strip() if div.select('a') else '' floor = '' rooms = [] for ul in doc.select('#meeting-room-chart ul'): room = {} if 'title' in ul['class']: continue elif 'floor' in ul['class']: li = ul.select('li.floorname') if len(li) > 0: floor = li[0].string elif 'rooms' in ul['class']: li = ul.select('li') if len(li) > 0: room['floor'] = rep(floor) room['name'] = rep(li[0].string) room['area'] = repq(li[2].string) room['height'] = repq(li[3].string) room['theatre'] = repq(li[4].string) room['classroom'] = repq(li[5].string) room['u_shape'] = repq(li[6].string) room['banquet'] = repq(li[7].string) room['cocktail'] = repq(li[8].string) room['boardroom'] = repq(li[9].string) rooms.append(room) if len(rooms) > 0: h['rooms'] = rooms except Exception as e: print('get_hotels_rooms() failed. ', e) items.append(h) # print(items) df = pd.DataFrame(items) df.to_csv('hotels_rooms.csv', encoding=ENCODING, index=False, header=True) def get_hotels_rooms_ext(): hotels = pd.read_csv('hotels_rooms.csv', encoding=ENCODING).to_dict('records') items = [] for h in hotels: try: if h['link'] == '': continue resp = requests.get('https://www.1000meetings.com.sg/meeting-destination/{}/{}/'.format(h['hotel_id'], h['slug'])) resp.encoding = ENCODING doc = BeautifulSoup(resp.text, 'html.parser') overview = doc.select('.hotel-details') if len(overview) > 0: name = '' for tag in overview[0].find_all(True): if tag.name == 'h4': name = tag.get_text() elif tag.name == 'p' and name == 'Hotel presentation': h['hotel_presentation'] = rep(tag.prettify()) name = '' elif tag.name == 'p' and name == 'Meeting Facilities': h['meeting_facilities'] = rep(tag.prettify()) name = '' images = doc.select('.portfolio-list') if len(images) > 0: img_arr = [] for img in images[0].select('img'): img_arr.append({'origin_name':quote(img['src'].strip().split('/')[-1])}) h['images'] = img_arr details = doc.select('table.hoteldetails tr') room_quantity = [] meeting_room_quantity = [] restaurant_remark = [] meeting_room_facilities = {} distances = [] topic = '' if len(details) > 0: for tr in details: if tr.find('th') and tr.find('th').find('h5'): topic = tr.find('th').find('h5').text.strip() elif tr.find('td') and tr.find('td').find('h5') and len(tr.select('td.type-info')) > 0: title = tr.find('td').find('h5').text.strip() value = tr.select('td.type-info')[0].text.strip() if topic == 'Room accommodation': if title == 'Total rooms': h['total_rooms'] = value elif title == 'Free wifi': h['hotel_free_wifi'] = '1' if value == 'Yes' else '0' else: room_quantity.append({'name':rep(title), 'quantity': repq(value)}) elif topic == 'Meeting space': if title == 'Meeting rooms': h['total_meeting_rooms'] = value elif title == 'Max surface': h['meeting_room_max_sqm'] = value.split(' ')[0] elif title == 'Max height': h['meeting_room_max_height'] = value.split(' ')[0] elif title == 'Projector resolution': h['projector_resolution'] = rep(value) elif title == 'Natural light': meeting_room_facilities['natural_light'] = '1' if value == 'Yes' else '0' elif title == 'Pillarless': meeting_room_facilities['pillarless'] = '1' if value == 'Yes' else '0' elif title == 'Outdoor space': meeting_room_facilities['outdoor'] = '1' if value == 'Yes' else '0' elif title == 'LED screen on-site': meeting_room_facilities['led_screen'] = '1' if value == 'Yes' else '0' elif title == 'Free wifi': meeting_room_facilities['wifi'] = '1' if value == 'Yes' else '0' elif title == 'VIP rooms': meeting_room_facilities['wifi'] = '1' if value == 'Yes' else '0' else: meeting_room_quantity.append({'name':rep(title), 'quantity': repq(value)}) elif topic == 'Restaurants': if title == 'Num restaurants': h['total_restaurants'] = value else: restaurant_remark.append({'name':rep(title), 'remark': rep(value)}) elif topic == 'Transportation': if title in ['Distance to airport', 'Distance to airport 2', 'Distance to train station', 'Distance to train station 2', 'Subway station', 'Subway station 2']: g = DISTANCE_PATTERN.findall(value) if len(g) == 0 or len(g[0]) < 4: continue name = rep(g[0][3].replace('- line', '').replace('-', '').replace('N/A', '')) distance = g[0][0] time = g[0][2] if len(name) == 0 or distance == '0' or time == '0': continue distances.append({'name': name, 'distance': distance + ('000' if g[0][1] == 'km' else ''), 'time': time}) if len(room_quantity) > 0: h['room_quantity'] = room_quantity if len(meeting_room_quantity) > 0: h['meeting_room_quantity'] = meeting_room_quantity if len(meeting_room_facilities) > 0: h['meeting_room_facilities'] = meeting_room_facilities if len(restaurant_remark) > 0: h['restaurant_remark'] = restaurant_remark if len(distances) > 0: h['distances'] = distances mds = doc.select('.slider-venue-display .col-md-3') if len(mds) > 0: for div in mds: html = re.sub(r'\n', '', div.prettify()) g = BRAND_PATTERN.findall(html) if len(g) > 0: h['brand'] = rep(g[0]) break h['attachments'] = ['https://www8.1000meetings.com/download/venue/pdf/{}/'.format(h['hotel_id'])] except Exception as e: print('get_hotels_rooms_ext() failed. ', e) items.append(h) # print(items) df = pd.DataFrame(items) df.to_csv('hotels_rooms_ext.csv', encoding=ENCODING, index=False, header=True) def repq(input): if not input: return '' input = input.strip() return '0' if input == '-' or input == '–' else input def rep(input): if not input: return '' return re.sub(r'\s+', ' ', input.replace('!', '!').replace('、', '/').replace('?', '?').replace(';', ';').replace('(', '(').replace(')', ')').replace(':', ':').replace('。', '.').replace(',', ',').replace('–', '-').replace('√', '-').replace('’', '\'').replace('‘', '\'').replace(' ', ' ').replace('“', '"').replace('”', '"')).strip().strip('/').strip() get_int = lambda v: 0 if not v or (isinstance(v, numbers.Number) and math.isnan(v)) else int(v) get_float = lambda v: 0 if not v or (isinstance(v, numbers.Number) and math.isnan(v)) else float(v) get_str = lambda v: '' if not v or (isinstance(v, numbers.Number) and math.isnan(v)) else '{}'.format(v) get_json = lambda v: '' if not v or (isinstance(v, numbers.Number) and math.isnan(v)) else json.dumps(v) get_bool = lambda v: v and not (isinstance(v, numbers.Number) and math.isnan(v)) and int(v) > 0 commit_buff_size = 500 session_maker = sessionmaker(bind=create_engine('mysql+pymysql://{}:{}@{}:{}/test'.format('root', '123456', 'localhost', 3306), echo=False, # 程序运行时反馈执行过程中的关键对象,包括ORM构建的sql语句 max_overflow=0, # 超过连接池大小外最多创建的连接 pool_size=5, # 连接池大小 pool_timeout=300, # 池中没有线程最多等待的时间,否则报错 pool_recycle=3600, # 多久之后对线程池中的线程进行一次连接的回收(重置) pool_pre_ping=True # 每次调用之前ping )) def insert_to_mysql(): hotels = pd.read_csv('hotels_rooms_ext.csv', encoding=ENCODING).to_dict('records') with session_maker() as session: i = 0 length = len(hotels) for h in hotels: i += 1 if not h['hotel_id'] or math.isnan(h['hotel_id']) or h['hotel_id'] <= 0: continue try: hotel = Hotel( hotel_id = h['hotel_id'], hotel_name = get_str(h['hotel_name']), link = get_str(h['link']), slug = get_str(h['slug']), country = get_str(h['country']), province = get_str(h['province']), city = get_str(h['city']), district = get_str(h['district']), address = get_str(h['address']), phone = get_str(h['phone']), site_url = get_str(h['site_url']), rooms = get_json(h['rooms']), hotel_presentation = get_str(h['hotel_presentation']), meeting_facilities = get_str(h['meeting_facilities']), images = get_json(h['images']), total_rooms = get_int(h['total_rooms']), hotel_free_wifi = get_bool(h['hotel_free_wifi']), total_meeting_rooms = get_int(h['total_meeting_rooms']), meeting_room_max_sqm = get_float(h['meeting_room_max_sqm']), meeting_room_max_height = get_float(h['meeting_room_max_height']), projector_resolution = get_str(h['projector_resolution']), total_restaurants = get_int(h['total_restaurants']), room_quantity = get_json(h['room_quantity']), meeting_room_quantity = get_json(h['meeting_room_quantity']), meeting_room_facilities = get_json(h['meeting_room_facilities']), restaurant_remark = get_json(h['restaurant_remark']), distances = get_json(h['distances']), brand = get_str(h['brand']), attachments = get_json(h['attachments']), # lon = get_float(h['lon']), # lat = get_float(h['lat']), # s_country = get_str(h['s_country']), # s_province = get_str(h['s_province']), # s_city = get_str(h['s_city']), # s_name = get_str(h['s_name']), # s_address = get_str(h['s_address']), # s_phone = get_str(h['s_phone']), # hop_id = get_int(h['hop_id']), # s_id = get_int(h['s_id']), # s_lon = get_float(h['s_lon']), # s_lat = get_float(h['s_lat']), # s_loc_type = get_int(h['s_loc_type']) ) session.add(hotel) if i % commit_buff_size == 0 or i >= length: session.commit() except Exception as e: print('insert_to_mysql() failed. ', e) def insert_saas_to_mysql(): hotels = pd.read_csv('saas_hotels.csv', encoding=ENCODING).to_dict('records') with session_maker() as session: i = 0 length = len(hotels) for h in hotels: i += 1 if not h['id'] or math.isnan(h['id']) or h['id'] <= 0: continue try: hotel = SaaSHotel( id = h['id'], hop_id = get_int(h['hop_id']), name = get_str(h['name']), en_name = get_str(h['en_name']), phone = get_str(h['phone']), address = get_str(h['address']), en_address = get_str(h['en_address']), lon = get_float(h['lon']), lat = get_float(h['lat']), loc_type = get_int(h['loc_type']), country = get_str(h['country']), province = get_str(h['province']), city = get_str(h['city']) ) session.add(hotel) if i % commit_buff_size == 0 or i >= length: session.commit() except Exception as e: print('insert_saas_to_mysql() failed. ', e) def get_loc(): pp1 = re.compile(r'^\+*0*0*[86-]*(\d{3,4})(\d{8})$') pp2 = re.compile(r'^\+*0*0*[86-]*(\d{3,4})(\d{8})-(.+)$') with session_maker() as session: hotels = session.query(Hotel).filter(Hotel.lon==None).all() i = 0 length = len(hotels) for h in hotels: i += 1 try: h.phone = re.sub(r'\s+', '-', h.phone) if h.country == 'China' and h.province not in ['Taiwan', 'Hong Kong', 'Macau']: g = pp1.findall(h.phone) if len(g) > 0 and len(g[0]) > 1: h.phone = '+86-{}-{}'.format(g[0][0], g[0][1]) g = pp2.findall(h.phone) if len(g) > 0 and len(g[0]) > 2: h.phone = '+86-{}-{}-{}'.format(g[0][0], g[0][1], g[0][2]) str = quote('{} {}'.format(h.hotel_name, h.address)) uri = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&language=en&key=AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo'.format(str) text = requests.get(uri).text res = json.loads(text).get('results') if res and len(res) > 0: geo = res[0] if geo and geo.get('geometry') and geo.get('geometry').get('location') and geo.get('geometry').get('location').get('lat') and geo.get('geometry').get('location').get('lng'): h.lon = float(geo.get('geometry').get('location').get('lng')) h.lat = float(geo.get('geometry').get('location').get('lat')) if i % 50 == 0: time.sleep(1) if i % commit_buff_size == 0 or i >= length: session.commit() except Exception as e: print('get_loc() failed. ', e) def convert_geo(): with session_maker() as session: hotels = session.query(Hotel).filter((Hotel.s_loc_type == 1 or Hotel.s_loc_type == 3) and Hotel.s_lon.is_not(None) and Hotel.s_lat.is_not(None)).all() i = 0 length = len(hotels) for h in hotels: i += 1 try: if h.s_loc_type == 3: h.s_lon, h.s_lat = gcj02_to_wgs84(h.s_lon, h.s_lat) h.s_loc_type = 2 elif h.s_loc_type == 1: h.s_lon, h.s_lat = bd09_to_wgs84(h.s_lon, h.s_lat) h.s_loc_type = 2 if i % commit_buff_size == 0 or i >= length: session.commit() except Exception as e: print('convert_geo() failed. ', e) x_pi = 3.14159265358979324 * 3000.0 / 180.0 pi = 3.1415926535897932384626 # π a = 6378245.0 # 长半轴 ee = 0.00669342162296594323 # 偏心率平方 ''' 输入(经度,维度) ''' def bd09_to_gcj02(bd_lon, bd_lat): """ 百度坐标系(BD-09)转火星坐标系(GCJ-02) 百度——>谷歌、高德 :param bd_lat:百度坐标纬度 :param bd_lon:百度坐标经度 :return:转换后的坐标列表形式 """ x = bd_lon - 0.0065 y = bd_lat - 0.006 z = math.sqrt(x * x + y * y) - 0.00002 * math.sin(y * x_pi) theta = math.atan2(y, x) - 0.000003 * math.cos(x * x_pi) gg_lng = z * math.cos(theta) gg_lat = z * math.sin(theta) return [gg_lng, gg_lat] def gcj02_to_wgs84(lng, lat): """ GCJ02(火星坐标系)转GPS84 :param lng:火星坐标系的经度 :param lat:火星坐标系纬度 :return: """ if out_of_china(lng, lat): return [lng, lat] dlat = _transformlat(lng - 105.0, lat - 35.0) dlng = _transformlng(lng - 105.0, lat - 35.0) radlat = lat / 180.0 * pi magic = math.sin(radlat) magic = 1 - ee * magic * magic sqrtmagic = math.sqrt(magic) dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi) dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi) mglat = lat + dlat mglng = lng + dlng return [lng * 2 - mglng, lat * 2 - mglat] def bd09_to_wgs84(bd_lon, bd_lat): lon, lat = bd09_to_gcj02(bd_lon, bd_lat) return gcj02_to_wgs84(lon, lat) def bd09_to_wgs84(bd_lon, bd_lat): lon, lat = bd09_to_gcj02(bd_lon, bd_lat) return gcj02_to_wgs84(lon, lat) def gcj02_to_bd09(lng, lat): """ 火星坐标系(GCJ-02)转百度坐标系(BD-09) 谷歌、高德——>百度 :param lng:火星坐标经度 :param lat:火星坐标纬度 :return: """ z = math.sqrt(lng * lng + lat * lat) + 0.00002 * math.sin(lat * x_pi) theta = math.atan2(lat, lng) + 0.000003 * math.cos(lng * x_pi) bd_lng = z * math.cos(theta) + 0.0065 bd_lat = z * math.sin(theta) + 0.006 return [bd_lng, bd_lat] def wgs84_to_gcj02(lng, lat): """ WGS84转GCJ02(火星坐标系) :param lng:WGS84坐标系的经度 :param lat:WGS84坐标系的纬度 :return: """ if out_of_china(lng, lat): # 判断是否在国内 return [lng, lat] dlat = _transformlat(lng - 105.0, lat - 35.0) dlng = _transformlng(lng - 105.0, lat - 35.0) radlat = lat / 180.0 * pi magic = math.sin(radlat) magic = 1 - ee * magic * magic sqrtmagic = math.sqrt(magic) dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi) dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi) mglat = lat + dlat mglng = lng + dlng return [mglng, mglat] def wgs84_to_bd09(lon, lat): lon, lat = wgs84_to_gcj02(lon, lat) return gcj02_to_bd09(lon, lat) def out_of_china(lng, lat): """ 判断是否在国内,不在国内不做偏移 :param lng: :param lat: :return: """ return not (lng > 73.66 and lng < 135.05 and lat > 3.86 and lat < 53.55) def _transformlng(lng, lat): ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng)) ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0 ret += (20.0 * math.sin(lng * pi) + 40.0 * math.sin(lng / 3.0 * pi)) * 2.0 / 3.0 ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 * math.sin(lng / 30.0 * pi)) * 2.0 / 3.0 return ret def _transformlat(lng, lat): ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng)) ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0 ret += (20.0 * math.sin(lat * pi) + 40.0 * math.sin(lat / 3.0 * pi)) * 2.0 / 3.0 ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 * math.sin(lat * pi / 30.0)) * 2.0 / 3.0 return ret Base = declarative_base() class Hotel(Base): __tablename__ = 'hotels' hotel_id = Column(Integer, primary_key=True) hotel_name = Column(String) link = Column(String) slug = Column(String) country = Column(String) province = Column(String) city = Column(String) district = Column(String) address = Column(String) phone = Column(String) site_url = Column(String) rooms = Column(JSON) hotel_presentation = Column(Text) meeting_facilities = Column(Text) images = Column(JSON) total_rooms = Column(Integer) hotel_free_wifi = Column(Boolean) total_meeting_rooms = Column(Integer) meeting_room_max_sqm = Column(Float) meeting_room_max_height = Column(Float) projector_resolution = Column(String) total_restaurants = Column(Integer) room_quantity = Column(JSON) meeting_room_quantity = Column(JSON) meeting_room_facilities = Column(JSON) restaurant_remark = Column(JSON) distances = Column(JSON) brand = Column(String) attachments = Column(JSON) lon = Column(Float) lat = Column(Float) s_country = Column(String) s_province = Column(String) s_city = Column(String) s_name = Column(String) s_address = Column(String) s_phone = Column(String) hop_id = Column(Integer) s_id = Column(BigInteger) s_lon = Column(Float) s_lat = Column(Float) s_loc_type = Column(Integer) class SaaSHotel(Base): __tablename__ = 'saas_hotel' id = Column(BigInteger, primary_key=True) hop_id = Column(Integer) name = Column(String) en_name = Column(String) phone = Column(String) address = Column(String) en_address = Column(String) lon = Column(Float) lat = Column(Float) loc_type = Column(Integer) country = Column(String) province = Column(String) city = Column(String)