#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import math
import numbers
import re
import time
from urllib.parse import quote
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, String, Text, BigInteger, Integer, Float, Boolean, JSON, create_engine
from sqlalchemy.orm import sessionmaker
BRAND_PATTERN = re.compile(r'>\s*Brand\s*
\s*\s*([^<>]+)\s*')
HOTEL_LINK_PATTERN = re.compile(r'/venue/(\d+)/([^/]+)/')
DISTANCE_PATTERN = re.compile(r'(\d+)\s*(\w+)\s*\((\d+)\s*\w+\s*\/\s*([^)]+)\)')
ENCODING = 'utf-8'
def test():
# get_hotels()
# get_hotels_rooms()
# get_hotels_rooms_ext()
# insert_to_mysql()
# get_loc()
convert_geo()
# insert_saas_to_mysql()
pass
def get_hotels():
items = []
for i in range(1, 241):
try:
resp = requests.get('https://www.1000meetings.com/en/area/1/world/p{}'.format(i))
resp.encoding = ENCODING
doc = BeautifulSoup(resp.text, 'html.parser')
for li in doc.select('#venuelist li'):
item = {}
a_arr = li.select('.venueheader .line1 .name a')
if len(a_arr) == 0:
continue
a = a_arr[0]
item['hotel_name'] = rep(a.string)
item['link'] = quote(a['href'].strip())
g = HOTEL_LINK_PATTERN.findall(item['link'])
if len(g) == 0 or len(g[0]) < 1:
continue
item['hotel_id'] = g[0][0]
item['slug'] = quote(g[0][1].strip())
area_arr = li.select('.venueheader .line1 .area a')
area_arr_len = len(area_arr)
if area_arr_len == 1:
item['country'] = area_arr[0].string.strip().strip('[').strip(']')
item['city'] = item['country']
elif area_arr_len == 2:
item['country'] = area_arr[0].string.strip().strip('[').strip(']')
item['city'] = area_arr[1].string.strip().strip('[').strip(']')
else:
item['country'] = area_arr[0].string.strip().strip('[').strip(']')
item['province'] = area_arr[1].string.strip().strip('[').strip(']')
if item['province'] == 'Hong Kong':
item['city'] = item['province']
item['district'] = area_arr[2].string.strip().strip('[').strip(']')
else:
item['city'] = area_arr[2].string.strip().strip('[').strip(']')
items.append(item)
except Exception as e:
print('get_hotels() failed. ', e)
# print(items)
df = pd.DataFrame(items)
df.to_csv('hotels.csv', encoding=ENCODING, index=False, header=True)
def get_hotels_rooms():
hotels = pd.read_csv('hotels.csv', encoding=ENCODING).to_dict('records')
items = []
for h in hotels:
try:
if h['link'] == '':
continue
resp = requests.get('https://www.1000meetings.com{}'.format(h['link']))
resp.encoding = ENCODING
doc = BeautifulSoup(resp.text, 'html.parser')
for li in doc.select('#frame-header #line2-left li'):
text = li.get_text().strip()
if text.startswith('Address:'):
text = text.lstrip('Address:')
h['address'] = rep(text)
elif text.startswith('Telephone:'):
text = text.lstrip('Telephone:').strip().replace('\n','')
text = re.sub(r'\s+', ' ', re.sub(r'[\t\(\)())]+', ' ', text)).replace('-', '').replace('ext.', '-').replace('、', '/').replace(',', '').replace('中国', '86').replace('China', '86').strip()
if ' ' in text:
idx = text.index(' ')
code = re.sub('\++', '+', text[0:idx])
number = text[idx:].replace(' ', '').replace('+', '').strip()
text = '{} {}'.format(code, number).strip()
if re.match(r'^0+$', text) or text == 'nan':
text = ''
h['phone'] = text
for div in doc.select('.MainBaseInfo .contactInfo .Line'):
text = div.get_text().strip()
if text.startswith('Web:') and len(div.select('a')) > 0:
h['site_url'] = div.select('a')[0]['href'].strip() if div.select('a') else ''
floor = ''
rooms = []
for ul in doc.select('#meeting-room-chart ul'):
room = {}
if 'title' in ul['class']:
continue
elif 'floor' in ul['class']:
li = ul.select('li.floorname')
if len(li) > 0:
floor = li[0].string
elif 'rooms' in ul['class']:
li = ul.select('li')
if len(li) > 0:
room['floor'] = rep(floor)
room['name'] = rep(li[0].string)
room['area'] = repq(li[2].string)
room['height'] = repq(li[3].string)
room['theatre'] = repq(li[4].string)
room['classroom'] = repq(li[5].string)
room['u_shape'] = repq(li[6].string)
room['banquet'] = repq(li[7].string)
room['cocktail'] = repq(li[8].string)
room['boardroom'] = repq(li[9].string)
rooms.append(room)
if len(rooms) > 0:
h['rooms'] = rooms
except Exception as e:
print('get_hotels_rooms() failed. ', e)
items.append(h)
# print(items)
df = pd.DataFrame(items)
df.to_csv('hotels_rooms.csv', encoding=ENCODING, index=False, header=True)
def get_hotels_rooms_ext():
hotels = pd.read_csv('hotels_rooms.csv', encoding=ENCODING).to_dict('records')
items = []
for h in hotels:
try:
if h['link'] == '':
continue
resp = requests.get('https://www.1000meetings.com.sg/meeting-destination/{}/{}/'.format(h['hotel_id'], h['slug']))
resp.encoding = ENCODING
doc = BeautifulSoup(resp.text, 'html.parser')
overview = doc.select('.hotel-details')
if len(overview) > 0:
name = ''
for tag in overview[0].find_all(True):
if tag.name == 'h4':
name = tag.get_text()
elif tag.name == 'p' and name == 'Hotel presentation':
h['hotel_presentation'] = rep(tag.prettify())
name = ''
elif tag.name == 'p' and name == 'Meeting Facilities':
h['meeting_facilities'] = rep(tag.prettify())
name = ''
images = doc.select('.portfolio-list')
if len(images) > 0:
img_arr = []
for img in images[0].select('img'):
img_arr.append({'origin_name':quote(img['src'].strip().split('/')[-1])})
h['images'] = img_arr
details = doc.select('table.hoteldetails tr')
room_quantity = []
meeting_room_quantity = []
restaurant_remark = []
meeting_room_facilities = {}
distances = []
topic = ''
if len(details) > 0:
for tr in details:
if tr.find('th') and tr.find('th').find('h5'):
topic = tr.find('th').find('h5').text.strip()
elif tr.find('td') and tr.find('td').find('h5') and len(tr.select('td.type-info')) > 0:
title = tr.find('td').find('h5').text.strip()
value = tr.select('td.type-info')[0].text.strip()
if topic == 'Room accommodation':
if title == 'Total rooms':
h['total_rooms'] = value
elif title == 'Free wifi':
h['hotel_free_wifi'] = '1' if value == 'Yes' else '0'
else:
room_quantity.append({'name':rep(title), 'quantity': repq(value)})
elif topic == 'Meeting space':
if title == 'Meeting rooms':
h['total_meeting_rooms'] = value
elif title == 'Max surface':
h['meeting_room_max_sqm'] = value.split(' ')[0]
elif title == 'Max height':
h['meeting_room_max_height'] = value.split(' ')[0]
elif title == 'Projector resolution':
h['projector_resolution'] = rep(value)
elif title == 'Natural light':
meeting_room_facilities['natural_light'] = '1' if value == 'Yes' else '0'
elif title == 'Pillarless':
meeting_room_facilities['pillarless'] = '1' if value == 'Yes' else '0'
elif title == 'Outdoor space':
meeting_room_facilities['outdoor'] = '1' if value == 'Yes' else '0'
elif title == 'LED screen on-site':
meeting_room_facilities['led_screen'] = '1' if value == 'Yes' else '0'
elif title == 'Free wifi':
meeting_room_facilities['wifi'] = '1' if value == 'Yes' else '0'
elif title == 'VIP rooms':
meeting_room_facilities['wifi'] = '1' if value == 'Yes' else '0'
else:
meeting_room_quantity.append({'name':rep(title), 'quantity': repq(value)})
elif topic == 'Restaurants':
if title == 'Num restaurants':
h['total_restaurants'] = value
else:
restaurant_remark.append({'name':rep(title), 'remark': rep(value)})
elif topic == 'Transportation':
if title in ['Distance to airport', 'Distance to airport 2', 'Distance to train station', 'Distance to train station 2', 'Subway station', 'Subway station 2']:
g = DISTANCE_PATTERN.findall(value)
if len(g) == 0 or len(g[0]) < 4:
continue
name = rep(g[0][3].replace('- line', '').replace('-', '').replace('N/A', ''))
distance = g[0][0]
time = g[0][2]
if len(name) == 0 or distance == '0' or time == '0':
continue
distances.append({'name': name, 'distance': distance + ('000' if g[0][1] == 'km' else ''), 'time': time})
if len(room_quantity) > 0:
h['room_quantity'] = room_quantity
if len(meeting_room_quantity) > 0:
h['meeting_room_quantity'] = meeting_room_quantity
if len(meeting_room_facilities) > 0:
h['meeting_room_facilities'] = meeting_room_facilities
if len(restaurant_remark) > 0:
h['restaurant_remark'] = restaurant_remark
if len(distances) > 0:
h['distances'] = distances
mds = doc.select('.slider-venue-display .col-md-3')
if len(mds) > 0:
for div in mds:
html = re.sub(r'\n', '', div.prettify())
g = BRAND_PATTERN.findall(html)
if len(g) > 0:
h['brand'] = rep(g[0])
break
h['attachments'] = ['https://www8.1000meetings.com/download/venue/pdf/{}/'.format(h['hotel_id'])]
except Exception as e:
print('get_hotels_rooms_ext() failed. ', e)
items.append(h)
# print(items)
df = pd.DataFrame(items)
df.to_csv('hotels_rooms_ext.csv', encoding=ENCODING, index=False, header=True)
def repq(input):
if not input:
return ''
input = input.strip()
return '0' if input == '-' or input == '–' else input
def rep(input):
if not input:
return ''
return re.sub(r'\s+', ' ', input.replace('!', '!').replace('、', '/').replace('?', '?').replace(';', ';').replace('(', '(').replace(')', ')').replace(':', ':').replace('。', '.').replace(',', ',').replace('–', '-').replace('√', '-').replace('’', '\'').replace('‘', '\'').replace(' ', ' ').replace('“', '"').replace('”', '"')).strip().strip('/').strip()
get_int = lambda v: 0 if not v or (isinstance(v, numbers.Number) and math.isnan(v)) else int(v)
get_float = lambda v: 0 if not v or (isinstance(v, numbers.Number) and math.isnan(v)) else float(v)
get_str = lambda v: '' if not v or (isinstance(v, numbers.Number) and math.isnan(v)) else '{}'.format(v)
get_json = lambda v: '' if not v or (isinstance(v, numbers.Number) and math.isnan(v)) else json.dumps(v)
get_bool = lambda v: v and not (isinstance(v, numbers.Number) and math.isnan(v)) and int(v) > 0
commit_buff_size = 500
session_maker = sessionmaker(bind=create_engine('mysql+pymysql://{}:{}@{}:{}/test'.format('root', '123456', 'localhost', 3306),
echo=False, # 程序运行时反馈执行过程中的关键对象,包括ORM构建的sql语句
max_overflow=0, # 超过连接池大小外最多创建的连接
pool_size=5, # 连接池大小
pool_timeout=300, # 池中没有线程最多等待的时间,否则报错
pool_recycle=3600, # 多久之后对线程池中的线程进行一次连接的回收(重置)
pool_pre_ping=True # 每次调用之前ping
))
def insert_to_mysql():
hotels = pd.read_csv('hotels_rooms_ext.csv', encoding=ENCODING).to_dict('records')
with session_maker() as session:
i = 0
length = len(hotels)
for h in hotels:
i += 1
if not h['hotel_id'] or math.isnan(h['hotel_id']) or h['hotel_id'] <= 0:
continue
try:
hotel = Hotel(
hotel_id = h['hotel_id'],
hotel_name = get_str(h['hotel_name']),
link = get_str(h['link']),
slug = get_str(h['slug']),
country = get_str(h['country']),
province = get_str(h['province']),
city = get_str(h['city']),
district = get_str(h['district']),
address = get_str(h['address']),
phone = get_str(h['phone']),
site_url = get_str(h['site_url']),
rooms = get_json(h['rooms']),
hotel_presentation = get_str(h['hotel_presentation']),
meeting_facilities = get_str(h['meeting_facilities']),
images = get_json(h['images']),
total_rooms = get_int(h['total_rooms']),
hotel_free_wifi = get_bool(h['hotel_free_wifi']),
total_meeting_rooms = get_int(h['total_meeting_rooms']),
meeting_room_max_sqm = get_float(h['meeting_room_max_sqm']),
meeting_room_max_height = get_float(h['meeting_room_max_height']),
projector_resolution = get_str(h['projector_resolution']),
total_restaurants = get_int(h['total_restaurants']),
room_quantity = get_json(h['room_quantity']),
meeting_room_quantity = get_json(h['meeting_room_quantity']),
meeting_room_facilities = get_json(h['meeting_room_facilities']),
restaurant_remark = get_json(h['restaurant_remark']),
distances = get_json(h['distances']),
brand = get_str(h['brand']),
attachments = get_json(h['attachments']),
# lon = get_float(h['lon']),
# lat = get_float(h['lat']),
# s_country = get_str(h['s_country']),
# s_province = get_str(h['s_province']),
# s_city = get_str(h['s_city']),
# s_name = get_str(h['s_name']),
# s_address = get_str(h['s_address']),
# s_phone = get_str(h['s_phone']),
# hop_id = get_int(h['hop_id']),
# s_id = get_int(h['s_id']),
# s_lon = get_float(h['s_lon']),
# s_lat = get_float(h['s_lat']),
# s_loc_type = get_int(h['s_loc_type'])
)
session.add(hotel)
if i % commit_buff_size == 0 or i >= length:
session.commit()
except Exception as e:
print('insert_to_mysql() failed. ', e)
def insert_saas_to_mysql():
hotels = pd.read_csv('saas_hotels.csv', encoding=ENCODING).to_dict('records')
with session_maker() as session:
i = 0
length = len(hotels)
for h in hotels:
i += 1
if not h['id'] or math.isnan(h['id']) or h['id'] <= 0:
continue
try:
hotel = SaaSHotel(
id = h['id'],
hop_id = get_int(h['hop_id']),
name = get_str(h['name']),
en_name = get_str(h['en_name']),
phone = get_str(h['phone']),
address = get_str(h['address']),
en_address = get_str(h['en_address']),
lon = get_float(h['lon']),
lat = get_float(h['lat']),
loc_type = get_int(h['loc_type']),
country = get_str(h['country']),
province = get_str(h['province']),
city = get_str(h['city'])
)
session.add(hotel)
if i % commit_buff_size == 0 or i >= length:
session.commit()
except Exception as e:
print('insert_saas_to_mysql() failed. ', e)
def get_loc():
pp1 = re.compile(r'^\+*0*0*[86-]*(\d{3,4})(\d{8})$')
pp2 = re.compile(r'^\+*0*0*[86-]*(\d{3,4})(\d{8})-(.+)$')
with session_maker() as session:
hotels = session.query(Hotel).filter(Hotel.lon==None).all()
i = 0
length = len(hotels)
for h in hotels:
i += 1
try:
h.phone = re.sub(r'\s+', '-', h.phone)
if h.country == 'China' and h.province not in ['Taiwan', 'Hong Kong', 'Macau']:
g = pp1.findall(h.phone)
if len(g) > 0 and len(g[0]) > 1:
h.phone = '+86-{}-{}'.format(g[0][0], g[0][1])
g = pp2.findall(h.phone)
if len(g) > 0 and len(g[0]) > 2:
h.phone = '+86-{}-{}-{}'.format(g[0][0], g[0][1], g[0][2])
str = quote('{} {}'.format(h.hotel_name, h.address))
uri = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&language=en&key=AIzaSyD0OfQuI1qV-VgTbVS8253RuU7Kt3ohtFo'.format(str)
text = requests.get(uri).text
res = json.loads(text).get('results')
if res and len(res) > 0:
geo = res[0]
if geo and geo.get('geometry') and geo.get('geometry').get('location') and geo.get('geometry').get('location').get('lat') and geo.get('geometry').get('location').get('lng'):
h.lon = float(geo.get('geometry').get('location').get('lng'))
h.lat = float(geo.get('geometry').get('location').get('lat'))
if i % 50 == 0:
time.sleep(1)
if i % commit_buff_size == 0 or i >= length:
session.commit()
except Exception as e:
print('get_loc() failed. ', e)
def convert_geo():
with session_maker() as session:
hotels = session.query(Hotel).filter((Hotel.s_loc_type == 1 or Hotel.s_loc_type == 3) and Hotel.s_lon.is_not(None) and Hotel.s_lat.is_not(None)).all()
i = 0
length = len(hotels)
for h in hotels:
i += 1
try:
if h.s_loc_type == 3:
h.s_lon, h.s_lat = gcj02_to_wgs84(h.s_lon, h.s_lat)
h.s_loc_type = 2
elif h.s_loc_type == 1:
h.s_lon, h.s_lat = bd09_to_wgs84(h.s_lon, h.s_lat)
h.s_loc_type = 2
if i % commit_buff_size == 0 or i >= length:
session.commit()
except Exception as e:
print('convert_geo() failed. ', e)
x_pi = 3.14159265358979324 * 3000.0 / 180.0
pi = 3.1415926535897932384626 # π
a = 6378245.0 # 长半轴
ee = 0.00669342162296594323 # 偏心率平方
'''
输入(经度,维度)
'''
def bd09_to_gcj02(bd_lon, bd_lat):
"""
百度坐标系(BD-09)转火星坐标系(GCJ-02)
百度——>谷歌、高德
:param bd_lat:百度坐标纬度
:param bd_lon:百度坐标经度
:return:转换后的坐标列表形式
"""
x = bd_lon - 0.0065
y = bd_lat - 0.006
z = math.sqrt(x * x + y * y) - 0.00002 * math.sin(y * x_pi)
theta = math.atan2(y, x) - 0.000003 * math.cos(x * x_pi)
gg_lng = z * math.cos(theta)
gg_lat = z * math.sin(theta)
return [gg_lng, gg_lat]
def gcj02_to_wgs84(lng, lat):
"""
GCJ02(火星坐标系)转GPS84
:param lng:火星坐标系的经度
:param lat:火星坐标系纬度
:return:
"""
if out_of_china(lng, lat):
return [lng, lat]
dlat = _transformlat(lng - 105.0, lat - 35.0)
dlng = _transformlng(lng - 105.0, lat - 35.0)
radlat = lat / 180.0 * pi
magic = math.sin(radlat)
magic = 1 - ee * magic * magic
sqrtmagic = math.sqrt(magic)
dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
mglat = lat + dlat
mglng = lng + dlng
return [lng * 2 - mglng, lat * 2 - mglat]
def bd09_to_wgs84(bd_lon, bd_lat):
lon, lat = bd09_to_gcj02(bd_lon, bd_lat)
return gcj02_to_wgs84(lon, lat)
def bd09_to_wgs84(bd_lon, bd_lat):
lon, lat = bd09_to_gcj02(bd_lon, bd_lat)
return gcj02_to_wgs84(lon, lat)
def gcj02_to_bd09(lng, lat):
"""
火星坐标系(GCJ-02)转百度坐标系(BD-09) 谷歌、高德——>百度
:param lng:火星坐标经度
:param lat:火星坐标纬度
:return:
"""
z = math.sqrt(lng * lng + lat * lat) + 0.00002 * math.sin(lat * x_pi)
theta = math.atan2(lat, lng) + 0.000003 * math.cos(lng * x_pi)
bd_lng = z * math.cos(theta) + 0.0065
bd_lat = z * math.sin(theta) + 0.006
return [bd_lng, bd_lat]
def wgs84_to_gcj02(lng, lat):
"""
WGS84转GCJ02(火星坐标系)
:param lng:WGS84坐标系的经度
:param lat:WGS84坐标系的纬度
:return:
"""
if out_of_china(lng, lat): # 判断是否在国内
return [lng, lat]
dlat = _transformlat(lng - 105.0, lat - 35.0)
dlng = _transformlng(lng - 105.0, lat - 35.0)
radlat = lat / 180.0 * pi
magic = math.sin(radlat)
magic = 1 - ee * magic * magic
sqrtmagic = math.sqrt(magic)
dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
mglat = lat + dlat
mglng = lng + dlng
return [mglng, mglat]
def wgs84_to_bd09(lon, lat):
lon, lat = wgs84_to_gcj02(lon, lat)
return gcj02_to_bd09(lon, lat)
def out_of_china(lng, lat):
"""
判断是否在国内,不在国内不做偏移
:param lng:
:param lat:
:return:
"""
return not (lng > 73.66 and lng < 135.05 and lat > 3.86 and lat < 53.55)
def _transformlng(lng, lat):
ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng))
ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0
ret += (20.0 * math.sin(lng * pi) + 40.0 * math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 * math.sin(lng / 30.0 * pi)) * 2.0 / 3.0
return ret
def _transformlat(lng, lat):
ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng))
ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0
ret += (20.0 * math.sin(lat * pi) + 40.0 * math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 * math.sin(lat * pi / 30.0)) * 2.0 / 3.0
return ret
Base = declarative_base()
class Hotel(Base):
__tablename__ = 'hotels'
hotel_id = Column(Integer, primary_key=True)
hotel_name = Column(String)
link = Column(String)
slug = Column(String)
country = Column(String)
province = Column(String)
city = Column(String)
district = Column(String)
address = Column(String)
phone = Column(String)
site_url = Column(String)
rooms = Column(JSON)
hotel_presentation = Column(Text)
meeting_facilities = Column(Text)
images = Column(JSON)
total_rooms = Column(Integer)
hotel_free_wifi = Column(Boolean)
total_meeting_rooms = Column(Integer)
meeting_room_max_sqm = Column(Float)
meeting_room_max_height = Column(Float)
projector_resolution = Column(String)
total_restaurants = Column(Integer)
room_quantity = Column(JSON)
meeting_room_quantity = Column(JSON)
meeting_room_facilities = Column(JSON)
restaurant_remark = Column(JSON)
distances = Column(JSON)
brand = Column(String)
attachments = Column(JSON)
lon = Column(Float)
lat = Column(Float)
s_country = Column(String)
s_province = Column(String)
s_city = Column(String)
s_name = Column(String)
s_address = Column(String)
s_phone = Column(String)
hop_id = Column(Integer)
s_id = Column(BigInteger)
s_lon = Column(Float)
s_lat = Column(Float)
s_loc_type = Column(Integer)
class SaaSHotel(Base):
__tablename__ = 'saas_hotel'
id = Column(BigInteger, primary_key=True)
hop_id = Column(Integer)
name = Column(String)
en_name = Column(String)
phone = Column(String)
address = Column(String)
en_address = Column(String)
lon = Column(Float)
lat = Column(Float)
loc_type = Column(Integer)
country = Column(String)
province = Column(String)
city = Column(String)