郑毅 3 months ago
parent
commit
12f9095593
4 changed files with 288 additions and 0 deletions
  1. 2 0
      .gitignore
  2. 132 0
      spider_test/create_table.sql
  3. 1 0
      spider_test/hotels_202412101815.sql
  4. 153 0
      spider_test/querys.sql

+ 2 - 0
.gitignore

@@ -6,3 +6,5 @@ output
 vendor
 .DS_Store
 data
+*.zip
+*.7z

+ 132 - 0
spider_test/create_table.sql

@@ -0,0 +1,132 @@
+-- test.hotels definition
+
+CREATE TABLE `hotels` (
+  `hotel_id` int NOT NULL,
+  `hotel_name` varchar(256) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `link` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `slug` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `country` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `province` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `city` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `district` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `address` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `phone` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `site_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `rooms` json DEFAULT NULL,
+  `hotel_presentation` text,
+  `meeting_facilities` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci,
+  `images` json DEFAULT NULL,
+  `total_rooms` int DEFAULT NULL,
+  `hotel_free_wifi` tinyint(1) DEFAULT NULL,
+  `total_meeting_rooms` int DEFAULT NULL,
+  `meeting_room_max_sqm` float DEFAULT NULL,
+  `meeting_room_max_height` float DEFAULT NULL,
+  `projector_resolution` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `total_restaurants` int DEFAULT NULL,
+  `room_quantity` json DEFAULT NULL,
+  `meeting_room_quantity` json DEFAULT NULL,
+  `meeting_room_facilities` json DEFAULT NULL,
+  `restaurant_remark` json DEFAULT NULL,
+  `distances` json DEFAULT NULL,
+  `brand` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `attachments` json DEFAULT NULL,
+  `lon` decimal(12,8) DEFAULT NULL,
+  `lat` decimal(12,8) DEFAULT NULL,
+  `s_country` varchar(64) DEFAULT NULL,
+  `s_province` varchar(64) DEFAULT NULL,
+  `s_city` varchar(64) DEFAULT NULL,
+  `s_name` varchar(256) DEFAULT NULL,
+  `s_address` varchar(512) DEFAULT NULL,
+  `s_phone` varchar(64) DEFAULT NULL,
+  `hop_id` int DEFAULT NULL,
+  `s_id` bigint DEFAULT NULL,
+  `s_lon` decimal(12,8) DEFAULT NULL,
+  `s_lat` decimal(12,8) DEFAULT NULL,
+  `s_loc_type` tinyint DEFAULT NULL,
+  PRIMARY KEY (`hotel_id`),
+  KEY `hotels_hotel_id_IDX` (`hotel_id`) USING BTREE,
+  KEY `hotels_phone_IDX` (`phone`) USING BTREE,
+  KEY `hotels_hotel_name_IDX` (`hotel_name`) USING BTREE,
+  KEY `hotels_address_IDX` (`address`) USING BTREE,
+  KEY `hotels_country_IDX` (`country`) USING BTREE,
+  KEY `hotels_city_IDX` (`city`) USING BTREE,
+  KEY `hotels_s_id_IDX` (`s_id`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+
+-- test.hotels_temp definition
+
+CREATE TABLE `hotels_temp` (
+  `hotel_id` int NOT NULL,
+  `hotel_name` varchar(256) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `link` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `slug` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `country` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `province` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `city` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `district` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `address` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `phone` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `site_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `rooms` json DEFAULT NULL,
+  `hotel_presentation` text,
+  `meeting_facilities` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci,
+  `images` json DEFAULT NULL,
+  `total_rooms` int DEFAULT NULL,
+  `hotel_free_wifi` tinyint(1) DEFAULT NULL,
+  `total_meeting_rooms` int DEFAULT NULL,
+  `meeting_room_max_sqm` float DEFAULT NULL,
+  `meeting_room_max_height` float DEFAULT NULL,
+  `projector_resolution` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `total_restaurants` int DEFAULT NULL,
+  `room_quantity` json DEFAULT NULL,
+  `meeting_room_quantity` json DEFAULT NULL,
+  `meeting_room_facilities` json DEFAULT NULL,
+  `restaurant_remark` json DEFAULT NULL,
+  `distances` json DEFAULT NULL,
+  `brand` varchar(128) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `attachments` json DEFAULT NULL,
+  `lon` decimal(12,8) DEFAULT NULL,
+  `lat` decimal(12,8) DEFAULT NULL,
+  `s_country` varchar(64) DEFAULT NULL,
+  `s_province` varchar(64) DEFAULT NULL,
+  `s_city` varchar(64) DEFAULT NULL,
+  `s_name` varchar(256) DEFAULT NULL,
+  `s_address` varchar(512) DEFAULT NULL,
+  `s_phone` varchar(64) DEFAULT NULL,
+  `hop_id` int DEFAULT NULL,
+  `s_id` bigint DEFAULT NULL,
+  `s_lon` decimal(12,8) DEFAULT NULL,
+  `s_lat` decimal(12,8) DEFAULT NULL,
+  `s_loc_type` tinyint DEFAULT NULL,
+  PRIMARY KEY (`hotel_id`),
+  KEY `hotels_hotel_id_IDX` (`hotel_id`) USING BTREE,
+  KEY `hotels_phone_IDX` (`phone`) USING BTREE,
+  KEY `hotels_hotel_name_IDX` (`hotel_name`) USING BTREE,
+  KEY `hotels_address_IDX` (`address`) USING BTREE,
+  KEY `hotels_country_IDX` (`country`) USING BTREE,
+  KEY `hotels_city_IDX` (`city`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+
+-- test.saas_hotel definition
+
+CREATE TABLE `saas_hotel` (
+  `id` bigint NOT NULL,
+  `hop_id` int NOT NULL,
+  `name` varchar(256) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `en_name` varchar(256) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `phone` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `address` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `en_address` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `lon` decimal(12,8) DEFAULT NULL,
+  `lat` decimal(12,8) DEFAULT NULL,
+  `loc_type` tinyint DEFAULT NULL,
+  `country` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `province` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  `city` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  KEY `saas_hotel_en_name_IDX` (`en_name`) USING BTREE,
+  KEY `saas_hotel_phone_IDX` (`phone`) USING BTREE,
+  KEY `saas_hotel_en_address_IDX` (`en_address`) USING BTREE,
+  KEY `saas_hotel_country_IDX` (`country`) USING BTREE,
+  KEY `saas_hotel_city_IDX` (`city`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

File diff suppressed because it is too large
+ 1 - 0
spider_test/hotels_202412101815.sql


+ 153 - 0
spider_test/querys.sql

@@ -0,0 +1,153 @@
+SELECT COUNT(1) from hotels;
+SELECT COUNT(1) from hst_hotel;
+SELECT COUNT(1) from saas_hotel; 
+
+SELECT MAX(LENGTH(en_name)),MAX(LENGTH(phone)),MAX(LENGTH(en_address)) from saas_hotel;
+SELECT MAX(LENGTH(hotel_name)),MAX(LENGTH(phone)),MAX(LENGTH(address)) from hotels;
+
+SELECT city FROM hotels group by city;
+SELECT province FROM hotels group by province;
+
+SELECT * FROM hotels WHERE country<>'China' or city='Hong Kong' or city='Macau' or province='Taiwan' ORDER BY country,province,city;
+
+SELECT * from saas_hotel limit 10;
+SELECT * from saas_hotel WHERE country<>'中国' and country<>'其他' limit 10;
+SELECT * from hotels  WHERE province='Taiwan' limit 10;
+SELECT * from hotels  WHERE country<>'China' limit 10;
+SELECT phone, lon, lat from hotels WHERE hotel_id=118;
+UPDATE hotels SET lon=null,lat=null WHERE hotel_id=118;
+SELECT * FROM hotels WHERE lon is null;
+
+SELECT hotel_id, COUNT(1) from hotels group by hotel_id HAVING count(1)>1; 
+
+INSERT INTO hotels_temp
+SELECT h.hotel_id, h.hotel_name, h.link, h.slug, h.country, h.province, h.city, h.district, h.address, h.phone, h.site_url, h.rooms, h.hotel_presentation, h.meeting_facilities,
+h.images, h.total_rooms, h.hotel_free_wifi, h.total_meeting_rooms, h.meeting_room_max_sqm, h.meeting_room_max_height, h.projector_resolution, h.total_restaurants,
+h.room_quantity, h.meeting_room_quantity, h.meeting_room_facilities, h.restaurant_remark, h.distances, h.brand, h.attachments, h.lon, h.lat, 
+sh.country as `s_country`, sh.province as `s_province`, sh.city as `s_city`, sh.en_name as `s_name`, sh.en_address as `s_address`, sh.phone as `s_phone`, 
+sh.hop_id, sh.id as `s_id`, sh.lon as `s_lon`, sh.lat as `s_lat`, sh.loc_type as `s_loc_type`
+FROM hotels h 
+inner join saas_hotel sh on lower(trim(sh.en_name)) = lower(trim(h.hotel_name)) 
+WHERE h.s_id is null and h.hotel_id in 
+(
+	SELECT h.hotel_id FROM hotels h 
+	inner join saas_hotel sh on lower(trim(sh.en_name)) = lower(trim(h.hotel_name)) 
+	WHERE h.s_id is null AND h.hotel_id not in (11930, 16280, 24797, 24329)
+	GROUP BY h.hotel_id HAVING COUNT(1)=1
+) ORDER BY h.country, h.province, h.city, h.district
+;
+
+INSERT INTO hotels_temp
+SELECT h.hotel_id, h.hotel_name, h.link, h.slug, h.country, h.province, h.city, h.district, h.address, h.phone, h.site_url, h.rooms, h.hotel_presentation, h.meeting_facilities,
+h.images, h.total_rooms, h.hotel_free_wifi, h.total_meeting_rooms, h.meeting_room_max_sqm, h.meeting_room_max_height, h.projector_resolution, h.total_restaurants,
+h.room_quantity, h.meeting_room_quantity, h.meeting_room_facilities, h.restaurant_remark, h.distances, h.brand, h.attachments, h.lon, h.lat, 
+sh.country as `s_country`, sh.province as `s_province`, sh.city as `s_city`, sh.en_name as `s_name`, sh.en_address as `s_address`, sh.phone as `s_phone`, 
+sh.hop_id, sh.id as `s_id`, sh.lon as `s_lon`, sh.lat as `s_lat`, sh.loc_type as `s_loc_type`
+FROM hotels h 
+inner join saas_hotel sh on lower(trim(sh.en_address)) = lower(trim(h.address)) 
+WHERE h.s_id is null and h.hotel_id in 
+(
+	SELECT h.hotel_id FROM hotels h 
+	inner join saas_hotel sh on lower(trim(sh.en_address)) = lower(trim(h.address)) 
+	WHERE h.s_id is null
+	GROUP BY h.hotel_id HAVING COUNT(1)=1
+) ORDER BY h.country, h.province, h.city, h.district
+;
+
+INSERT INTO hotels_temp
+SELECT h.hotel_id, h.hotel_name, h.link, h.slug, h.country, h.province, h.city, h.district, h.address, h.phone, h.site_url, h.rooms, h.hotel_presentation, h.meeting_facilities,
+h.images, h.total_rooms, h.hotel_free_wifi, h.total_meeting_rooms, h.meeting_room_max_sqm, h.meeting_room_max_height, h.projector_resolution, h.total_restaurants,
+h.room_quantity, h.meeting_room_quantity, h.meeting_room_facilities, h.restaurant_remark, h.distances, h.brand, h.attachments, h.lon, h.lat, 
+sh.country as `s_country`, sh.province as `s_province`, sh.city as `s_city`, sh.en_name as `s_name`, sh.en_address as `s_address`, sh.phone as `s_phone`, 
+sh.hop_id, sh.id as `s_id`, sh.lon as `s_lon`, sh.lat as `s_lat`, sh.loc_type as `s_loc_type`
+FROM hotels h 
+inner join saas_hotel sh on REPLACE(trim(BOTH '+' FROM sh.phone), '-', '') = REPLACE(trim(BOTH '+' FROM h.phone), '-', '')
+WHERE h.s_id is null and h.hotel_id in 
+(
+	SELECT h.hotel_id FROM hotels h 
+	inner join saas_hotel sh on REPLACE(trim(BOTH '+' FROM sh.phone), '-', '') = REPLACE(trim(BOTH '+' FROM h.phone), '-', '')
+	WHERE h.s_id is null AND h.hotel_id not in (29842, 16632)
+	GROUP BY h.hotel_id HAVING COUNT(1)=1
+) ORDER BY h.country, h.province, h.city, h.district
+;
+
+SELECT h.hotel_id, h.hotel_name, h.link, h.slug, h.country, h.province, h.city, h.district, h.address, h.phone, h.site_url, h.rooms, h.hotel_presentation, h.meeting_facilities,
+h.images, h.total_rooms, h.hotel_free_wifi, h.total_meeting_rooms, h.meeting_room_max_sqm, h.meeting_room_max_height, h.projector_resolution, h.total_restaurants,
+h.room_quantity, h.meeting_room_quantity, h.meeting_room_facilities, h.restaurant_remark, h.distances, h.brand, h.attachments, h.lon, h.lat, 
+sh.country as `s_country`, sh.province as `s_province`, sh.city as `s_city`, sh.en_name as `s_name`, sh.en_address as `s_address`, sh.phone as `s_phone`, 
+sh.hop_id, sh.id as `s_id`, sh.lon as `s_lon`, sh.lat as `s_lat`, sh.loc_type as `s_loc_type`
+FROM hotels h 
+inner join saas_hotel sh on ROUND(6378.138 * 2 * ASIN(SQRT(
+                POW(SIN((h.lat * PI() / 180 - sh.lat * PI() / 180) / 2), 2) + 
+                COS(h.lat * PI() / 180) * COS(sh.lat * PI() / 180) * POW(SIN((h.lon * PI() / 180 - sh.lon * PI() / 180) / 2), 2)
+                )) * 1000) <= 200
+WHERE h.s_id is null and h.hotel_id in 
+(
+	SELECT h.hotel_id FROM hotels h 
+	inner join saas_hotel sh on ROUND(6378.138 * 2 * ASIN(SQRT(
+                POW(SIN((h.lat * PI() / 180 - sh.lat * PI() / 180) / 2), 2) + 
+                COS(h.lat * PI() / 180) * COS(sh.lat * PI() / 180) * POW(SIN((h.lon * PI() / 180 - sh.lon * PI() / 180) / 2), 2)
+                )) * 1000) <= 200
+	WHERE h.s_id is null and h.lon is not null
+	GROUP BY h.hotel_id HAVING COUNT(1)=1
+) ORDER BY h.country, h.province, h.city, h.district
+;
+
+INSERT INTO hotels_temp
+SELECT h.hotel_id, h.hotel_name, h.link, h.slug, h.country, h.province, h.city, h.district, h.address, h.phone, h.site_url, h.rooms, h.hotel_presentation, h.meeting_facilities,
+h.images, h.total_rooms, h.hotel_free_wifi, h.total_meeting_rooms, h.meeting_room_max_sqm, h.meeting_room_max_height, h.projector_resolution, h.total_restaurants,
+h.room_quantity, h.meeting_room_quantity, h.meeting_room_facilities, h.restaurant_remark, h.distances, h.brand, h.attachments, h.lon, h.lat, 
+sh.country as `s_country`, sh.province as `s_province`, sh.city as `s_city`, sh.en_name as `s_name`, sh.en_address as `s_address`, sh.phone as `s_phone`, 
+sh.hop_id, sh.id as `s_id`, sh.lon as `s_lon`, sh.lat as `s_lat`, sh.loc_type as `s_loc_type`
+FROM hotels h 
+inner join saas_hotel sh on sh.hop_id=h.hop_id 
+WHERE h.s_id is null and h.hop_id is not null and h.hotel_id in 
+(
+	SELECT h.hotel_id FROM hotels h 
+	inner join saas_hotel sh on sh.hop_id=h.hop_id 
+	WHERE h.s_id is null and h.hop_id is not null
+	GROUP BY h.hotel_id HAVING COUNT(1)=1
+) ORDER BY h.country, h.province, h.city, h.district
+;
+
+
+UPDATE hotels h 
+inner join hotels_temp a on a.hotel_id=h.hotel_id 
+SET h.s_country=a.s_country, h.s_province=a.s_province, h.s_city=a.s_city, h.s_name=a.s_name, h.s_address=a.s_address, h.s_phone=a.s_phone, 
+h.hop_id=a.hop_id, h.s_id=a.s_id, h.s_lon=a.s_lon, h.s_lat=a.s_lat, h.s_loc_type=a.s_loc_type
+;
+
+SELECT 
+ROUND(6378.138 * 2 * ASIN(SQRT(
+                POW(SIN((lat * PI() / 180 - s_lat * PI() / 180) / 2), 2) + 
+                COS(lat * PI() / 180) * COS(s_lat * PI() / 180) * POW(SIN((lon * PI() / 180 - s_lon * PI() / 180) / 2), 2)
+                )) * 1000) AS distance, h.*
+FROM hotels h WHERE lon is not null and s_lon is not null order by distance desc; 
+
+
+
+SELECT SUM(CASE WHEN s_id IS NULL THEN 0 ELSE 1 END) as `mapped`, COUNT(1) as `total` FROM hotels;
+SELECT SUM(CASE WHEN s_id IS NULL THEN 0 ELSE 1 END) as `mapped`, COUNT(1) as `total` FROM hotels WHERE country<>'China' or city='Hong Kong' or city='Macau' or province='Taiwan';
+SELECT SUM(CASE WHEN s_id IS NULL THEN 0 ELSE 1 END) as `mapped`, COUNT(1) as `total` FROM hotels WHERE country<>'China';
+SELECT * FROM hotels WHERE s_id IS NULL and country<>'China' ORDER BY country,province,city;
+
+SELECT s_id, COUNT(1) FROM hotels group by s_id HAVING count(1)>1;
+
+SELECT * FROM hotels h;
+
+SELECT
+h.hotel_id as `out_id`, h.hop_id, h.hotel_name, h.country, h.province, h.city, h.district, h.address, h.phone, h.site_url
+FROM hotels h WHERE s_id IS NULL ORDER BY country, province, city, district;
+
+SELECT ROUND(6378.138 * 2 * ASIN(SQRT(
+                POW(SIN((lat * PI() / 180 - s_lat * PI() / 180) / 2), 2) + 
+                COS(lat * PI() / 180) * COS(s_lat * PI() / 180) * POW(SIN((lon * PI() / 180 - s_lon * PI() / 180) / 2), 2)
+                )) * 1000) AS distance, h.* FROM hotels h WHERE s_id in (976695731874210468,976710868580475183,976710868672750357,976728804883997105)
+ORDER BY s_id
+;
+
+UPDATE hotels h 
+SET h.s_country=NULL, h.s_province=NULL, h.s_city=NULL, h.s_name=NULL, h.s_address=NULL, h.s_phone=NULL, 
+h.hop_id=NULL, h.s_id=NULL, h.s_lon=NULL, h.s_lat=NULL, h.s_loc_type=NULL
+WHERE h.hotel_id in(16666, 18710, 18962, 32776, 31214, 30610, 30638, 24437, 16826)
+;

Some files were not shown because too many files changed in this diff