Browse Source

base spider

marion 4 years ago
parent
commit
210739ca9a

+ 1 - 1
go.mod

@@ -3,7 +3,7 @@ module git.aionnect.com/hello-go
 go 1.14
 
 require (
-	git.aionnect.com/aionnect/go-common v0.0.0-20200522082556-9e4068ef1418
+	git.aionnect.com/aionnect/go-common v0.0.0-20200717113424-66ed7fb6428f
 	git.wanpinghui.com/WPH/go_common v1.2.4
 	github.com/PuerkitoBio/goquery v1.5.1
 	github.com/Shopify/sarama v1.26.3

+ 0 - 22
spider/common/common.go

@@ -1,10 +1,8 @@
 package common
 
 import (
-	"golang.org/x/text/encoding/simplifiedchinese"
 	"io"
 	"math/rand"
-	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -99,12 +97,6 @@ var (
 	}
 )
 
-/* -----------------------------------------------------global variable definitions-------------------------------------------------- */
-var (
-	gbkPattern, _ = regexp.Compile(`meta\s+http-equiv="Content-Type"\s+content="text/html;\s*charset=gbk"`) // matches a Content-Type meta tag declaring charset=gbk; the compile error is discarded
-	gbkDecoder    = simplifiedchinese.GBK.NewDecoder() // decoder for GBK-encoded bytes
-)
-
 /* -----------------------------------------------------线程安全字典-------------------------------------------------- */
 type ConcurrentMap struct {
 	data   map[string]interface{}
@@ -207,20 +199,6 @@ type Target struct {
 }
 
 /* -------------------------------------------------------工具函数---------------------------------------------------- */
-// EncodeTrans runs input through gbkDecoder when gbkPattern matches it; nil, empty, or non-matching input is returned unchanged.
-func EncodeTrans(input []byte) []byte {
-	if nil == input || len(input) == 0 {
-		return input
-	}
-
-	if gbkPattern.Match(input) {
-		output, _ := gbkDecoder.Bytes(input) // the decode error is discarded
-		return output
-	} else {
-		return input
-	}
-}
-
 // 随机秒数
 func RandSeconds() time.Duration {
 	rand.Seed(time.Now().UnixNano())

+ 4 - 5
spider/spiders/base_spider.go

@@ -69,8 +69,8 @@ func (ctx *spidersContext) router(target *common.Target) {
 
 // ISpider is the main spider interface abstraction.
 type ISpider interface {
-	Name() string                                                  // spider name
-	Parse(item interface{}, body []byte, headers http.Header) ([]*common.Target, error) // response parsing
+	Name() string                                                                           // spider name
+	Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) // response parsing
 }
 
 // 自定义爬虫请求接口抽象
@@ -140,7 +140,7 @@ func (s *Spider) invoke(target *common.Target) {
 	}
 	s.promise.SetHeader("User-Agent", common.RandUserAgent(target.IsCell)) // 随机User-Agent
 	s.promise.SetHttpProxy(s.proxyDao.Get())                               // 随机代理
-	body, err := s.promise.Call(target.Method, target.URL, target.Data)
+	body, resp, err := s.promise.CallResponse(target.Method, target.URL, target.Data)
 	if nil != err {
 		fmt.Printf("http request [%s] %s failed. %s\n", target.Method, target.URL, err.Error())
 		return
@@ -149,9 +149,8 @@ func (s *Spider) invoke(target *common.Target) {
 		fmt.Printf("empty http response [%s] %s", target.Method, target.URL)
 		return
 	}
-	body = common.EncodeTrans(body)
 	var nextTargets []*common.Target
-	nextTargets, err = s.Parse(target.Item, body, nil)
+	nextTargets, err = s.Parse(target.Item, body, resp)
 	if nil != err {
 		fmt.Println(err.Error())
 		return

+ 1 - 1
spider/spiders/kuaidaili/free_paging.go

@@ -39,7 +39,7 @@ func (s *FreePagingSpider) Name() string {
 }
 
 // 响应解析
-func (s *FreePagingSpider) Parse(item interface{}, body []byte, headers http.Header) ([]*common.Target, error) {
+func (s *FreePagingSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
 	params := item.(*common.ProxyPagingParams)
 	if params.PagingNo == 2 { // 只爬一页,仅测试时使用
 		return nil, nil

+ 24 - 13
spider/spiders/qianlima/visitor_detail.go

@@ -24,14 +24,14 @@ const VisitorDetail = "VisitorDetail"
 /* -------------------------------------------------------全局变量---------------------------------------------------- */
 var (
 	visitorDetailPromise = utils.NewRequest(). // request for the detail page
-		SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
-		SetHeader("Accept-Encoding", "gzip, deflate").
-		SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
-		SetHeader("Cache-Control", "max-age=0").
-		SetHeader("Connection", "keep-alive").
-		SetHeader("Host", "www.qianlima.com").
-		SetHeader("Upgrade-Insecure-Requests", "1").
-		SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
+							SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
+							SetHeader("Accept-Encoding", "gzip, deflate").
+							SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
+							SetHeader("Cache-Control", "max-age=0").
+							SetHeader("Connection", "keep-alive").
+							SetHeader("Host", "www.qianlima.com").
+							SetHeader("Upgrade-Insecure-Requests", "1").
+							SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
 )
 
 /* -------------------------------------------------------爬虫实现---------------------------------------------------- */
@@ -73,10 +73,16 @@ func (s *VisitorDetailSpider) Name() string {
 }
 
 // 响应解析
-func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, headers http.Header) ([]*common.Target, error) {
-	content, ok := item.(*items.Content)
-	if !ok {
-		return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
+func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
+	var content *items.Content
+	if nil == item {
+		content = &items.Content{}
+	} else {
+		var ok bool
+		content, ok = item.(*items.Content)
+		if !ok {
+			return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
+		}
 	}
 
 	// 数据解析
@@ -96,9 +102,14 @@ func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, headers http.
 	wen = strings.Replace(wen, "<input type=\"hidden\" id=\"zburl\" value=\"\"/>", "", -1)
 	wen = spacePattern.ReplaceAllString(wen, "")
 	content.Text = wen
+
 	// 数据解析后处理逻辑
 	s.resultMap.Set(strconv.Itoa(content.ContentId), nil) // 暂存结果到字典中(暂时无用,仅做进度打印,非核心代码)
-	s.exportToDB(content)                                 // 逐条储存结果到数据库
+	if content.ContentId == 0 {                           // 通常仅当测试未传递Item参数时会出现此情况
+		fmt.Printf("empty content id %+v\n", content)
+		return nil, nil
+	}
+	s.exportToDB(content) // 逐条储存结果到数据库
 	return nil, nil
 }
 

+ 10 - 10
spider/spiders/qianlima/visitor_paging.go

@@ -23,15 +23,15 @@ const VisitorPaging = "VisitorPaging"
 /* -------------------------------------------------------全局变量---------------------------------------------------- */
 var (
 	visitorPagingPromise = utils.NewRequest(). // request object for the paging endpoint
-		SetHeader("Accept", "*/*").
-		SetHeader("Accept-Encoding", "gzip, deflate").
-		SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
-		SetHeader("Connection", "keep-alive").
-		SetHeader("Content-Length", "0").
-		SetHeader("Content-Type", "application/x-www-form-unlencoded").
-		SetHeader("Host", "search.qianlima.com").
-		SetHeader("Origin", "http://search.qianlima.com").
-		SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
+				SetHeader("Accept", "*/*").
+				SetHeader("Accept-Encoding", "gzip, deflate").
+				SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
+				SetHeader("Connection", "keep-alive").
+				SetHeader("Content-Length", "0").
+				SetHeader("Content-Type", "application/x-www-form-urlencoded").
+				SetHeader("Host", "search.qianlima.com").
+				SetHeader("Origin", "http://search.qianlima.com").
+				SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
 	spacePattern, _ = regexp.Compile(`\n\s*`) // regexp matching a newline plus any following whitespace; the compile error is discarded
 )
 
@@ -57,7 +57,7 @@ func (s *VisitorPagingSpider) Name() string {
 }
 
 // 响应解析
-func (s *VisitorPagingSpider) Parse(item interface{}, body []byte, headers http.Header) ([]*common.Target, error) {
+func (s *VisitorPagingSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
 	params := item.(*items.PagingParams)
 	//if params.PagingNo == 2 { // 只爬一页,仅测试时使用
 	//	return nil, nil

+ 15 - 0
spider/spiders/qianlima/visitor_test.go

@@ -26,3 +26,18 @@ func TestVisitorSpider(t *testing.T) {
 	)
 	fmt.Println("Done!")
 }
+
+func TestVisitorDetailSpider(t *testing.T) {
+	// Runs the detail-page spider on its own, mainly to check that pages in different encodings are parsed correctly.
+	// No Item is passed, so nothing is written to the database.
+	targets := []*common.Target{
+		{Key: VisitorDetail, Method: http.MethodGet, URL: "http://www.qianlima.com/zb/detail/20200710_186393093.html"}, // UTF8
+		{Key: VisitorDetail, Method: http.MethodGet, URL: "http://www.qianlima.com/zb/detail/20200705_185702517.html"}, // GBK
+		{Key: VisitorDetail, Method: http.MethodGet, URL: "http://www.qianlima.com/zb/detail/20200703_185597771.html"}, // GBK
+	}
+	for _, target := range targets {
+		spiders.Run(target,
+			NewVisitorDetailSpider(),
+		)
+	}
+}