|
@@ -24,14 +24,14 @@ const VisitorDetail = "VisitorDetail"
|
|
|
/* -------------------------------------------------------全局变量---------------------------------------------------- */
|
|
|
var (
|
|
|
visitorDetailPromise = utils.NewRequest(). // 详情页请求
|
|
|
- SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
|
|
|
- SetHeader("Accept-Encoding", "gzip, deflate").
|
|
|
- SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
|
|
|
- SetHeader("Cache-Control", "max-age=0").
|
|
|
- SetHeader("Connection", "keep-alive").
|
|
|
- SetHeader("Host", "www.qianlima.com").
|
|
|
- SetHeader("Upgrade-Insecure-Requests", "1").
|
|
|
- SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
|
|
|
+ SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
|
|
|
+ SetHeader("Accept-Encoding", "gzip, deflate").
|
|
|
+ SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
|
|
|
+ SetHeader("Cache-Control", "max-age=0").
|
|
|
+ SetHeader("Connection", "keep-alive").
|
|
|
+ SetHeader("Host", "www.qianlima.com").
|
|
|
+ SetHeader("Upgrade-Insecure-Requests", "1").
|
|
|
+ SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
|
|
|
)
|
|
|
|
|
|
/* -------------------------------------------------------爬虫实现---------------------------------------------------- */
|
|
@@ -73,10 +73,16 @@ func (s *VisitorDetailSpider) Name() string {
|
|
|
}
|
|
|
|
|
|
// 响应解析
|
|
|
-func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, headers http.Header) ([]*common.Target, error) {
|
|
|
- content, ok := item.(*items.Content)
|
|
|
- if !ok {
|
|
|
- return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
|
|
|
+func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
|
|
|
+ var content *items.Content
|
|
|
+ if nil == item {
|
|
|
+ content = &items.Content{}
|
|
|
+ } else {
|
|
|
+ var ok bool
|
|
|
+ content, ok = item.(*items.Content)
|
|
|
+ if !ok {
|
|
|
+ return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
// 数据解析
|
|
@@ -96,9 +102,14 @@ func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, headers http.
|
|
|
wen = strings.Replace(wen, "<input type=\"hidden\" id=\"zburl\" value=\"\"/>", "", -1)
|
|
|
wen = spacePattern.ReplaceAllString(wen, "")
|
|
|
content.Text = wen
|
|
|
+
|
|
|
// 数据解析后处理逻辑
|
|
|
s.resultMap.Set(strconv.Itoa(content.ContentId), nil) // 暂存结果到字典中(暂时无用,仅做进度打印,非核心代码)
|
|
|
- s.exportToDB(content) // 逐条储存结果到数据库
|
|
|
+ if content.ContentId == 0 { // 通常仅当测试未传递Item参数时会出现此情况
|
|
|
+ fmt.Printf("empty content id %+v\n", content)
|
|
|
+ return nil, nil
|
|
|
+ }
|
|
|
+ s.exportToDB(content) // 逐条储存结果到数据库
|
|
|
return nil, nil
|
|
|
}
|
|
|
|