package qianlima
import (
"bytes"
"errors"
"fmt"
"git.aionnect.com/aionnect/go-common/utils"
"git.aionnect.com/aionnect/go-common/utils/date"
"git.aionnect.com/hello-go/spider/common"
"git.aionnect.com/hello-go/spider/dao"
"git.aionnect.com/hello-go/spider/spiders"
"git.aionnect.com/hello-go/spider/spiders/qianlima/items"
"github.com/PuerkitoBio/goquery"
"net/http"
"strconv"
"strings"
"time"
"xorm.io/xorm"
)
/* -------------------------------------------------------常量定义---------------------------------------------------- */
// VisitorDetail is the name reported by VisitorDetailSpider.Name.
const VisitorDetail = "VisitorDetail"
/* -------------------------------------------------------全局变量---------------------------------------------------- */
var (
// visitorDetailPromise is the package-level request object handed out by
// VisitorDetailSpider.GetPromise. Its headers imitate a desktop Chrome
// browser talking to www.qianlima.com.
visitorDetailPromise = utils.NewRequest(). // detail-page request
SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
SetHeader("Accept-Encoding", "gzip, deflate").
SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
SetHeader("Cache-Control", "max-age=0").
SetHeader("Connection", "keep-alive").
SetHeader("Host", "www.qianlima.com").
SetHeader("Upgrade-Insecure-Requests", "1").
SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
)
/* -------------------------------------------------------爬虫实现---------------------------------------------------- */
// VisitorDetailSpider is the spider for visitor detail pages of the Qianlima
// project site.
type VisitorDetailSpider struct {
db *xorm.Engine // database handle used to persist parsed content
resultMap *common.ConcurrentMap // key-value store of processed items; only its size is read, for progress output
ticker *time.Ticker // timer that drives the periodic progress printout
}
// NewVisitorDetailSpider builds a VisitorDetailSpider bound to the "spider"
// database and returns it as a spiders.ISpider. It also starts a background
// goroutine that prints the number of recorded items every five seconds
// (progress output only, not core logic).
//
// NOTE(review): the goroutine only ever blocks on the ticker channel.
// time.Ticker.Stop does not close that channel, so after AfterExit the
// goroutine stays parked instead of returning.
func NewVisitorDetailSpider() spiders.ISpider {
	db := dao.DB("spider")
	results := common.NewConcurrentMap()
	progress := time.NewTicker(5 * time.Second)

	spider := &VisitorDetailSpider{db: db, resultMap: results, ticker: progress}

	// Progress printout: one line per tick.
	go func() {
		for range progress.C {
			println("---------------", spider.resultMap.Len())
		}
	}()

	return spider
}
// GetPromise returns the package-level request object used for detail pages.
func (s *VisitorDetailSpider) GetPromise() *utils.RequestPromise {
return visitorDetailPromise
}
// Name returns the spider's name, the VisitorDetail constant.
func (s *VisitorDetailSpider) Name() string {
return VisitorDetail
}
// Parse extracts the status, province, city and body HTML from a detail page,
// writes them into the item and stores the item in the database.
//
// Parameters:
//   - item: the *items.Content to fill in. A nil item, or an interface holding
//     a nil *items.Content, makes Parse start from an empty Content.
//   - body: raw HTML of the detail page.
//   - response: the HTTP response; not used by this method.
//
// Returns: the target slice is always nil. The error is non-nil when item has
// an unexpected type or when the HTML cannot be parsed.
func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
	var content *items.Content
	if item != nil {
		typed, ok := item.(*items.Content)
		if !ok {
			return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
		}
		content = typed
	}
	if content == nil {
		// Covers both a nil item and an interface wrapping a nil *items.Content,
		// which would otherwise panic on the first field assignment below.
		content = &items.Content{}
	}

	// A nil document would panic on the first Find, so a parse failure is
	// reported to the caller instead of being discarded.
	dom, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, fmt.Errorf("parse detail page: %w", err)
	}

	status := dom.Find(".wenshang .zhuangtai").First().Text()
	// NOTE(review): TrimLeft treats its second argument as a set of characters,
	// not a prefix, so any leading run of '状', '态' or ':' is stripped. Kept
	// as-is; confirm against real pages before switching to TrimPrefix.
	content.Status = strings.TrimLeft(strings.TrimSpace(status), "状态:")

	// The first link under .site is the province, the second the city.
	dom.Find(".wenshang .site a").Each(func(i int, selection *goquery.Selection) {
		text := strings.TrimSpace(strings.Trim(strings.TrimSpace(selection.Text()), "-"))
		switch i {
		case 0:
			content.Province = text
		case 1:
			content.City = text
		}
	})

	// The Html error is deliberately ignored, as before: whatever string comes
	// back is cleaned up and stored.
	wen, _ := dom.Find("#wen").First().Html()
	// NOTE(review): as written, both calls replace the empty string with itself
	// and leave wen unchanged. Presumably the search strings once held
	// invisible characters — confirm against the original file.
	wen = strings.Replace(wen, "", "", -1)
	wen = strings.Replace(wen, "", "", -1)
	wen = spacePattern.ReplaceAllString(wen, "")
	content.Text = wen

	// Record the item key; the map is only used for progress output.
	s.resultMap.Set(strconv.Itoa(content.ContentId), nil)
	if content.ContentId == 0 {
		// Usually only happens in tests that pass no item.
		fmt.Printf("empty content id %+v\n", content)
		return nil, nil
	}

	s.exportToDB(content) // persist each record individually
	return nil, nil
}
// AfterExit is the hook for work to be done after the spider has finished
// running: it stops the progress ticker and prints the final number of
// recorded items (progress output only, not core logic).
func (s *VisitorDetailSpider) AfterExit() {
	s.ticker.Stop()

	processed := s.resultMap.Len()
	println("@---------------", processed)
}
/* -------------------------------------------------------数据保存---------------------------------------------------- */
// exportToDB inserts a single content record into the database.
//
// A nil content is ignored. When content.ID is zero a new ID is taken from
// utils.NextId, and CreatedAt is always overwritten with date.Now(). An insert
// failure is printed to stdout and not returned to the caller.
func (s *VisitorDetailSpider) exportToDB(content *items.Content) {
	if content == nil {
		return
	}

	if content.ID == 0 {
		content.ID = utils.NextId()
	}
	content.CreatedAt = date.Now()

	if _, err := s.db.InsertOne(content); err != nil {
		fmt.Printf("export to db failed %s %+v\n", err.Error(), content)
	}
}