123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- package qianlima
- import (
- "bytes"
- "errors"
- "fmt"
- "git.aionnect.com/aionnect/go-common/utils"
- "git.aionnect.com/aionnect/go-common/utils/date"
- "git.aionnect.com/hello-go/spider/common"
- "git.aionnect.com/hello-go/spider/dao"
- "git.aionnect.com/hello-go/spider/spiders"
- "git.aionnect.com/hello-go/spider/spiders/qianlima/items"
- "github.com/PuerkitoBio/goquery"
- "net/http"
- "strconv"
- "strings"
- "time"
- "xorm.io/xorm"
- )
/* ------------------------------------------------------- Constants ---------------------------------------------------- */

const (
	// VisitorDetail is the spider name returned by (*VisitorDetailSpider).Name.
	VisitorDetail = "VisitorDetail"
)
- /* -------------------------------------------------------全局变量---------------------------------------------------- */
- var (
- visitorDetailPromise = utils.NewRequest(). // 详情页请求
- SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
- SetHeader("Accept-Encoding", "gzip, deflate").
- SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
- SetHeader("Cache-Control", "max-age=0").
- SetHeader("Connection", "keep-alive").
- SetHeader("Host", "www.qianlima.com").
- SetHeader("Upgrade-Insecure-Requests", "1").
- SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
- )
- /* -------------------------------------------------------爬虫实现---------------------------------------------------- */
- // 千里马项目网访客详情页爬虫类
- type VisitorDetailSpider struct {
- db *xorm.Engine // 数据库访问对象
- resultMap *common.ConcurrentMap // 数据暂存价值对
- ticker *time.Ticker // 显示进度的定时器
- }
- // 返回当前爬虫类相应爬虫调度类对象实例
- func NewVisitorDetailSpider() spiders.ISpider {
- spider := &VisitorDetailSpider{
- db: dao.DB("spider"),
- resultMap: common.NewConcurrentMap(),
- ticker: time.NewTicker(5 * time.Second),
- }
- // 进度打印(非核心代码)
- go func(ticker *time.Ticker) {
- for {
- <-ticker.C
- println("---------------", spider.resultMap.Len())
- }
- }(spider.ticker)
- return spider
- }
- // 获取请求对象
- func (s *VisitorDetailSpider) GetPromise() *utils.RequestPromise {
- return visitorDetailPromise
- }
- // 爬虫命名
- func (s *VisitorDetailSpider) Name() string {
- return VisitorDetail
- }
- // 响应解析
- func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
- var content *items.Content
- if nil == item {
- content = &items.Content{}
- } else {
- var ok bool
- content, ok = item.(*items.Content)
- if !ok {
- return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
- }
- }
- // 数据解析
- dom, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
- status := dom.Find(".wenshang .zhuangtai").First().Text()
- content.Status = strings.TrimLeft(strings.TrimSpace(status), "状态:")
- dom.Find(".wenshang .site a").Each(func(i int, selection *goquery.Selection) {
- text := strings.TrimSpace(strings.Trim(strings.TrimSpace(selection.Text()), "-"))
- if i == 0 {
- content.Province = text
- } else if i == 1 {
- content.City = text
- }
- })
- wen, _ := dom.Find("#wen").First().Html()
- wen = strings.Replace(wen, "<input type=\"hidden\" id=\"zbunit\" value=\"\"/>", "", -1)
- wen = strings.Replace(wen, "<input type=\"hidden\" id=\"zburl\" value=\"\"/>", "", -1)
- wen = spacePattern.ReplaceAllString(wen, "")
- content.Text = wen
- // 数据解析后处理逻辑
- s.resultMap.Set(strconv.Itoa(content.ContentId), nil) // 暂存结果到字典中(暂时无用,仅做进度打印,非核心代码)
- if content.ContentId == 0 { // 通常仅当测试未传递Item参数时会出现此情况
- fmt.Printf("empty content id %+v\n", content)
- return nil, nil
- }
- s.exportToDB(content) // 逐条储存结果到数据库
- return nil, nil
- }
- // 爬虫执行结束后需进行的处理
- func (s *VisitorDetailSpider) AfterExit() {
- s.ticker.Stop()
- println("@---------------", s.resultMap.Len()) // 详情页爬虫执行结束时再打印一下进度(非核心代码)
- }
- /* -------------------------------------------------------数据保存---------------------------------------------------- */
- func (s *VisitorDetailSpider) exportToDB(content *items.Content) {
- if nil == content {
- return
- }
- if content.ID == 0 {
- content.ID = utils.NextId()
- }
- content.CreatedAt = date.Now()
- _, err := s.db.InsertOne(content)
- if nil != err {
- fmt.Printf("export to db failed %s %+v\n", err.Error(), content)
- }
- }
|