package qianlima

import (
	"bytes"
	"errors"
	"fmt"
	"net/http"
	"strconv"
	"strings"
	"sync"
	"time"

	"git.aionnect.com/aionnect/go-common/utils"
	"git.aionnect.com/aionnect/go-common/utils/date"
	"git.aionnect.com/hello-go/spider/common"
	"git.aionnect.com/hello-go/spider/dao"
	"git.aionnect.com/hello-go/spider/spiders"
	"git.aionnect.com/hello-go/spider/spiders/qianlima/items"
	"github.com/PuerkitoBio/goquery"
	"xorm.io/xorm"
)

/* ------------------------------------------------------- Constants ---------------------------------------------------- */

// VisitorDetail is the name reported by VisitorDetailSpider.Name.
const VisitorDetail = "VisitorDetail"

/* ------------------------------------------------------- Package-level variables ---------------------------------------------------- */

var (
	// visitorDetailPromise is the request object used for detail-page requests;
	// it is returned as-is by GetPromise.
	visitorDetailPromise = utils.NewRequest().
		SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
		SetHeader("Accept-Encoding", "gzip, deflate").
		SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
		SetHeader("Cache-Control", "max-age=0").
		SetHeader("Connection", "keep-alive").
		SetHeader("Host", "www.qianlima.com").
		SetHeader("Upgrade-Insecure-Requests", "1").
		SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
)

/* ------------------------------------------------------- Spider implementation ---------------------------------------------------- */

// VisitorDetailSpider is the spider for qianlima visitor detail pages.
type VisitorDetailSpider struct {
	db        *xorm.Engine          // database access object
	resultMap *common.ConcurrentMap // temporary key/value store; only used for progress reporting
	ticker    *time.Ticker          // drives the periodic progress print
	done      chan struct{}         // closed by AfterExit to stop the progress goroutine
	stopOnce  sync.Once             // guards close(done) so AfterExit may be called more than once
}

// NewVisitorDetailSpider builds a VisitorDetailSpider and starts a background
// goroutine that prints the number of stored results every five seconds
// (non-essential progress output). The goroutine runs until AfterExit is called.
func NewVisitorDetailSpider() spiders.ISpider {
	spider := &VisitorDetailSpider{
		db:        dao.DB("spider"),
		resultMap: common.NewConcurrentMap(),
		ticker:    time.NewTicker(5 * time.Second),
		done:      make(chan struct{}),
	}
	go spider.reportProgress()
	return spider
}

// reportProgress prints the current result count on every tick and returns
// once done is closed. Ticker.Stop does not close the ticker channel, so the
// done channel is the only way for this loop to terminate.
func (s *VisitorDetailSpider) reportProgress() {
	for {
		select {
		case <-s.ticker.C:
			println("---------------", s.resultMap.Len())
		case <-s.done:
			return
		}
	}
}

// GetPromise returns the request object used to fetch detail pages.
func (s *VisitorDetailSpider) GetPromise() *utils.RequestPromise {
	return visitorDetailPromise
}

// Name returns the spider's name.
func (s *VisitorDetailSpider) Name() string {
	return VisitorDetail
}

// Parse extracts status, province, city and body text from a detail page and
// stores the resulting record in the database.
//
// item must be nil or a non-nil *items.Content; when it is nil a fresh
// items.Content is used. body is the raw HTML of the page. response is not
// used. Parse never produces follow-up targets, so the first return value is
// always nil. An error is returned when item has an unexpected type or body
// cannot be parsed as HTML.
func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
	content := &items.Content{}
	if item != nil {
		typed, ok := item.(*items.Content)
		// A typed nil pointer passes the assertion but would panic on the
		// first field write below, so it is rejected as well.
		if !ok || typed == nil {
			return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
		}
		content = typed
	}

	// Parse the document. On failure dom would be nil and every Find below
	// would panic, so the error is surfaced to the caller instead.
	dom, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, fmt.Errorf("parsing detail page: %w", err)
	}

	// Strip the literal label in front of the status value. TrimPrefix removes
	// exactly that label; TrimLeft would treat it as a character set and could
	// also eat leading characters of the value itself.
	status := dom.Find(".wenshang .zhuangtai").First().Text()
	content.Status = strings.TrimPrefix(strings.TrimSpace(status), "状态:")

	// The first link is stored as the province and the second as the city;
	// any further links are ignored.
	dom.Find(".wenshang .site a").Each(func(i int, selection *goquery.Selection) {
		text := strings.TrimSpace(strings.Trim(strings.TrimSpace(selection.Text()), "-"))
		switch i {
		case 0:
			content.Province = text
		case 1:
			content.City = text
		}
	})

	// The rendering error is deliberately ignored (best effort): on failure
	// the text is simply stored empty.
	wen, _ := dom.Find("#wen").First().Html()
	// NOTE(review): with empty old/new arguments these two replacements are
	// no-ops. The original literals may have been lost from this file —
	// confirm against version control before relying on or removing them.
	wen = strings.Replace(wen, "", "", -1)
	wen = strings.Replace(wen, "", "", -1)
	wen = spacePattern.ReplaceAllString(wen, "")
	content.Text = wen

	// Post-processing: remember the id in the map (currently only used for
	// the progress print, non-essential).
	s.resultMap.Set(strconv.Itoa(content.ContentId), nil)
	if content.ContentId == 0 {
		// Usually only happens in tests that do not pass an item.
		fmt.Printf("empty content id %+v\n", content)
		return nil, nil
	}

	s.exportToDB(content) // persist each record individually
	return nil, nil
}

// AfterExit performs cleanup once the spider has finished: it stops the
// ticker, terminates the progress goroutine and prints the final result count
// (non-essential progress output). It is safe to call more than once.
func (s *VisitorDetailSpider) AfterExit() {
	s.ticker.Stop()
	s.stopOnce.Do(func() { close(s.done) })
	println("@---------------", s.resultMap.Len())
}

/* ------------------------------------------------------- Persistence ---------------------------------------------------- */

// exportToDB inserts content into the database. A nil content is ignored.
// An ID is generated when content.ID is zero, and CreatedAt is always set to
// the current time. Insert failures are printed and not returned.
func (s *VisitorDetailSpider) exportToDB(content *items.Content) {
	if content == nil {
		return
	}
	if content.ID == 0 {
		content.ID = utils.NextId()
	}
	content.CreatedAt = date.Now()
	if _, err := s.db.InsertOne(content); err != nil {
		fmt.Printf("export to db failed %s %+v\n", err.Error(), content)
	}
}