123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- package qianlima
- import (
- "errors"
- "fmt"
- "git.aionnect.com/aionnect/go-common/utils"
- "git.aionnect.com/aionnect/go-common/utils/date"
- "git.aionnect.com/aionnect/go-common/utils/jsonutil"
- "git.aionnect.com/hello-go/spider/common"
- "git.aionnect.com/hello-go/spider/spiders"
- "git.aionnect.com/hello-go/spider/spiders/qianlima/items"
- "net/http"
- "regexp"
- "time"
- )
/* ------------------------------------------------------- Constants ---------------------------------------------------- */

// Full query string as captured from the site, kept for reference:
// "http://search.qianlima.com/api/v1/website/search?filtermode=1&timeType=101&areas=&types=-1&searchMode=0&keywords=led&beginTime=&endTime=&isfirst=true&currentPage=1&numPerPage=20"

const (
	// TargetUrl is the fmt template of the listing search endpoint; its single
	// %d verb receives the page number (currentPage).
	TargetUrl = "http://search.qianlima.com/api/v1/website/search?keywords=led&currentPage=%d&numPerPage=50"

	// RefererUrl is the value placed in Target.Referer for paging requests.
	RefererUrl = "http://search.qianlima.com/?q=led"

	// VisitorPaging is the name of the listing-page spider and the Key of the
	// targets it schedules for further listing pages.
	VisitorPaging = "VisitorPaging"
)
- /* -------------------------------------------------------全局变量---------------------------------------------------- */
- var (
- visitorPagingPromise = utils.NewRequest(). // 请求对象
- SetHeader("Accept", "*/*").
- SetHeader("Accept-Encoding", "gzip, deflate").
- SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
- SetHeader("Connection", "keep-alive").
- SetHeader("Content-Length", "0").
- SetHeader("Content-Type", "application/x-www-form-unlencoded").
- SetHeader("Host", "search.qianlima.com").
- SetHeader("Origin", "http://search.qianlima.com").
- SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
- spacePattern, _ = regexp.Compile(`\n\s*`) // 去空格正则
- )
/* ------------------------------------------------------- Spider implementation ---------------------------------------------------- */

// VisitorPagingSpider is the spider for the qianlima visitor listing pages.
// It holds no state of its own.
type VisitorPagingSpider struct{}
- // 返回当前爬虫类相应爬虫调度类对象实例
- func NewVisitorPagingSpider() spiders.ISpider {
- spider := &VisitorPagingSpider{}
- return spider
- }
- // 获取请求对象
- func (s *VisitorPagingSpider) GetPromise() *utils.RequestPromise {
- return visitorPagingPromise
- }
- // 爬虫命名
- func (s *VisitorPagingSpider) Name() string {
- return VisitorPaging
- }
- // 响应解析
- func (s *VisitorPagingSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
- params := item.(*items.PagingParams)
- //if params.PagingNo == 2 { // 只爬一页,仅测试时使用
- // return nil, nil
- //}
- // 数据解析
- var res items.Result
- err := jsonutil.Unmarshal(body, &res)
- if nil != err { // 无法解析的响应,异常终止
- return nil, errors.New(fmt.Sprint("decode paging response failed", err.Error()))
- }
- if nil == &res || nil == res.Data || res.Status != 200 || // 错误响应,异常终止
- res.Data.RowCount <= 0 || res.Data.PagesCount <= 0 ||
- nil == res.Data.Data || len(res.Data.Data) <= 0 {
- return nil, errors.New(fmt.Sprint("invalid paging response", string(body)))
- }
- // 当前分页列表,将项目循环加入详情页待爬队列(暂为顺序轮询)
- var targets []*common.Target
- limitDate := date.Today().AddDays(params.DaysLimit) // 千里马免费项目列表七天前存在大量重复无效数据
- flag := false
- for i := 0; i < len(res.Data.Data); i++ {
- item := res.Data.Data[i]
- if item.UpdateTime.After(limitDate) { // 爬到七天前的数据即置flag为否,不再爬后续分页,也可按照实际业务要求更改此逻辑
- flag = true
- break
- }
- target := &common.Target{
- Key: VisitorDetail,
- Method: http.MethodGet,
- URL: item.URL,
- Item: item, // 这个对象会传递给下一层的Parse方法
- }
- targets = append(targets, target)
- }
- // 判断是否终止分页轮询
- if flag { // 超限,终止
- return targets, nil
- }
- if res.Data.PagesCount <= params.PagingNo { // 尾页,终止
- return targets, nil
- }
- time.Sleep(common.RandSeconds()) // 随机暂停几秒
- // 轮询列表页下一页(暂为顺序轮询)
- params.PagingNo++
- url := fmt.Sprintf(TargetUrl, params.PagingNo)
- target := &common.Target{
- Key: VisitorPaging,
- Method: http.MethodPost,
- URL: url,
- Referer: RefererUrl,
- Item: params,
- }
- targets = append(targets, target)
- return targets, nil
- }
|