package qianlima

import (
	"fmt"
	"net/http"
	"regexp"
	"time"

	"git.aionnect.com/aionnect/go-common/utils"
	"git.aionnect.com/aionnect/go-common/utils/date"
	"git.aionnect.com/aionnect/go-common/utils/jsonutil"
	"git.aionnect.com/hello-go/spider/common"
	"git.aionnect.com/hello-go/spider/spiders"
	"git.aionnect.com/hello-go/spider/spiders/qianlima/items"
)

/* ------------------------------------------------------- Constants ---------------------------------------------------- */

// Full search URL with all query parameters, kept for reference:
//const TargetUrl = "http://search.qianlima.com/api/v1/website/search?filtermode=1&timeType=101&areas=&types=-1&searchMode=0&keywords=led&beginTime=&endTime=&isfirst=true&currentPage=1&numPerPage=20"

const TargetUrl = "http://search.qianlima.com/api/v1/website/search?keywords=led&currentPage=%d&numPerPage=50"
const RefererUrl = "http://search.qianlima.com/?q=led"
const VisitorPaging = "VisitorPaging"

/* ------------------------------------------------------- Global variables ---------------------------------------------------- */

var (
	// Shared request object carrying the headers the target site expects
	visitorPagingPromise = utils.NewRequest().
		SetHeader("Accept", "*/*").
		SetHeader("Accept-Encoding", "gzip, deflate").
		SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
		SetHeader("Connection", "keep-alive").
		SetHeader("Content-Length", "0").
		SetHeader("Content-Type", "application/x-www-form-urlencoded").
		SetHeader("Host", "search.qianlima.com").
		SetHeader("Origin", "http://search.qianlima.com").
		SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")

	// Regex for stripping newlines and any whitespace that follows them
	spacePattern = regexp.MustCompile(`\n\s*`)
)

/* ------------------------------------------------------- Spider implementation ---------------------------------------------------- */

// VisitorPagingSpider crawls the visitor-facing project listing pages of the Qianlima site
type VisitorPagingSpider struct{}

// NewVisitorPagingSpider returns a scheduler instance for this spider
func NewVisitorPagingSpider() spiders.ISpider {
	spider := &VisitorPagingSpider{}
	return spider
}

// GetPromise returns the shared request object
func (s *VisitorPagingSpider) GetPromise() *utils.RequestPromise {
	return visitorPagingPromise
}

// Name returns the spider's name
func (s *VisitorPagingSpider) Name() string {
	return VisitorPaging
}

// Parse decodes a listing-page response and returns the follow-up crawl targets
func (s *VisitorPagingSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
	params := item.(*items.PagingParams)
	//if params.PagingNo == 2 { // Crawl a single page only; for testing
	//	return nil, nil
	//}

	// Decode the response
	var res items.Result
	err := jsonutil.Unmarshal(body, &res)
	if nil != err { // Undecodable response: abort
		return nil, fmt.Errorf("decode paging response failed: %w", err)
	}
	if nil == res.Data || res.Status != 200 || // Error response: abort
		res.Data.RowCount <= 0 || res.Data.PagesCount <= 0 ||
		len(res.Data.Data) == 0 {
		return nil, fmt.Errorf("invalid paging response: %s", string(body))
	}

	// Queue each project on the current listing page for the detail spider (sequential polling for now).
	// Qianlima's free listings contain large amounts of duplicate, invalid data older than about seven
	// days, so limitDate acts as a cutoff; DaysLimit is expected to be negative (e.g. -7)
	var targets []*common.Target
	limitDate := date.Today().AddDays(params.DaysLimit)
	flag := false
	for i := 0; i < len(res.Data.Data); i++ {
		item := res.Data.Data[i]
		if item.UpdateTime.Before(limitDate) {
			// Reached data older than the cutoff: set the flag and stop paging.
			// Adjust this logic to match actual business requirements.
			flag = true
			break
		}
		target := &common.Target{
			Key:    VisitorDetail,
			Method: http.MethodGet,
			URL:    item.URL,
			Item:   item, // Passed on to the next layer's Parse method
		}
		targets = append(targets, target)
	}

	// Decide whether to stop paging
	if flag { // Past the cutoff: stop
		return targets, nil
	}
	if res.Data.PagesCount <= params.PagingNo { // Last page: stop
		return targets, nil
	}

	time.Sleep(common.RandSeconds()) // Pause for a few random seconds

	// Queue the next listing page (sequential polling for now)
	params.PagingNo++
	url := fmt.Sprintf(TargetUrl, params.PagingNo)
	target := &common.Target{
		Key:     VisitorPaging,
		Method:  http.MethodPost,
		URL:     url,
		Referer: RefererUrl,
		Item:    params,
	}
	targets = append(targets, target)

	return targets, nil
}
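
// Usage sketch (not part of the original file; an illustrative assumption):
// seeding the crawl with the first listing page. The common.Target and
// items.PagingParams field names follow their use in Parse above; the initial
// parameter values and the hand-off to a scheduler are assumptions.
//
//	params := &items.PagingParams{PagingNo: 1, DaysLimit: -7}
//	seed := &common.Target{
//		Key:     VisitorPaging,
//		Method:  http.MethodPost,
//		URL:     fmt.Sprintf(TargetUrl, params.PagingNo),
//		Referer: RefererUrl,
//		Item:    params,
//	}
//	// hand `seed` to the spider scheduler, which will route the response
//	// back to VisitorPagingSpider.Parse via the VisitorPaging key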