visitor_paging.go 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. package qianlima
  2. import (
  3. "errors"
  4. "fmt"
  5. "git.aionnect.com/aionnect/go-common/utils"
  6. "git.aionnect.com/aionnect/go-common/utils/date"
  7. "git.aionnect.com/aionnect/go-common/utils/jsonutil"
  8. "git.aionnect.com/hello-go/spider/common"
  9. "git.aionnect.com/hello-go/spider/spiders"
  10. "git.aionnect.com/hello-go/spider/spiders/qianlima/items"
  11. "net/http"
  12. "regexp"
  13. "time"
  14. )
  15. /* -------------------------------------------------------常量定义---------------------------------------------------- */
  16. //const TargetUrl = "http://search.qianlima.com/api/v1/website/search?filtermode=1&timeType=101&areas=&types=-1&searchMode=0&keywords=led&beginTime=&endTime=&isfirst=true&currentPage=1&numPerPage=20"
  17. const TargetUrl = "http://search.qianlima.com/api/v1/website/search?keywords=led&currentPage=%d&numPerPage=50"
  18. const RefererUrl = "http://search.qianlima.com/?q=led"
  19. const VisitorPaging = "VisitorPaging"
  20. /* -------------------------------------------------------全局变量---------------------------------------------------- */
  21. var (
  22. visitorPagingPromise = utils.NewRequest(). // 请求对象
  23. SetHeader("Accept", "*/*").
  24. SetHeader("Accept-Encoding", "gzip, deflate").
  25. SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
  26. SetHeader("Connection", "keep-alive").
  27. SetHeader("Content-Length", "0").
  28. SetHeader("Content-Type", "application/x-www-form-unlencoded").
  29. SetHeader("Host", "search.qianlima.com").
  30. SetHeader("Origin", "http://search.qianlima.com").
  31. SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
  32. spacePattern, _ = regexp.Compile(`\n\s*`) // 去空格正则
  33. )
  34. /* -------------------------------------------------------爬虫实现---------------------------------------------------- */
  35. // 千里马项目网访客列表页爬虫类
  36. type VisitorPagingSpider struct{}
  37. // 返回当前爬虫类相应爬虫调度类对象实例
  38. func NewVisitorPagingSpider() spiders.ISpider {
  39. spider := &VisitorPagingSpider{}
  40. return spider
  41. }
  42. // 获取请求对象
  43. func (s *VisitorPagingSpider) GetPromise() *utils.RequestPromise {
  44. return visitorPagingPromise
  45. }
  46. // 爬虫命名
  47. func (s *VisitorPagingSpider) Name() string {
  48. return VisitorPaging
  49. }
  50. // 响应解析
  51. func (s *VisitorPagingSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
  52. params := item.(*items.PagingParams)
  53. //if params.PagingNo == 2 { // 只爬一页,仅测试时使用
  54. // return nil, nil
  55. //}
  56. // 数据解析
  57. var res items.Result
  58. err := jsonutil.Unmarshal(body, &res)
  59. if nil != err { // 无法解析的响应,异常终止
  60. return nil, errors.New(fmt.Sprint("decode paging response failed", err.Error()))
  61. }
  62. if nil == &res || nil == res.Data || res.Status != 200 || // 错误响应,异常终止
  63. res.Data.RowCount <= 0 || res.Data.PagesCount <= 0 ||
  64. nil == res.Data.Data || len(res.Data.Data) <= 0 {
  65. return nil, errors.New(fmt.Sprint("invalid paging response", string(body)))
  66. }
  67. // 当前分页列表,将项目循环加入详情页待爬队列(暂为顺序轮询)
  68. var targets []*common.Target
  69. limitDate := date.Today().AddDays(params.DaysLimit) // 千里马免费项目列表七天前存在大量重复无效数据
  70. flag := false
  71. for i := 0; i < len(res.Data.Data); i++ {
  72. item := res.Data.Data[i]
  73. if item.UpdateTime.After(limitDate) { // 爬到七天前的数据即置flag为否,不再爬后续分页,也可按照实际业务要求更改此逻辑
  74. flag = true
  75. break
  76. }
  77. target := &common.Target{
  78. Key: VisitorDetail,
  79. Method: http.MethodGet,
  80. URL: item.URL,
  81. Item: item, // 这个对象会传递给下一层的Parse方法
  82. }
  83. targets = append(targets, target)
  84. }
  85. // 判断是否终止分页轮询
  86. if flag { // 超限,终止
  87. return targets, nil
  88. }
  89. if res.Data.PagesCount <= params.PagingNo { // 尾页,终止
  90. return targets, nil
  91. }
  92. time.Sleep(common.RandSeconds()) // 随机暂停几秒
  93. // 轮询列表页下一页(暂为顺序轮询)
  94. params.PagingNo++
  95. url := fmt.Sprintf(TargetUrl, params.PagingNo)
  96. target := &common.Target{
  97. Key: VisitorPaging,
  98. Method: http.MethodPost,
  99. URL: url,
  100. Referer: RefererUrl,
  101. Item: params,
  102. }
  103. targets = append(targets, target)
  104. return targets, nil
  105. }