free_paging.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. package kuaidaili
  2. import (
  3. "bytes"
  4. "errors"
  5. "fmt"
  6. "git.aionnect.com/aionnect/go-common/utils/date"
  7. "git.aionnect.com/hello-go/spider/common"
  8. "git.aionnect.com/hello-go/spider/dao"
  9. "git.aionnect.com/hello-go/spider/spiders"
  10. "github.com/PuerkitoBio/goquery"
  11. "net/http"
  12. "strconv"
  13. "strings"
  14. )
  15. /* -------------------------------------------------------常量定义---------------------------------------------------- */
  16. const TargetUrl = "https://www.kuaidaili.com/free/inha/%d/"
  17. const KuaiDaiLi = "KuaiDaiLi"
  18. /* -------------------------------------------------------爬虫实现---------------------------------------------------- */
  19. // 千里马项目网访客详情页爬虫类
  20. type FreePagingSpider struct {
  21. proxyDao *dao.ProxyDao // 代理信息数据访问对象
  22. }
  23. // 返回当前爬虫类相应爬虫调度类对象实例
  24. func NewFreePagingSpider() spiders.ISpider {
  25. spider := &FreePagingSpider{
  26. proxyDao: dao.NewProxyDao(),
  27. }
  28. return spider
  29. }
  30. // 爬虫命名
  31. func (s *FreePagingSpider) Name() string {
  32. return KuaiDaiLi
  33. }
  34. // 响应解析
  35. func (s *FreePagingSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
  36. params := item.(*common.ProxyPagingParams)
  37. if params.PagingNo == 2 { // 只爬一页,仅测试时使用
  38. return nil, nil
  39. }
  40. // 数据解析
  41. dom, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
  42. pageNo := dom.Find("#listnav li a").Last().Text()
  43. totalPage, err := strconv.Atoi(pageNo)
  44. if nil != err { // 获取不到尾页信息,异常终止
  45. return nil, errors.New(fmt.Sprintf("wrong last page bumber %s", err.Error()))
  46. }
  47. flag := true
  48. var proxies []*common.ProxyInfo
  49. dom.Find(".table-bordered tbody tr").Each(func(i int, row *goquery.Selection) {
  50. flag = false // 有数据
  51. proxy := &common.ProxyInfo{}
  52. row.Find("td").Each(func(j int, cell *goquery.Selection) {
  53. switch j {
  54. case 0:
  55. proxy.IP = common.ConvertToIP(cell.Text())
  56. case 1:
  57. proxy.Port, _ = strconv.Atoi(strings.TrimSpace(cell.Text()))
  58. case 2:
  59. proxy.Anonymity = strings.TrimSpace(cell.Text())
  60. case 3:
  61. proxy.Type = strings.ToUpper(strings.TrimSpace(cell.Text()))
  62. case 4:
  63. proxy.Location = strings.TrimSpace(cell.Text())
  64. case 5:
  65. proxy.Speed = common.ConvertToSpeed(cell.Text())
  66. case 6:
  67. proxy.UpdateTime = date.ParseDatetime(cell.Text())
  68. }
  69. })
  70. if proxy.IP != "" && proxy.Port >= 0 && proxy.Anonymity == common.High && (proxy.Type == common.HTTP || proxy.Type == common.HTTPS) {
  71. if proxy.Port == 0 {
  72. if proxy.Type == common.HTTP {
  73. proxy.Port = 80
  74. } else {
  75. proxy.Port = 443
  76. }
  77. }
  78. proxies = append(proxies, proxy)
  79. }
  80. })
  81. // 保存
  82. s.proxyDao.Save(proxies)
  83. // 判断是否终止分页轮询
  84. if flag { // 当前分页无数据,异常终止
  85. return nil, errors.New(fmt.Sprintf("paging has no rows %d", params.PagingNo))
  86. }
  87. params.Count += len(proxies)
  88. if params.Count > params.Limit { // 超限,终止
  89. return nil, nil
  90. }
  91. if totalPage <= params.PagingNo { // 尾页,终止
  92. return nil, nil
  93. }
  94. // 轮询列表页下一页(暂为顺序轮询)
  95. params.PagingNo++
  96. url := fmt.Sprintf(TargetUrl, params.PagingNo)
  97. target := &common.Target{
  98. Key: KuaiDaiLi,
  99. Method: http.MethodGet,
  100. URL: url,
  101. Item: params,
  102. }
  103. return []*common.Target{target}, nil
  104. }