visitor_detail.go 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. package qianlima
  2. import (
  3. "bytes"
  4. "errors"
  5. "fmt"
  6. "git.aionnect.com/aionnect/go-common/utils"
  7. "git.aionnect.com/aionnect/go-common/utils/date"
  8. "git.aionnect.com/hello-go/spider/common"
  9. "git.aionnect.com/hello-go/spider/dao"
  10. "git.aionnect.com/hello-go/spider/spiders"
  11. "git.aionnect.com/hello-go/spider/spiders/qianlima/items"
  12. "github.com/PuerkitoBio/goquery"
  13. "net/http"
  14. "strconv"
  15. "strings"
  16. "time"
  17. "xorm.io/xorm"
  18. )
  /* ------------------------------------------------------- Constants ---------------------------------------------------- */

  const (
  	// VisitorDetail is the spider name returned by VisitorDetailSpider.Name.
  	VisitorDetail = "VisitorDetail"
  )
  21. /* -------------------------------------------------------全局变量---------------------------------------------------- */
  22. var (
  23. visitorDetailPromise = utils.NewRequest(). // 详情页请求
  24. SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
  25. SetHeader("Accept-Encoding", "gzip, deflate").
  26. SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
  27. SetHeader("Cache-Control", "max-age=0").
  28. SetHeader("Connection", "keep-alive").
  29. SetHeader("Host", "www.qianlima.com").
  30. SetHeader("Upgrade-Insecure-Requests", "1").
  31. SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
  32. )
  33. /* -------------------------------------------------------爬虫实现---------------------------------------------------- */
  34. // 千里马项目网访客详情页爬虫类
  35. type VisitorDetailSpider struct {
  36. db *xorm.Engine // 数据库访问对象
  37. resultMap *common.ConcurrentMap // 数据暂存价值对
  38. ticker *time.Ticker // 显示进度的定时器
  39. }
  40. // 返回当前爬虫类相应爬虫调度类对象实例
  41. func NewVisitorDetailSpider() spiders.ISpider {
  42. spider := &VisitorDetailSpider{
  43. db: dao.DB("spider"),
  44. resultMap: common.NewConcurrentMap(),
  45. ticker: time.NewTicker(5 * time.Second),
  46. }
  47. // 进度打印(非核心代码)
  48. go func(ticker *time.Ticker) {
  49. for {
  50. <-ticker.C
  51. println("---------------", spider.resultMap.Len())
  52. }
  53. }(spider.ticker)
  54. return spider
  55. }
  56. // 获取请求对象
  57. func (s *VisitorDetailSpider) GetPromise() *utils.RequestPromise {
  58. return visitorDetailPromise
  59. }
  60. // 爬虫命名
  61. func (s *VisitorDetailSpider) Name() string {
  62. return VisitorDetail
  63. }
  64. // 响应解析
  65. func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, headers http.Header) ([]*common.Target, error) {
  66. content, ok := item.(*items.Content)
  67. if !ok {
  68. return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
  69. }
  70. // 数据解析
  71. dom, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
  72. status := dom.Find(".wenshang .zhuangtai").First().Text()
  73. content.Status = strings.TrimLeft(strings.TrimSpace(status), "状态:")
  74. dom.Find(".wenshang .site a").Each(func(i int, selection *goquery.Selection) {
  75. text := strings.TrimSpace(strings.Trim(strings.TrimSpace(selection.Text()), "-"))
  76. if i == 0 {
  77. content.Province = text
  78. } else if i == 1 {
  79. content.City = text
  80. }
  81. })
  82. wen, _ := dom.Find("#wen").First().Html()
  83. wen = strings.Replace(wen, "<input type=\"hidden\" id=\"zbunit\" value=\"\"/>", "", -1)
  84. wen = strings.Replace(wen, "<input type=\"hidden\" id=\"zburl\" value=\"\"/>", "", -1)
  85. wen = spacePattern.ReplaceAllString(wen, "")
  86. content.Text = wen
  87. // 数据解析后处理逻辑
  88. s.resultMap.Set(strconv.Itoa(content.ContentId), nil) // 暂存结果到字典中(暂时无用,仅做进度打印,非核心代码)
  89. s.exportToDB(content) // 逐条储存结果到数据库
  90. return nil, nil
  91. }
  92. // 爬虫执行结束后需进行的处理
  93. func (s *VisitorDetailSpider) AfterExit() {
  94. s.ticker.Stop()
  95. println("@---------------", s.resultMap.Len()) // 详情页爬虫执行结束时再打印一下进度(非核心代码)
  96. }
  97. /* -------------------------------------------------------数据保存---------------------------------------------------- */
  98. func (s *VisitorDetailSpider) exportToDB(content *items.Content) {
  99. if nil == content {
  100. return
  101. }
  102. if content.ID == 0 {
  103. content.ID = utils.NextId()
  104. }
  105. content.CreatedAt = date.Now()
  106. _, err := s.db.InsertOne(content)
  107. if nil != err {
  108. fmt.Printf("export to db failed %s %+v\n", err.Error(), content)
  109. }
  110. }