visitor_detail.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. package qianlima
  2. import (
  3. "bytes"
  4. "errors"
  5. "fmt"
  6. "git.aionnect.com/aionnect/go-common/utils"
  7. "git.aionnect.com/aionnect/go-common/utils/date"
  8. "git.aionnect.com/hello-go/spider/common"
  9. "git.aionnect.com/hello-go/spider/dao"
  10. "git.aionnect.com/hello-go/spider/spiders"
  11. "git.aionnect.com/hello-go/spider/spiders/qianlima/items"
  12. "github.com/PuerkitoBio/goquery"
  13. "net/http"
  14. "strconv"
  15. "strings"
  16. "time"
  17. "xorm.io/xorm"
  18. )
/* ------------------------------------------------------- Constants ---------------------------------------------------- */

// VisitorDetail is the name reported by VisitorDetailSpider.Name.
const VisitorDetail = "VisitorDetail"
/* ------------------------------------------------------- Package-level variables ---------------------------------------------------- */

var (
	// visitorDetailPromise is the request object for detail pages. It is
	// built once at package initialisation with a fixed set of browser-like
	// headers, and the same pointer is handed out by every GetPromise call.
	// NOTE(review): "Accept-Encoding: gzip, deflate" is set explicitly —
	// presumably utils.RequestPromise decompresses the body itself; confirm.
	visitorDetailPromise = utils.NewRequest(). // detail page request
				SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9").
				SetHeader("Accept-Encoding", "gzip, deflate").
				SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
				SetHeader("Cache-Control", "max-age=0").
				SetHeader("Connection", "keep-alive").
				SetHeader("Host", "www.qianlima.com").
				SetHeader("Upgrade-Insecure-Requests", "1").
				SetHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36")
)
/* ------------------------------------------------------- Spider implementation ---------------------------------------------------- */

// VisitorDetailSpider is the spider for visitor detail pages of the Qianlima
// project site.
type VisitorDetailSpider struct {
	db        *xorm.Engine          // database access object
	resultMap *common.ConcurrentMap // temporary key/value store of processed content ids; its length is what the progress output prints
	ticker    *time.Ticker          // timer that drives the periodic progress output
}
  40. // 返回当前爬虫类相应爬虫调度类对象实例
  41. func NewVisitorDetailSpider() spiders.ISpider {
  42. spider := &VisitorDetailSpider{
  43. db: dao.DB("spider"),
  44. resultMap: common.NewConcurrentMap(),
  45. ticker: time.NewTicker(5 * time.Second),
  46. }
  47. // 进度打印(非核心代码)
  48. go func(ticker *time.Ticker) {
  49. for {
  50. <-ticker.C
  51. println("---------------", spider.resultMap.Len())
  52. }
  53. }(spider.ticker)
  54. return spider
  55. }
  56. // 获取请求对象
  57. func (s *VisitorDetailSpider) GetPromise() *utils.RequestPromise {
  58. return visitorDetailPromise
  59. }
  60. // 爬虫命名
  61. func (s *VisitorDetailSpider) Name() string {
  62. return VisitorDetail
  63. }
  64. // 响应解析
  65. func (s *VisitorDetailSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
  66. var content *items.Content
  67. if nil == item {
  68. content = &items.Content{}
  69. } else {
  70. var ok bool
  71. content, ok = item.(*items.Content)
  72. if !ok {
  73. return nil, errors.New(fmt.Sprintf("invaild item %+v", item))
  74. }
  75. }
  76. // 数据解析
  77. dom, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
  78. status := dom.Find(".wenshang .zhuangtai").First().Text()
  79. content.Status = strings.TrimLeft(strings.TrimSpace(status), "状态:")
  80. dom.Find(".wenshang .site a").Each(func(i int, selection *goquery.Selection) {
  81. text := strings.TrimSpace(strings.Trim(strings.TrimSpace(selection.Text()), "-"))
  82. if i == 0 {
  83. content.Province = text
  84. } else if i == 1 {
  85. content.City = text
  86. }
  87. })
  88. wen, _ := dom.Find("#wen").First().Html()
  89. wen = strings.Replace(wen, "<input type=\"hidden\" id=\"zbunit\" value=\"\"/>", "", -1)
  90. wen = strings.Replace(wen, "<input type=\"hidden\" id=\"zburl\" value=\"\"/>", "", -1)
  91. wen = spacePattern.ReplaceAllString(wen, "")
  92. content.Text = wen
  93. // 数据解析后处理逻辑
  94. s.resultMap.Set(strconv.Itoa(content.ContentId), nil) // 暂存结果到字典中(暂时无用,仅做进度打印,非核心代码)
  95. if content.ContentId == 0 { // 通常仅当测试未传递Item参数时会出现此情况
  96. fmt.Printf("empty content id %+v\n", content)
  97. return nil, nil
  98. }
  99. s.exportToDB(content) // 逐条储存结果到数据库
  100. return nil, nil
  101. }
  102. // 爬虫执行结束后需进行的处理
  103. func (s *VisitorDetailSpider) AfterExit() {
  104. s.ticker.Stop()
  105. println("@---------------", s.resultMap.Len()) // 详情页爬虫执行结束时再打印一下进度(非核心代码)
  106. }
  107. /* -------------------------------------------------------数据保存---------------------------------------------------- */
  108. func (s *VisitorDetailSpider) exportToDB(content *items.Content) {
  109. if nil == content {
  110. return
  111. }
  112. if content.ID == 0 {
  113. content.ID = utils.NextId()
  114. }
  115. content.CreatedAt = date.Now()
  116. _, err := s.db.InsertOne(content)
  117. if nil != err {
  118. fmt.Printf("export to db failed %s %+v\n", err.Error(), content)
  119. }
  120. }