base_spider.go 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. package spiders
  2. import (
  3. "fmt"
  4. "git.aionnect.com/aionnect/go-common/utils"
  5. "git.aionnect.com/hello-go/spider/common"
  6. "git.aionnect.com/hello-go/spider/dao"
  7. "net/http"
  8. "strings"
  9. "sync"
  10. )
  11. // 爬虫调度上下文
  12. type spidersContext struct {
  13. counter sync.WaitGroup // 同步器
  14. spiderMap map[string]*Spider // 多级爬虫价值对
  15. }
  16. // 开始爬虫调度
  17. func Run(startingTarget *common.Target, spiders ...ISpider) {
  18. if nil == startingTarget || nil == spiders || len(spiders) == 0 {
  19. return
  20. }
  21. // 爬虫调度上下文初始化
  22. ctx := &spidersContext{
  23. spiderMap: make(map[string]*Spider),
  24. }
  25. for i := 0; i < len(spiders); i++ {
  26. spider := newSpider(spiders[i], ctx) // 包装上爬虫执行类
  27. name := strings.TrimSpace(spider.Name())
  28. if _, ok := ctx.spiderMap[name]; !ok {
  29. ctx.spiderMap[name] = spider
  30. }
  31. }
  32. // 从起始页开始处理
  33. ctx.router(startingTarget)
  34. // 等待爬虫执行完毕
  35. ctx.counter.Wait()
  36. // 爬虫执行完毕后处理
  37. for _, spider := range ctx.spiderMap {
  38. if after, ok := spider.ISpider.(IAfterSpiderExit); ok {
  39. after.AfterExit()
  40. }
  41. close(spider.queue)
  42. }
  43. }
  44. // 爬虫路由处理
  45. func (ctx *spidersContext) router(target *common.Target) {
  46. if nil == target || nil == ctx.spiderMap || len(ctx.spiderMap) == 0 {
  47. fmt.Println("spidersContext.router() invalid params")
  48. return
  49. }
  50. target.Key = strings.TrimSpace(target.Key)
  51. if target.Key == "" {
  52. fmt.Println("spidersContext.router() target key can not be empty")
  53. return
  54. }
  55. if spider, ok := ctx.spiderMap[target.Key]; ok {
  56. ctx.counter.Add(1)
  57. spider.Send(target)
  58. }
  59. }
  60. // 爬虫主接口抽象
  61. type ISpider interface {
  62. Name() string // 爬虫命名
  63. Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) // 响应解析
  64. }
  65. // 自定义爬虫请求接口抽象
  66. type IPromiseDefiner interface {
  67. GetPromise() *utils.RequestPromise
  68. }
  // IAfterSpiderExit is an optional interface for spiders that need to do work
  // once crawling has finished.
  type IAfterSpiderExit interface {
  	// AfterExit is called by Run after all queued targets were processed.
  	AfterExit()
  }
  73. // 爬虫执行类(模拟抽象类及模版方法模式)
  74. type Spider struct {
  75. ISpider
  76. queue chan *common.Target // 待爬队列
  77. promise *utils.RequestPromise // 请求对象
  78. context *spidersContext // 爬虫调度上下文的引用
  79. proxyDao *dao.ProxyDao // 代理信息数据访问对象
  80. }
  81. // 返回新的爬虫调度类对象实例(模版方法)
  82. func newSpider(instance ISpider, ctx *spidersContext) *Spider {
  83. spider := &Spider{
  84. ISpider: instance,
  85. queue: make(chan *common.Target, 5),
  86. context: ctx,
  87. proxyDao: dao.NewProxyDao(),
  88. }
  89. if definer, ok := instance.(IPromiseDefiner); ok {
  90. spider.promise = definer.GetPromise()
  91. } else {
  92. spider.promise = utils.NewRequest()
  93. }
  94. go spider.handle() // 启动待爬队列异步处理协程
  95. return spider
  96. }
  97. // 新待爬请求入队
  98. func (s *Spider) Send(target *common.Target) {
  99. if nil == target {
  100. return
  101. }
  102. // 入队不启动新协程,起到bufferSize满时阻塞等待的效果
  103. s.queue <- target
  104. }
  105. // 待爬队列处理
  106. func (s *Spider) handle() {
  107. for {
  108. select {
  109. case target, ok := <-s.queue:
  110. if ok {
  111. s.invoke(target)
  112. s.context.counter.Done() // 同步器扣减
  113. }
  114. }
  115. }
  116. }
  117. // 请求并处理(模版方法)
  118. func (s *Spider) invoke(target *common.Target) {
  119. target.Referer = strings.TrimSpace(target.Referer)
  120. if target.Referer != "" {
  121. s.promise = s.promise.SetHeader("Referer", target.Referer)
  122. }
  123. s.promise.SetHeader("User-Agent", common.RandUserAgent(target.IsCell)) // 随机User-Agent
  124. s.promise.SetHttpProxy(s.proxyDao.Get()) // 随机代理
  125. body, resp, err := s.promise.CallResponse(target.Method, target.URL, target.Data)
  126. if nil != err {
  127. fmt.Printf("http request [%s] %s failed. %s\n", target.Method, target.URL, err.Error())
  128. return
  129. }
  130. if nil == body || len(body) <= 0 {
  131. fmt.Printf("empty http response [%s] %s", target.Method, target.URL)
  132. return
  133. }
  134. var nextTargets []*common.Target
  135. nextTargets, err = s.Parse(target.Item, body, resp)
  136. if nil != err {
  137. fmt.Println(err.Error())
  138. return
  139. }
  140. if nil == nextTargets || len(nextTargets) == 0 {
  141. return
  142. }
  143. for i := 0; i < len(nextTargets); i++ {
  144. next := nextTargets[i]
  145. s.context.router(next)
  146. }
  147. }