123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113 |
- package kuaidaili
- import (
- "bytes"
- "errors"
- "fmt"
- "git.aionnect.com/aionnect/go-common/utils/date"
- "git.aionnect.com/hello-go/spider/common"
- "git.aionnect.com/hello-go/spider/dao"
- "git.aionnect.com/hello-go/spider/spiders"
- "github.com/PuerkitoBio/goquery"
- "net/http"
- "strconv"
- "strings"
- )
/* ------------------------------------------------------- Constants ------------------------------------------------------- */

const (
	// TargetUrl is the fmt template for a page of the kuaidaili free-proxy
	// listing; the single %d verb is the page number.
	TargetUrl = "https://www.kuaidaili.com/free/inha/%d/"

	// KuaiDaiLi is the identifier of this spider. It is returned by Name and
	// used as the Key of every Target that Parse schedules.
	KuaiDaiLi = "KuaiDaiLi"
)
/* ------------------------------------------------------- Spider implementation ------------------------------------------------------- */
- // 千里马项目网访客详情页爬虫类
- type FreePagingSpider struct {
- proxyDao *dao.ProxyDao // 代理信息数据访问对象
- }
- // 返回当前爬虫类相应爬虫调度类对象实例
- func NewFreePagingSpider() spiders.ISpider {
- spider := &FreePagingSpider{
- proxyDao: dao.NewProxyDao(),
- }
- return spider
- }
- // 爬虫命名
- func (s *FreePagingSpider) Name() string {
- return KuaiDaiLi
- }
- // 响应解析
- func (s *FreePagingSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
- params := item.(*common.ProxyPagingParams)
- if params.PagingNo == 2 { // 只爬一页,仅测试时使用
- return nil, nil
- }
- // 数据解析
- dom, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
- pageNo := dom.Find("#listnav li a").Last().Text()
- totalPage, err := strconv.Atoi(pageNo)
- if nil != err { // 获取不到尾页信息,异常终止
- return nil, errors.New(fmt.Sprintf("wrong last page bumber %s", err.Error()))
- }
- flag := true
- var proxies []*common.ProxyInfo
- dom.Find(".table-bordered tbody tr").Each(func(i int, row *goquery.Selection) {
- flag = false // 有数据
- proxy := &common.ProxyInfo{}
- row.Find("td").Each(func(j int, cell *goquery.Selection) {
- switch j {
- case 0:
- proxy.IP = common.ConvertToIP(cell.Text())
- case 1:
- proxy.Port, _ = strconv.Atoi(strings.TrimSpace(cell.Text()))
- case 2:
- proxy.Anonymity = strings.TrimSpace(cell.Text())
- case 3:
- proxy.Type = strings.ToUpper(strings.TrimSpace(cell.Text()))
- case 4:
- proxy.Location = strings.TrimSpace(cell.Text())
- case 5:
- proxy.Speed = common.ConvertToSpeed(cell.Text())
- case 6:
- proxy.UpdateTime = date.ParseDatetime(cell.Text())
- }
- })
- if proxy.IP != "" && proxy.Port >= 0 && proxy.Anonymity == common.High && (proxy.Type == common.HTTP || proxy.Type == common.HTTPS) {
- if proxy.Port == 0 {
- if proxy.Type == common.HTTP {
- proxy.Port = 80
- } else {
- proxy.Port = 443
- }
- }
- proxies = append(proxies, proxy)
- }
- })
- // 保存
- s.proxyDao.Save(proxies)
- // 判断是否终止分页轮询
- if flag { // 当前分页无数据,异常终止
- return nil, errors.New(fmt.Sprintf("paging has no rows %d", params.PagingNo))
- }
- params.Count += len(proxies)
- if params.Count > params.Limit { // 超限,终止
- return nil, nil
- }
- if totalPage <= params.PagingNo { // 尾页,终止
- return nil, nil
- }
- // 轮询列表页下一页(暂为顺序轮询)
- params.PagingNo++
- url := fmt.Sprintf(TargetUrl, params.PagingNo)
- target := &common.Target{
- Key: KuaiDaiLi,
- Method: http.MethodGet,
- URL: url,
- Item: params,
- }
- return []*common.Target{target}, nil
- }
|