// Package kuaidaili implements a spider for the free proxy listing pages of
// kuaidaili.com.
package kuaidaili

import (
	"bytes"
	"errors"
	"fmt"
	"net/http"
	"strconv"
	"strings"

	"git.aionnect.com/aionnect/go-common/utils/date"
	"git.aionnect.com/hello-go/spider/common"
	"git.aionnect.com/hello-go/spider/dao"
	"git.aionnect.com/hello-go/spider/spiders"
	"github.com/PuerkitoBio/goquery"
)

/* ------------------------------- Constants ------------------------------- */

const (
	// TargetUrl is the URL template of a listing page; %d is the page number.
	TargetUrl = "https://www.kuaidaili.com/free/inha/%d/"

	// KuaiDaiLi is the spider's name. It is also used as the Key of the
	// follow-up targets produced by Parse.
	KuaiDaiLi = "KuaiDaiLi"
)

/* -------------------------- Spider implementation ------------------------ */

// FreePagingSpider walks the paged free-proxy listing and hands the proxies it
// accepts to proxyDao.
type FreePagingSpider struct {
	proxyDao *dao.ProxyDao // data access object for proxy records
}

// NewFreePagingSpider returns a new FreePagingSpider as a spiders.ISpider.
func NewFreePagingSpider() spiders.ISpider {
	return &FreePagingSpider{
		proxyDao: dao.NewProxyDao(),
	}
}

// Name returns the spider's name.
func (s *FreePagingSpider) Name() string {
	return KuaiDaiLi
}

// Parse extracts the proxies listed in body, saves the accepted ones and
// decides whether the next listing page should be fetched.
//
// item must be a non-nil *common.ProxyPagingParams; its Count and PagingNo
// fields are updated in place. response is not used.
//
// It returns a single follow-up target for the next page, or nil targets when
// paging stops (accepted count above params.Limit, or last page reached). An
// error is returned when item has the wrong type, the body cannot be parsed,
// the last page number cannot be read, or the page contains no table rows.
func (s *FreePagingSpider) Parse(item interface{}, body []byte, response *http.Response) ([]*common.Target, error) {
	params, ok := item.(*common.ProxyPagingParams)
	if !ok || params == nil {
		return nil, errors.New("kuaidaili: item must be a non-nil *common.ProxyPagingParams")
	}

	// NOTE(review): test-only short circuit that stops the crawl after the
	// first page. Kept to preserve behavior; confirm whether it should go.
	if params.PagingNo == 2 {
		return nil, nil
	}

	dom, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, fmt.Errorf("parsing page %d: %w", params.PagingNo, err)
	}

	// The last pager link carries the total number of pages; without it the
	// crawl cannot know where to stop, so a parse failure is fatal.
	lastPage := strings.TrimSpace(dom.Find("#listnav li a").Last().Text())
	totalPage, err := strconv.Atoi(lastPage)
	if err != nil {
		return nil, fmt.Errorf("wrong last page number %w", err)
	}

	hasRows := false
	var proxies []*common.ProxyInfo
	dom.Find(".table-bordered tbody tr").Each(func(_ int, row *goquery.Selection) {
		hasRows = true
		if proxy := parseProxyRow(row); proxy != nil {
			proxies = append(proxies, proxy)
		}
	})

	// Persist whatever was accepted, even if the page later turns out to be
	// empty. NOTE(review): any result of Save is discarded — confirm whether
	// it can fail.
	s.proxyDao.Save(proxies)

	if !hasRows {
		// A page without any rows is treated as an abnormal end of paging.
		return nil, fmt.Errorf("paging has no rows %d", params.PagingNo)
	}

	params.Count += len(proxies)
	if params.Count > params.Limit {
		// Collected more than the requested limit: stop.
		return nil, nil
	}
	if totalPage <= params.PagingNo {
		// Last page reached: stop.
		return nil, nil
	}

	// Continue with the next listing page (pages are walked sequentially).
	params.PagingNo++
	target := &common.Target{
		Key:    KuaiDaiLi,
		Method: http.MethodGet,
		URL:    fmt.Sprintf(TargetUrl, params.PagingNo),
		Item:   params,
	}
	return []*common.Target{target}, nil
}

// parseProxyRow reads the cells of one listing row into a ProxyInfo. It
// returns nil unless the row has an IP, a non-negative port, anonymity
// common.High and type common.HTTP or common.HTTPS.
func parseProxyRow(row *goquery.Selection) *common.ProxyInfo {
	proxy := &common.ProxyInfo{}
	row.Find("td").Each(func(col int, cell *goquery.Selection) {
		text := cell.Text()
		switch col {
		case 0:
			proxy.IP = common.ConvertToIP(text)
		case 1:
			// Best effort: an unparsable port is stored as 0 and replaced by
			// the scheme's default port below.
			port, err := strconv.Atoi(strings.TrimSpace(text))
			if err != nil {
				port = 0
			}
			proxy.Port = port
		case 2:
			proxy.Anonymity = strings.TrimSpace(text)
		case 3:
			proxy.Type = strings.ToUpper(strings.TrimSpace(text))
		case 4:
			proxy.Location = strings.TrimSpace(text)
		case 5:
			proxy.Speed = common.ConvertToSpeed(text)
		case 6:
			proxy.UpdateTime = date.ParseDatetime(text)
		}
	})

	if proxy.IP == "" || proxy.Port < 0 || proxy.Anonymity != common.High {
		return nil
	}
	if proxy.Type != common.HTTP && proxy.Type != common.HTTPS {
		return nil
	}

	// Fall back to the scheme's default port when none was listed.
	if proxy.Port == 0 {
		if proxy.Type == common.HTTP {
			proxy.Port = 80
		} else {
			proxy.Port = 443
		}
	}
	return proxy
}