package browser_robot import ( "fmt" "github.com/go-vgo/robotgo" "github.com/tebeka/selenium" "github.com/tebeka/selenium/chrome" "log" "math/rand" "os" "os/exec" "path/filepath" "regexp" "runtime" "strings" "time" ) // ------------------------------------ 一条纯UI操作的可视化golang爬虫简单示例 --------------------------------------------- // TODO 仅为示例程序,需要优化结构,且实际使用缺少代理池、随机User-Agent替换、行为策略处理、页面访问历史和路由记录及处理等 // download selenium standalone server jar // https://www.seleniumhq.org/download/ // download gecko driver // https://github.com/mozilla/geckodriver/releases // download chrome driver // https://sites.google.com/a/chromium.org/chromedriver/downloads // or // http://npm.taobao.org/mirrors/chromedriver/ // go get github.com/go-vgo/robotgo // git clone https://github.com/googleapis/google-api-go-client $GOPATH/src/google.golang.org/api // git clone https://github.com/googleapis/google-cloud-go $GOPATH/src/cloud.google.com/go // git clone https://github.com/golang/oauth2 $GOPATH/src/golang.org/x/oauth2 // git clone https://github.com/census-instrumentation/opencensus-go $GOPATH/src/go.opencensus.io // cd $GOPATH/src/github.com/tebeka/selenium/vendor // go get -d ./... // go run init.go --alsologtostderr const ( seleniumPath = "/Users/marion/go/src/github.com/tebeka/selenium/" standaloneServerPath = seleniumPath + "vendor/selenium-server-standalone-3.141.59.jar" geckoDriverPath = seleniumPath + "vendor/geckodriver" servicePort = 9515 ) var quit = make(chan bool) var linkRex = regexp.MustCompile(`(\S*[^.]+\.[^.]+)\s+`) type osAttrs struct { RunCommand string ChromeDriverPath string } type osMap map[string]*osAttrs var currentOS = osMap{ "windows": {RunCommand: "cmd /c start", ChromeDriverPath: seleniumPath + "vendor/chromedriver.exe"}, "darwin": {RunCommand: "open", ChromeDriverPath: seleniumPath + "vendor/chromedriver"}, "linux": {RunCommand: "xdg-open", ChromeDriverPath: seleniumPath + "vendor/chromedriver_linux"}, } // 手工打开系统默认浏览器 func (m osMap) Run(uri string) error { attrs, ok := m[runtime.GOOS] if !ok { return fmt.Errorf("don't know how to open things on %s platform", runtime.GOOS) } cmd := exec.Command(attrs.RunCommand, uri) return cmd.Start() } func GetCurrentDirectory() string { dir, err := filepath.Abs(filepath.Dir(os.Args[0])) if err != nil { log.Fatal(err) } return strings.Replace(dir, "\\", "/", -1) //将\替换成/ } func (m osMap) GetChromeDriverPath() string { attrs, ok := m[runtime.GOOS] if !ok { return "" } return attrs.ChromeDriverPath // return path.Join(GetCurrentDirectory(), attrs.ChromeDriverPath) } // 鼠标位置随机偏移 func pos(x int, y int, xRange int, yRange int) (int, int) { var xPixels, yPixels int if xRange > 0 { xPixels = rand.Intn(xRange / 2) } if rand.Intn(2) == 1 { x = x + xPixels } else { x = x - xPixels } if yRange > 0 { yPixels = rand.Intn(yRange / 2) } if rand.Intn(2) == 1 { y = y + yPixels } else { y = y - yPixels } return x, y } func main() { service, webDriver, err := StartChrome() if err != nil { panic(err) } defer func(s *selenium.Service, wd selenium.WebDriver) { _ = wd.Quit() _ = s.Stop() }(service, webDriver) sWidth, sHeight := robotgo.GetScreenSize() fmt.Println("screen size:", sWidth, sHeight) // 随机鼠标起始位置 robotgo.MoveMouseSmooth(pos(sWidth/2, sHeight/2, sWidth, sHeight)) // TODO 模拟人移动鼠标 // 打开百度首页 fmt.Println("1. open baidu home page") err = webDriver.Get("https://www.baidu.com") if err != nil { fmt.Printf("Failed to load page: %s\n", err) return } _ = webDriver.MaximizeWindow("") // 搜索关键词,进入搜索结果列表页 fmt.Println("2. search keyword") time.Sleep(time.Second * 2) searchBar, _ := webDriver.FindElement(selenium.ByID, "kw") if searchBar == nil { fmt.Println("there no search bar in home page") return } loc, _ := searchBar.Location() // 160 是浏览器头部高度? robotgo.MoveMouseSmooth(pos(sWidth/2, 160+loc.Y, 540, 36)) robotgo.Click() time.Sleep(time.Second * 2) robotgo.TypeString("LED广告屏") // TODO 模拟人打字 time.Sleep(time.Second * 3) leftList, _ := webDriver.FindElement(selenium.ByID, "content_left") if leftList == nil { fmt.Println("there no links list in target page") return } // 获取所有超链接豆腐块,但是后续需要注意筛除掉反爬的蜜罐超链接(例如display=none的) items, _ := leftList.FindElements(selenium.ByTagName, "h3") if len(items) == 0 { fmt.Println("there no links in target page") return } for _, item := range items { if isDisplayed, _ := item.IsDisplayed(); !isDisplayed { continue } a, _ := item.FindElement(selenium.ByTagName, "a") if a == nil { continue } href, _ := a.GetAttribute("href") parent, err := item.FindElement(selenium.ByXPATH, "..") if parent == nil || err != nil { continue } if className, _ := parent.GetAttribute("class"); !strings.Contains(className, "c-container") { parent, _ = parent.FindElement(selenium.ByXPATH, "..") } sammyLinks, _ := parent.FindElements(selenium.ByXPATH, "div[last()]//a") if len(sammyLinks) > 0 { isTargetSite := false for _, a := range sammyLinks { t, _ := a.Text() if t == "" { continue } arr := linkRex.FindStringSubmatch(t) if len(arr) > 1 { siteHost := strings.ToLower(strings.Trim(arr[1], "/")) if siteHost == "www.novisled.com" { // 随便写了个网站测试用,后续处理,如果是目标网站,则继续移动鼠标到它对链接上点击处理,如果未显示先滚动到它的位置 println("found target site,host", siteHost, " href:", href) isTargetSite = true break } } } if isTargetSite { p, _ := a.Location() fmt.Println("link element position:", p.X, p.Y) y := p.Y // TODO 平滑滚动 if p.Y > sHeight { rd := rand.Intn(50) gap := p.Y - sHeight - rd robotgo.ScrollMouse(gap, "down") y = rd + 234 } // 继续干点啥 fmt.Println("link element new position:", p.X, y) robotgo.MoveMouseSmooth(p.X+30, y) fmt.Println("3. go to target site") robotgo.Click() } } } <-quit } // 启动火狐浏览器 func StartFirefox() (service *selenium.Service, webDriver selenium.WebDriver, err error) { opts := []selenium.ServiceOption{ // selenium.StartFrameBuffer(), // Start an X frame buffer for the browser to run in. selenium.GeckoDriver(geckoDriverPath), // Specify the path to GeckoDriver in order to use Firefox. selenium.Output(os.Stderr), // Output debug information to STDERR. } caps := selenium.Capabilities{ "browserName": "firefox", } // 启动GeckoDriver service, err = selenium.NewGeckoDriverService(standaloneServerPath, servicePort, opts...) if err != nil { log.Printf("Error starting the GeckoDriver server: %v", err) return } // 调起浏览器 webDriver, err = selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", servicePort)) if err != nil { log.Printf("Error starting the GeckoDriver web driver: %v", err) return } return } // 启动谷歌浏览器 func StartChrome() (service *selenium.Service, webDriver selenium.WebDriver, err error) { var opts []selenium.ServiceOption caps := selenium.Capabilities{ "browserName": "chrome", } // 禁止加载图片,加快渲染速度 //imagCaps := map[string]interface{}{ // "profile.managed_default_content_settings.images": 2, //} chromeCaps := chrome.Capabilities{ // Prefs: imagCaps, Path: "", Args: []string{ "--start-maximized", // "--headless", // 设置Chrome无头模式 "--no-sandbox", "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7", // 模拟user-agent,防反爬 }, } caps.AddChrome(chromeCaps) // 启动ChromeDriver service, err = selenium.NewChromeDriverService(currentOS.GetChromeDriverPath(), servicePort, opts...) if err != nil { log.Printf("Error starting the ChromeDriver server: %v", err) return } // 调起chrome浏览器 webDriver, err = selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", servicePort)) if err != nil { log.Printf("Error starting the ChromeDriver web driver: %v", err) return } return }