123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294 |
- package browser_robot
- import (
- "fmt"
- "github.com/go-vgo/robotgo"
- "github.com/tebeka/selenium"
- "github.com/tebeka/selenium/chrome"
- "log"
- "math/rand"
- "os"
- "os/exec"
- "path/filepath"
- "regexp"
- "runtime"
- "strings"
- "time"
- )
- // ------------------------------------ 一条纯UI操作的可视化golang爬虫简单示例 ---------------------------------------------
- // TODO 仅为示例程序,需要优化结构,且实际使用缺少代理池、随机User-Agent替换、行为策略处理、页面访问历史和路由记录及处理等
- // download selenium standalone server jar
- // https://www.seleniumhq.org/download/
- // download gecko driver
- // https://github.com/mozilla/geckodriver/releases
- // download chrome driver
- // https://sites.google.com/a/chromium.org/chromedriver/downloads
- // or
- // http://npm.taobao.org/mirrors/chromedriver/
- // go get github.com/go-vgo/robotgo
- // git clone https://github.com/googleapis/google-api-go-client $GOPATH/src/google.golang.org/api
- // git clone https://github.com/googleapis/google-cloud-go $GOPATH/src/cloud.google.com/go
- // git clone https://github.com/golang/oauth2 $GOPATH/src/golang.org/x/oauth2
- // git clone https://github.com/census-instrumentation/opencensus-go $GOPATH/src/go.opencensus.io
- // cd $GOPATH/src/github.com/tebeka/selenium/vendor
- // go get -d ./...
- // go run init.go --alsologtostderr
// Locations of the vendored Selenium assets and the port the WebDriver
// service listens on.
const (
	// seleniumPath is the local checkout of github.com/tebeka/selenium; the
	// other paths are built relative to it.
	seleniumPath = "/Users/marion/go/src/github.com/tebeka/selenium/"
	// standaloneServerPath is the Selenium standalone server jar.
	standaloneServerPath = seleniumPath + "vendor/selenium-server-standalone-3.141.59.jar"
	// geckoDriverPath is the GeckoDriver binary used for Firefox.
	geckoDriverPath = seleniumPath + "vendor/geckodriver"
	// servicePort is the TCP port the driver service is started on.
	servicePort = 9515
)

var (
	// quit is received from at the end of main to keep the process (and the
	// browser window) alive.
	quit = make(chan bool)

	// linkRex captures a run of text that contains at least one dot and is
	// followed by whitespace; main uses the first capture group as the site
	// host shown under a search result.
	linkRex = regexp.MustCompile(`(\S*[^.]+\.[^.]+)\s+`)
)

type (
	// osAttrs holds the per-platform settings used by this program.
	osAttrs struct {
		RunCommand       string // command used to open a URI with the system default handler
		ChromeDriverPath string // path of the chromedriver binary for the platform
	}

	// osMap maps a runtime.GOOS value to the settings for that platform.
	osMap map[string]*osAttrs
)

// currentOS lists the supported platforms, keyed by runtime.GOOS.
var currentOS = osMap{
	"windows": &osAttrs{
		RunCommand:       "cmd /c start",
		ChromeDriverPath: seleniumPath + "vendor/chromedriver.exe",
	},
	"darwin": &osAttrs{
		RunCommand:       "open",
		ChromeDriverPath: seleniumPath + "vendor/chromedriver",
	},
	"linux": &osAttrs{
		RunCommand:       "xdg-open",
		ChromeDriverPath: seleniumPath + "vendor/chromedriver_linux",
	},
}
- // 手工打开系统默认浏览器
- func (m osMap) Run(uri string) error {
- attrs, ok := m[runtime.GOOS]
- if !ok {
- return fmt.Errorf("don't know how to open things on %s platform", runtime.GOOS)
- }
- cmd := exec.Command(attrs.RunCommand, uri)
- return cmd.Start()
- }
// GetCurrentDirectory returns the directory part of os.Args[0] as an
// absolute path, with every backslash replaced by a forward slash.
//
// It terminates the program through log.Fatal if the path cannot be made
// absolute.
func GetCurrentDirectory() string {
	exeDir := filepath.Dir(os.Args[0])

	absDir, err := filepath.Abs(exeDir)
	if err != nil {
		log.Fatal(err)
	}

	// Normalise Windows-style separators to forward slashes.
	return strings.ReplaceAll(absDir, "\\", "/")
}
- func (m osMap) GetChromeDriverPath() string {
- attrs, ok := m[runtime.GOOS]
- if !ok {
- return ""
- }
- return attrs.ChromeDriverPath
- // return path.Join(GetCurrentDirectory(), attrs.ChromeDriverPath)
- }
// pos returns the point (x, y) shifted by a random offset on each axis, used
// to make mouse targets less mechanical.
//
// The offset on an axis is drawn from the open interval
// (-range/2, +range/2) using integer division. A range below 2 (including
// zero and negative values) leaves that coordinate unchanged.
func pos(x int, y int, xRange int, yRange int) (int, int) {
	return x + jitter(xRange), y + jitter(yRange)
}

// jitter returns a random signed offset whose magnitude is in [0, span/2).
// It returns 0 when span/2 is not positive, because rand.Intn panics on a
// non-positive argument.
func jitter(span int) int {
	half := span / 2
	if half <= 0 {
		return 0
	}
	offset := rand.Intn(half)
	if rand.Intn(2) == 1 {
		return offset
	}
	return -offset
}
- func main() {
- service, webDriver, err := StartChrome()
- if err != nil {
- panic(err)
- }
- defer func(s *selenium.Service, wd selenium.WebDriver) {
- _ = wd.Quit()
- _ = s.Stop()
- }(service, webDriver)
- sWidth, sHeight := robotgo.GetScreenSize()
- fmt.Println("screen size:", sWidth, sHeight)
- // 随机鼠标起始位置
- robotgo.MoveMouseSmooth(pos(sWidth/2, sHeight/2, sWidth, sHeight)) // TODO 模拟人移动鼠标
- // 打开百度首页
- fmt.Println("1. open baidu home page")
- err = webDriver.Get("https://www.baidu.com")
- if err != nil {
- fmt.Printf("Failed to load page: %s\n", err)
- return
- }
- _ = webDriver.MaximizeWindow("")
- // 搜索关键词,进入搜索结果列表页
- fmt.Println("2. search keyword")
- time.Sleep(time.Second * 2)
- searchBar, _ := webDriver.FindElement(selenium.ByID, "kw")
- if searchBar == nil {
- fmt.Println("there no search bar in home page")
- return
- }
- loc, _ := searchBar.Location()
- // 160 是浏览器头部高度?
- robotgo.MoveMouseSmooth(pos(sWidth/2, 160+loc.Y, 540, 36))
- robotgo.Click()
- time.Sleep(time.Second * 2)
- robotgo.TypeString("LED广告屏") // TODO 模拟人打字
- time.Sleep(time.Second * 3)
- leftList, _ := webDriver.FindElement(selenium.ByID, "content_left")
- if leftList == nil {
- fmt.Println("there no links list in target page")
- return
- }
- // 获取所有超链接豆腐块,但是后续需要注意筛除掉反爬的蜜罐超链接(例如display=none的)
- items, _ := leftList.FindElements(selenium.ByTagName, "h3")
- if len(items) == 0 {
- fmt.Println("there no links in target page")
- return
- }
- for _, item := range items {
- if isDisplayed, _ := item.IsDisplayed(); !isDisplayed {
- continue
- }
- a, _ := item.FindElement(selenium.ByTagName, "a")
- if a == nil {
- continue
- }
- href, _ := a.GetAttribute("href")
- parent, err := item.FindElement(selenium.ByXPATH, "..")
- if parent == nil || err != nil {
- continue
- }
- if className, _ := parent.GetAttribute("class"); !strings.Contains(className, "c-container") {
- parent, _ = parent.FindElement(selenium.ByXPATH, "..")
- }
- sammyLinks, _ := parent.FindElements(selenium.ByXPATH, "div[last()]//a")
- if len(sammyLinks) > 0 {
- isTargetSite := false
- for _, a := range sammyLinks {
- t, _ := a.Text()
- if t == "" {
- continue
- }
- arr := linkRex.FindStringSubmatch(t)
- if len(arr) > 1 {
- siteHost := strings.ToLower(strings.Trim(arr[1], "/"))
- if siteHost == "www.novisled.com" { // 随便写了个网站测试用,后续处理,如果是目标网站,则继续移动鼠标到它对链接上点击处理,如果未显示先滚动到它的位置
- println("found target site,host", siteHost, " href:", href)
- isTargetSite = true
- break
- }
- }
- }
- if isTargetSite {
- p, _ := a.Location()
- fmt.Println("link element position:", p.X, p.Y)
- y := p.Y
- // TODO 平滑滚动
- if p.Y > sHeight {
- rd := rand.Intn(50)
- gap := p.Y - sHeight - rd
- robotgo.ScrollMouse(gap, "down")
- y = rd + 234
- }
- // 继续干点啥
- fmt.Println("link element new position:", p.X, y)
- robotgo.MoveMouseSmooth(p.X+30, y)
- fmt.Println("3. go to target site")
- robotgo.Click()
- }
- }
- }
- <-quit
- }
- // 启动火狐浏览器
- func StartFirefox() (service *selenium.Service, webDriver selenium.WebDriver, err error) {
- opts := []selenium.ServiceOption{
- // selenium.StartFrameBuffer(), // Start an X frame buffer for the browser to run in.
- selenium.GeckoDriver(geckoDriverPath), // Specify the path to GeckoDriver in order to use Firefox.
- selenium.Output(os.Stderr), // Output debug information to STDERR.
- }
- caps := selenium.Capabilities{
- "browserName": "firefox",
- }
- // 启动GeckoDriver
- service, err = selenium.NewGeckoDriverService(standaloneServerPath, servicePort, opts...)
- if err != nil {
- log.Printf("Error starting the GeckoDriver server: %v", err)
- return
- }
- // 调起浏览器
- webDriver, err = selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", servicePort))
- if err != nil {
- log.Printf("Error starting the GeckoDriver web driver: %v", err)
- return
- }
- return
- }
- // 启动谷歌浏览器
- func StartChrome() (service *selenium.Service, webDriver selenium.WebDriver, err error) {
- var opts []selenium.ServiceOption
- caps := selenium.Capabilities{
- "browserName": "chrome",
- }
- // 禁止加载图片,加快渲染速度
- //imagCaps := map[string]interface{}{
- // "profile.managed_default_content_settings.images": 2,
- //}
- chromeCaps := chrome.Capabilities{
- // Prefs: imagCaps,
- Path: "",
- Args: []string{
- "--start-maximized",
- // "--headless", // 设置Chrome无头模式
- "--no-sandbox",
- "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7", // 模拟user-agent,防反爬
- },
- }
- caps.AddChrome(chromeCaps)
- // 启动ChromeDriver
- service, err = selenium.NewChromeDriverService(currentOS.GetChromeDriverPath(), servicePort, opts...)
- if err != nil {
- log.Printf("Error starting the ChromeDriver server: %v", err)
- return
- }
- // 调起chrome浏览器
- webDriver, err = selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", servicePort))
- if err != nil {
- log.Printf("Error starting the ChromeDriver web driver: %v", err)
- return
- }
- return
- }
|