browser_robot.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. package browser_robot
  2. import (
  3. "fmt"
  4. "github.com/go-vgo/robotgo"
  5. "github.com/tebeka/selenium"
  6. "github.com/tebeka/selenium/chrome"
  7. "log"
  8. "math/rand"
  9. "os"
  10. "os/exec"
  11. "path/filepath"
  12. "regexp"
  13. "runtime"
  14. "strings"
  15. "time"
  16. )
  17. // ------------------------------------ 一条纯UI操作的可视化golang爬虫简单示例 ---------------------------------------------
  18. // TODO 仅为示例程序,需要优化结构,且实际使用缺少代理池、随机User-Agent替换、行为策略处理、页面访问历史和路由记录及处理等
  19. // download selenium standalone server jar
  20. // https://www.seleniumhq.org/download/
  21. // download gecko driver
  22. // https://github.com/mozilla/geckodriver/releases
  23. // download chrome driver
  24. // https://sites.google.com/a/chromium.org/chromedriver/downloads
  25. // or
  26. // http://npm.taobao.org/mirrors/chromedriver/
  27. // go get github.com/go-vgo/robotgo
  28. // git clone https://github.com/googleapis/google-api-go-client $GOPATH/src/google.golang.org/api
  29. // git clone https://github.com/googleapis/google-cloud-go $GOPATH/src/cloud.google.com/go
  30. // git clone https://github.com/golang/oauth2 $GOPATH/src/golang.org/x/oauth2
  31. // git clone https://github.com/census-instrumentation/opencensus-go $GOPATH/src/go.opencensus.io
  32. // cd $GOPATH/src/github.com/tebeka/selenium/vendor
  33. // go get -d ./...
  34. // go run init.go --alsologtostderr
  // Locations of the Selenium artefacts on disk and the TCP port the driver
  // service is started on.
  const (
  	servicePort = 9515

  	seleniumPath         = "/Users/marion/go/src/github.com/tebeka/selenium/"
  	geckoDriverPath      = seleniumPath + "vendor/geckodriver"
  	standaloneServerPath = seleniumPath + "vendor/selenium-server-standalone-3.141.59.jar"
  )

  type (
  	// osAttrs holds the settings that differ from one operating system to
  	// the next.
  	osAttrs struct {
  		RunCommand       string // command used to open a URI
  		ChromeDriverPath string // location of the ChromeDriver binary
  	}

  	// osMap associates an operating system name (looked up with
  	// runtime.GOOS by its methods) with the settings for that system.
  	osMap map[string]*osAttrs
  )

  var (
  	// quit is never written to in this file; main receives from it to keep
  	// the process alive.
  	quit = make(chan bool)

  	// linkRex matches text of the form "<something>.<something>" followed
  	// by whitespace. Capture group 1 is the text before that whitespace.
  	linkRex = regexp.MustCompile(`(\S*[^.]+\.[^.]+)\s+`)

  	// currentOS lists the supported operating systems.
  	currentOS = osMap{
  		"windows": &osAttrs{
  			RunCommand:       "cmd /c start",
  			ChromeDriverPath: seleniumPath + "vendor/chromedriver.exe",
  		},
  		"darwin": &osAttrs{
  			RunCommand:       "open",
  			ChromeDriverPath: seleniumPath + "vendor/chromedriver",
  		},
  		"linux": &osAttrs{
  			RunCommand:       "xdg-open",
  			ChromeDriverPath: seleniumPath + "vendor/chromedriver_linux",
  		},
  	}
  )
  53. // 手工打开系统默认浏览器
  54. func (m osMap) Run(uri string) error {
  55. attrs, ok := m[runtime.GOOS]
  56. if !ok {
  57. return fmt.Errorf("don't know how to open things on %s platform", runtime.GOOS)
  58. }
  59. cmd := exec.Command(attrs.RunCommand, uri)
  60. return cmd.Start()
  61. }
  // GetCurrentDirectory returns the absolute form of the directory part of
  // os.Args[0], with every backslash replaced by a forward slash. If the path
  // cannot be made absolute the process is terminated through log.Fatal.
  func GetCurrentDirectory() string {
  	exeDir := filepath.Dir(os.Args[0])

  	absDir, err := filepath.Abs(exeDir)
  	if err != nil {
  		log.Fatal(err)
  	}

  	// Normalise Windows-style separators.
  	return strings.ReplaceAll(absDir, "\\", "/")
  }
  69. func (m osMap) GetChromeDriverPath() string {
  70. attrs, ok := m[runtime.GOOS]
  71. if !ok {
  72. return ""
  73. }
  74. return attrs.ChromeDriverPath
  75. // return path.Join(GetCurrentDirectory(), attrs.ChromeDriverPath)
  76. }
  // pos returns the point (x, y) displaced by a random offset, so that
  // successive mouse moves do not land on exactly the same pixel.
  //
  // Each coordinate is shifted by an offset in [0, range/2), in a randomly
  // chosen direction. A range smaller than 2 leaves no room for an offset and
  // the coordinate is returned unchanged.
  func pos(x int, y int, xRange int, yRange int) (int, int) {
  	return jitter(x, xRange), jitter(y, yRange)
  }

  // jitter shifts v by a random amount in [0, span/2), up or down.
  func jitter(v int, span int) int {
  	var offset int
  	// rand.Intn panics for n <= 0 and span/2 is 0 when span is 1, so the
  	// guard is on the halved value rather than on span itself.
  	if half := span / 2; half > 0 {
  		offset = rand.Intn(half)
  	}
  	// The direction is drawn unconditionally, so a zero offset still
  	// consumes one random value.
  	if rand.Intn(2) == 1 {
  		return v + offset
  	}
  	return v - offset
  }
  98. func main() {
  99. service, webDriver, err := StartChrome()
  100. if err != nil {
  101. panic(err)
  102. }
  103. defer func(s *selenium.Service, wd selenium.WebDriver) {
  104. _ = wd.Quit()
  105. _ = s.Stop()
  106. }(service, webDriver)
  107. sWidth, sHeight := robotgo.GetScreenSize()
  108. fmt.Println("screen size:", sWidth, sHeight)
  109. // 随机鼠标起始位置
  110. robotgo.MoveMouseSmooth(pos(sWidth/2, sHeight/2, sWidth, sHeight)) // TODO 模拟人移动鼠标
  111. // 打开百度首页
  112. fmt.Println("1. open baidu home page")
  113. err = webDriver.Get("https://www.baidu.com")
  114. if err != nil {
  115. fmt.Printf("Failed to load page: %s\n", err)
  116. return
  117. }
  118. _ = webDriver.MaximizeWindow("")
  119. // 搜索关键词,进入搜索结果列表页
  120. fmt.Println("2. search keyword")
  121. time.Sleep(time.Second * 2)
  122. searchBar, _ := webDriver.FindElement(selenium.ByID, "kw")
  123. if searchBar == nil {
  124. fmt.Println("there no search bar in home page")
  125. return
  126. }
  127. loc, _ := searchBar.Location()
  128. // 160 是浏览器头部高度?
  129. robotgo.MoveMouseSmooth(pos(sWidth/2, 160+loc.Y, 540, 36))
  130. robotgo.Click()
  131. time.Sleep(time.Second * 2)
  132. robotgo.TypeString("LED广告屏") // TODO 模拟人打字
  133. time.Sleep(time.Second * 3)
  134. leftList, _ := webDriver.FindElement(selenium.ByID, "content_left")
  135. if leftList == nil {
  136. fmt.Println("there no links list in target page")
  137. return
  138. }
  139. // 获取所有超链接豆腐块,但是后续需要注意筛除掉反爬的蜜罐超链接(例如display=none的)
  140. items, _ := leftList.FindElements(selenium.ByTagName, "h3")
  141. if len(items) == 0 {
  142. fmt.Println("there no links in target page")
  143. return
  144. }
  145. for _, item := range items {
  146. if isDisplayed, _ := item.IsDisplayed(); !isDisplayed {
  147. continue
  148. }
  149. a, _ := item.FindElement(selenium.ByTagName, "a")
  150. if a == nil {
  151. continue
  152. }
  153. href, _ := a.GetAttribute("href")
  154. parent, err := item.FindElement(selenium.ByXPATH, "..")
  155. if parent == nil || err != nil {
  156. continue
  157. }
  158. if className, _ := parent.GetAttribute("class"); !strings.Contains(className, "c-container") {
  159. parent, _ = parent.FindElement(selenium.ByXPATH, "..")
  160. }
  161. sammyLinks, _ := parent.FindElements(selenium.ByXPATH, "div[last()]//a")
  162. if len(sammyLinks) > 0 {
  163. isTargetSite := false
  164. for _, a := range sammyLinks {
  165. t, _ := a.Text()
  166. if t == "" {
  167. continue
  168. }
  169. arr := linkRex.FindStringSubmatch(t)
  170. if len(arr) > 1 {
  171. siteHost := strings.ToLower(strings.Trim(arr[1], "/"))
  172. if siteHost == "www.novisled.com" { // 随便写了个网站测试用,后续处理,如果是目标网站,则继续移动鼠标到它对链接上点击处理,如果未显示先滚动到它的位置
  173. println("found target site,host", siteHost, " href:", href)
  174. isTargetSite = true
  175. break
  176. }
  177. }
  178. }
  179. if isTargetSite {
  180. p, _ := a.Location()
  181. fmt.Println("link element position:", p.X, p.Y)
  182. y := p.Y
  183. // TODO 平滑滚动
  184. if p.Y > sHeight {
  185. rd := rand.Intn(50)
  186. gap := p.Y - sHeight - rd
  187. robotgo.ScrollMouse(gap, "down")
  188. y = rd + 234
  189. }
  190. // 继续干点啥
  191. fmt.Println("link element new position:", p.X, y)
  192. robotgo.MoveMouseSmooth(p.X+30, y)
  193. fmt.Println("3. go to target site")
  194. robotgo.Click()
  195. }
  196. }
  197. }
  198. <-quit
  199. }
  200. // 启动火狐浏览器
  201. func StartFirefox() (service *selenium.Service, webDriver selenium.WebDriver, err error) {
  202. opts := []selenium.ServiceOption{
  203. // selenium.StartFrameBuffer(), // Start an X frame buffer for the browser to run in.
  204. selenium.GeckoDriver(geckoDriverPath), // Specify the path to GeckoDriver in order to use Firefox.
  205. selenium.Output(os.Stderr), // Output debug information to STDERR.
  206. }
  207. caps := selenium.Capabilities{
  208. "browserName": "firefox",
  209. }
  210. // 启动GeckoDriver
  211. service, err = selenium.NewGeckoDriverService(standaloneServerPath, servicePort, opts...)
  212. if err != nil {
  213. log.Printf("Error starting the GeckoDriver server: %v", err)
  214. return
  215. }
  216. // 调起浏览器
  217. webDriver, err = selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", servicePort))
  218. if err != nil {
  219. log.Printf("Error starting the GeckoDriver web driver: %v", err)
  220. return
  221. }
  222. return
  223. }
  224. // 启动谷歌浏览器
  225. func StartChrome() (service *selenium.Service, webDriver selenium.WebDriver, err error) {
  226. var opts []selenium.ServiceOption
  227. caps := selenium.Capabilities{
  228. "browserName": "chrome",
  229. }
  230. // 禁止加载图片,加快渲染速度
  231. //imagCaps := map[string]interface{}{
  232. // "profile.managed_default_content_settings.images": 2,
  233. //}
  234. chromeCaps := chrome.Capabilities{
  235. // Prefs: imagCaps,
  236. Path: "",
  237. Args: []string{
  238. "--start-maximized",
  239. // "--headless", // 设置Chrome无头模式
  240. "--no-sandbox",
  241. "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7", // 模拟user-agent,防反爬
  242. },
  243. }
  244. caps.AddChrome(chromeCaps)
  245. // 启动ChromeDriver
  246. service, err = selenium.NewChromeDriverService(currentOS.GetChromeDriverPath(), servicePort, opts...)
  247. if err != nil {
  248. log.Printf("Error starting the ChromeDriver server: %v", err)
  249. return
  250. }
  251. // 调起chrome浏览器
  252. webDriver, err = selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", servicePort))
  253. if err != nil {
  254. log.Printf("Error starting the ChromeDriver web driver: %v", err)
  255. return
  256. }
  257. return
  258. }