# gsearch_general.py

import codecs
import datetime
import os
import random
import sys
import time
import traceback
import urllib.parse

import dataset
import docker
import googlesearch
import requests
import rpyc
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
  25. def process_one(driver):
  26. lst=[]
  27. elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  28. for elmt in elmts:
  29. try:
  30. href=elmt.get_attribute('href')
  31. # print(href)
  32. txt=elmt.text.split('\n')
  33. print(txt[0])
  34. lst.append({'title':txt[0],'url':href})
  35. except:
  36. print('href2 exception')
  37. traceback.print_exc()
  38. return lst
  39. def process_query(driver,qs,number_results=10,language_code='en',enable_next=True):
  40. escaped_search_term=urllib.parse.quote(qs)
  41. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(escaped_search_term, number_results+1,language_code)
  42. print(googleurl)
  43. driver.get(googleurl)
  44. time.sleep(3)
  45. totallst=[]
  46. while True:
  47. lst=process_one(driver)
  48. totallst+=lst
  49. try:
  50. if enable_next:
  51. elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
  52. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  53. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  54. else:
  55. break
  56. except:
  57. traceback.print_exc()
  58. print('pnnext exception')
  59. break
  60. time.sleep(1.5)
  61. return totallst
  62. result=[]
  63. driver=None
  64. def restart_browser():
  65. # os.system('docker container restart p4444')
  66. # time.sleep(10)
  67. options = webdriver.ChromeOptions()
  68. # options.add_argument("--proxy-server=http://80.48.119.28:8080")
  69. driver=webdriver.Chrome(options=options)
  70. # driver=webdriver.Chrome(desired_capabilities=options.to_capabilities())
  71. #driver = webdriver.Remote(
  72. # command_executor='http://127.0.0.1:4444/wd/hub',
  73. #desired_capabilities=options.to_capabilities())
  74. # desired_capabilities=DesiredCapabilities.CHROME)
  75. driver.set_window_size(1400,1000)
  76. return driver