# gsearch_general.py
import codecs
import datetime
import os
import random
import sys
import time
import traceback
import urllib.parse

import docker
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# import dataset
# import googlesearch
# import rpyc
  25. def process_one(driver):
  26. lst=[]
  27. elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  28. for elmt in elmts:
  29. try:
  30. href=elmt.get_attribute('href')
  31. # print(href)
  32. txt=elmt.text.split('\n')
  33. print(txt[0])
  34. lst.append({'title':txt[0],'url':href})
  35. except:
  36. print('href2 exception')
  37. traceback.print_exc()
  38. return lst
  39. def process_query(driver,qs,number_results=10,language_code='en',enable_next=True):
  40. escaped_search_term=urllib.parse.quote(qs)
  41. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(escaped_search_term, number_results+1,language_code)
  42. print(googleurl)
  43. driver.get(googleurl)
  44. time.sleep(3)
  45. totallst=[]
  46. while True:
  47. lst=process_one(driver)
  48. totallst+=lst
  49. try:
  50. if enable_next:
  51. time.sleep(3)
  52. elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
  53. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  54. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  55. else:
  56. break
  57. except:
  58. traceback.print_exc()
  59. print('pnnext exception')
  60. break
  61. time.sleep(1.5)
  62. return totallst
  63. result=[]
  64. driver=None
  65. def restart_browser():
  66. # os.system('docker container restart p4444')
  67. # time.sleep(10)
  68. options = webdriver.ChromeOptions()
  69. # options.add_argument("--proxy-server=http://80.48.119.28:8080")
  70. driver=webdriver.Chrome(executable_path='/Users/zooeytsai/Downloads/chromedriver',options=options)
  71. # driver=webdriver.Chrome(desired_capabilities=options.to_capabilities())
  72. #driver = webdriver.Remote(
  73. # command_executor='http://127.0.0.1:4444/wd/hub',
  74. #desired_capabilities=options.to_capabilities())
  75. # desired_capabilities=DesiredCapabilities.CHROME)
  76. driver.set_window_size(1400,1000)
  77. return driver