resources_notebook.py 2.7 KB

  1. import undetected_chromedriver as uc
  2. import time
  3. import os
  4. import urllib
  5. from selenium.webdriver.common.by import By
  6. import sys
  # Module-level handle to the most recently created Chrome driver, or None
  # when no browser session is open. Replaced by re_get_webdriver().
  driver = None


  def re_get_webdriver():
      """Discard any existing browser session and start a fresh one.

      The previous session stored in the module-level ``driver`` (if any) is
      quit first. A new undetected-chromedriver Chrome instance is then
      created with a 200x100 window positioned at (-32000, -32000), and its
      cookies are cleared.

      Returns:
          The new driver, which is also stored in the module-level ``driver``,
          or ``None`` if the browser could not be started (the module-level
          ``driver`` is ``None`` in that case).
      """
      global driver

      if driver is not None:
          print('closing....')
          try:
              driver.quit()
              print('quit....')
          except Exception as exc:
              # A dead or unresponsive browser must not prevent replacing it.
              print('quit failed:', exc)
          finally:
              driver = None

      new_driver = None
      try:
          options = uc.ChromeOptions()
          options.add_argument("--window-size=200,100")  # shrink the window
          options.add_argument("--window-position=-32000,-32000")  # move it off-screen
          new_driver = uc.Chrome(options=options)
          new_driver.delete_all_cookies()
      except Exception as exc:
          print('failed to start webdriver:', exc)
          if new_driver is not None:
              # The browser started but setup failed: close it so the
              # process is not leaked. Best effort only, since it is being
              # thrown away anyway.
              try:
                  new_driver.quit()
              except Exception as quit_exc:
                  print('quit failed:', quit_exc)
          return None

      driver = new_driver
      return driver
  def get_resource(kw):
      """Return the result links on the first Google results page for *kw*.

      A fresh browser is obtained from ``re_get_webdriver()``, retrying every
      3 seconds until one is available. The search page is loaded, and the
      ``href`` of every anchor under a ``div`` with class ``yuRUbf`` is
      collected. The browser is always closed before this function exits,
      whichever way it exits.

      Args:
          kw: Search keywords; URL-quoted before being placed in the query.

      Returns:
          A list of ``href`` values, or ``None`` if loading or scraping the
          page raised an exception.

      Raises:
          SystemExit: If the page contains no matching result links.
      """
      # Function-scope import: a bare ``import urllib`` does not guarantee
      # that the ``urllib.parse`` submodule has been loaded.
      from urllib.parse import quote

      global driver

      while True:
          browser = re_get_webdriver()
          print('re_get_webdriver')
          if browser is not None:
              break
          time.sleep(3)

      try:
          googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw'.format(
              quote(kw), 100, 'zh-TW')
          browser.get(googleurl)
          time.sleep(6)  # give the results page time to load
          print(browser.current_url)
          elmts = browser.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
          numresults = len(elmts)
          print('搜尋結果數量', numresults)  # label reads "number of search results"
          if numresults == 0:
              # Nothing matched: dump where we ended up, then stop the program.
              # SystemExit is not an Exception subclass, so it bypasses the
              # handler below but still runs the cleanup in ``finally``.
              print(browser.current_url)
              print(browser.title)
              sys.exit()
          resources_list = [elmt.get_attribute('href') for elmt in elmts]
          print(resources_list)
          return resources_list
      except Exception as e:
          print('exception', e)
          return None
      finally:
          # Close the browser on every exit path.
          try:
              browser.quit()
          except Exception as quit_exc:
              print('quit failed:', quit_exc)
          # Clear the shared handle so the next re_get_webdriver() call does
          # not try to quit this already-closed session again.
          driver = None
  # Fetch the source URLs listed on the first page of search results.
  get_resource(kw='')