# gnews_click.py
  1. import time
  2. import traceback
  3. #import json
  4. from selenium import webdriver
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.common.by import By
  10. from selenium.webdriver.support import expected_conditions as EC
  11. import dataset
  12. from selenium.webdriver.common.keys import Keys
  13. import json
  14. import random
  15. import time
  16. import sys
  17. import codecs
  18. import random
  19. import os
  20. import time
  21. #from userAgentRandomizer import userAgents
  22. driver=None
  23. db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
  24. #db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  25. table=db['gnews_clicks']
  26. def scrolling(driver,pgnum):
  27. ub = driver.find_element(By.CSS_SELECTOR,'body')
  28. for i in range(pgnum):
  29. ub.send_keys(Keys.PAGE_DOWN)
  30. if pgnum>1:
  31. time.sleep(0.3)
  32. def re_get_webdriver():
  33. global driver
  34. result=[]
  35. if driver is not None:
  36. print('closing....')
  37. driver.quit()
  38. os.system('killall chrome')
  39. print('quit....')
  40. driver=None
  41. try:
  42. # ua = userAgents()
  43. # user_agent = ua.random()
  44. options = webdriver.ChromeOptions()
  45. options.add_argument("--no-sandbox")
  46. options.add_argument("--disable-dev-shm-usage")
  47. options.add_argument("--headless")
  48. options.add_argument("--force-dark-mode")
  49. options.add_argument('--start-maximized')
  50. # print(user_agent)
  51. # options.add_argument("--user-agent=" +user_agent)
  52. options.add_argument("--incognito")
  53. driver=None
  54. try:
  55. driver = webdriver.Chrome(options=options)
  56. except:
  57. # driver.quit()
  58. # os.system('pkill -f ')
  59. os.system('kill %d' % os.getpid())
  60. sys.exit()
  61. return
  62. driver.set_window_size(1920, 19200)
  63. return
  64. except:
  65. import traceback
  66. traceback.print_exc()
  67. driver=None
  68. return None
  69. def run_once(jsobj):
  70. print(jsobj)
  71. global driver
  72. global table
  73. # i=random.randint(0,9)
  74. i=10
  75. if driver is None:
  76. time.sleep(8)
  77. re_get_webdriver()
  78. if driver is None:
  79. return
  80. try:
  81. kw=jsobj['kw']
  82. fname=jsobj['fname']
  83. if jsobj.get('domain') is None:
  84. # exclude=jsobj['exclude']
  85. exclude=None
  86. domain=None
  87. else:
  88. domain=jsobj['domain']
  89. exclude=None
  90. # q='裝潢'
  91. q=jsobj['kw']
  92. driver.get('https://news.google.com/search?q='+q+'&hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant')
  93. time.sleep(2)
  94. print(driver.current_url)
  95. time.sleep(2)
  96. # scrolling(driver,20)
  97. # elmts = driver.find_elements(By.XPATH, "//h3[@class='ipQwMb ekueJc RD0gLb']/a")
  98. elmts = driver.find_elements(By.XPATH, "//a[@class='JtKRv' and @jsaction='click:kkIcoc;']")
  99. numresults=len(elmts)
  100. print('搜尋結果數量',numresults)
  101. datadict={'搜尋詞':[],'結果標題':[],'結果網址':[],'結果名次':[],'結果說明':[]}
  102. idx=1
  103. found=False
  104. test_lst=[]
  105. txt_dict={}
  106. for elmt in elmts:
  107. href=elmt.get_attribute('href')
  108. txt=elmt.text
  109. desc=None
  110. try:
  111. desc=txt
  112. # elmt2=elmt.find_element(By.XPATH, "./../../..//div[@data-content-feature=1]")
  113. # desc=elmt2.text
  114. except:
  115. desc=None
  116. if len(txt)>10:
  117. if domain is not None:
  118. for d in domain:
  119. if d in txt:
  120. print('found....')
  121. print('clicked....')
  122. print(href)
  123. print(txt)
  124. print("ranking", idx)
  125. found=True
  126. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  127. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  128. time.sleep(6)
  129. table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt})
  130. return
  131. else:
  132. ex=False
  133. if not ex:
  134. test_lst.append(elmt)
  135. txt_dict[elmt]=txt
  136. idx+=1
  137. # time.sleep(9999)
  138. if exclude is not None:
  139. print('exclude')
  140. elmt=random.choice(test_lst[5:])
  141. print(elmt)
  142. print(txt_dict[elmt])
  143. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  144. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  145. time.sleep(6)
  146. if not found:
  147. table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄'})
  148. except:
  149. print('exception')
  150. traceback.print_exc()
  151. entries=[]
  152. entry={'kw':'啟翔輕金屬 樂鋁屋','domain':['永續實踐再創新'],'ranking':'-1','fname':'fname'}
  153. entries.append(entry)
  154. entry={'kw':'啟翔輕金屬 防疫門','domain':['政治大學'],'ranking':'-1','fname':'fname'}
  155. entries.append(entry)
  156. entry={'kw':'啟翔輕金屬 鋁鋁創新','domain':['把痛點變新商機,'],'ranking':'-1','fname':'fname'}
  157. entries.append(entry)
  158. entry={'kw':'啟翔輕金屬 台灣國際室內設計','domain':['台灣國際室內設計'],'ranking':'-1','fname':'fname'}
  159. entries.append(entry)
  160. entry={'kw':'啟翔輕金屬 人工智慧','domain':['人工智慧'],'ranking':'-1','fname':'fname'}
  161. entries.append(entry)
  162. entry={'kw':'啟翔輕金屬 緬甸','domain':['緬甸'],'ranking':'-1','fname':'fname'}
  163. entries.append(entry)
  164. entry={'kw':'啟翔輕金屬 鋁業','domain':['臺灣鋁業'],'ranking':'-1','fname':'fname'}
  165. entries.append(entry)
  166. #entries.append(entry)
  167. entry=random.choice(entries)
  168. run_once(entry)
  169. #run_once({'kw':'啟翔 防疫門','domain':'政治大學','ranking':'-1','fname':'fname'})
  170. #run_once({'kw':'啟翔輕金屬 防疫門','domain':['政治大學'],'ranking':'-1','fname':'fname'})r
  171. #for c in cursor:
  172. # run_once({'kw':c['kw']})