gen_seo.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.support import expected_conditions as EC
  10. import dataset
  11. from selenium import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.common.keys import Keys
  14. from selenium.webdriver.chrome.service import Service
  15. import json
  16. import random
  17. import time
  18. import datetime
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. import requests
  25. import pymysql
  26. import urllib.parse
  27. from userAgentRandomizer import userAgents
  28. pymysql.install_as_MySQLdb()
  29. driver=None
  30. headers = {
  31. "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
  32. "Content-Type": "application/x-www-form-urlencoded"
  33. }
  34. def send_msg(kw):
  35. params = {"message":kw}
  36. r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  37. blacklist = ['https://www.chinatimes.com/realtimenews/20220613003142-260402','https://ipo168.pixnet.net/blog/post/207626239-%E5%95%9F%E7%BF%94%E8%BC%95%E9%87%91%E5%B1%AC%E7%A7%91%E6%8A%80%E8%82%A1%E7%A5%A8%E6%98%AF%E9%80%99%E6%A8%A3%E7%9A%84%E5%85%AC%E5%8F%B8%21%21%E6%8A%95%E8%B3%87%E5%89%8D%E8%A6%81','https://latest.mediatagtw.com/article/%e5%95%9f%e7%bf%94%e8%bc%95%e9%87%91%e5%b1%ac%e7%a7%91%e6%8a%80%e8%82%a1%e4%bb%bd%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8#gsc.tab=0']
  38. def re_get_webdriver():
  39. global port
  40. global driver
  41. result=[]
  42. if driver is not None:
  43. print('closing....')
  44. driver.quit()
  45. os.system('killall chrome')
  46. print('quit....')
  47. driver=None
  48. try:
  49. options = webdriver.ChromeOptions()
  50. # options.add_argument("user-agent=%s" % user_agent)
  51. options.add_argument('--headless')
  52. options.add_argument("--incognito")
  53. driver = webdriver.Chrome(options=options)
  54. driver.delete_all_cookies()
  55. driver.set_window_size(1400,1000)
  56. except:
  57. traceback.print_exc()
  58. driver=None
  59. return None
  60. def getDriver():
  61. ua=userAgents().random()
  62. options = webdriver.ChromeOptions()
  63. #print(ua)
  64. #options.add_argument("user-agent="+ua)
  65. options.add_argument('--headless')
  66. options.add_argument('--incognito')
  67. options.add_argument('--no-sandbox')
  68. driver=webdriver.Chrome(options=options)
  69. driver.set_window_size(1400,1000)
  70. return driver
  71. def run_once(jsobj):
  72. table=db['nda_log']
  73. print(jsobj)
  74. global driver
  75. # i=random.randint(0,9)
  76. i=100
  77. driver=getDriver()
  78. try:
  79. kw=jsobj['kw']
  80. if jsobj.get('domain') is None:
  81. exclude=jsobj['exclude']
  82. domain=None
  83. else:
  84. domain=jsobj['domain']
  85. exclude=None
  86. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  87. driver.get(googleurl)
  88. time.sleep(6)
  89. print(driver.current_url)
  90. if 'sorry' in driver.current_url:
  91. print("URL Error: Caught")
  92. driver.quit()
  93. return
  94. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  95. # time.sleep(1)
  96. # elmt.send_keys(kw)
  97. # elmt.send_keys(Keys.ENTER)
  98. # time.sleep(6)
  99. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
  100. numresults=len(elmts)
  101. print('搜尋結果數量',numresults)
  102. if numresults==0:
  103. driver.quit()
  104. return
  105. idx=1
  106. found=False
  107. test_lst=[]
  108. txt_dict={}
  109. for elmt in elmts:
  110. href=elmt.get_attribute('href')
  111. txt=elmt.text
  112. if len(txt)>10:
  113. if domain is not None:
  114. if domain in href and href not in blacklist:
  115. print('found....')
  116. print('clicked....')
  117. print(href)
  118. print("ranking", idx)
  119. found=True
  120. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  121. # elmt.click()
  122. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  123. table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})
  124. time.sleep(5)
  125. page_height = driver.execute_script("return document.body.scrollHeight")
  126. scroll_step = page_height // 4
  127. current_height = 0
  128. while current_height < page_height:
  129. driver.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
  130. time.sleep(3)
  131. current_height += scroll_step
  132. time.sleep(10)
  133. break
  134. else:
  135. ex=False
  136. for ee in exclude:
  137. if ee in href:
  138. ex=True
  139. if not ex:
  140. test_lst.append(elmt)
  141. txt_dict[elmt]=txt
  142. idx+=1
  143. if exclude is not None:
  144. print('exclude')
  145. elmt=random.choice(test_lst[5:])
  146. print(elmt)
  147. print(txt_dict[elmt])
  148. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  149. elmt.click()
  150. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  151. time.sleep(5)
  152. if not found: #don't waste resources, pick a random link as long as it is ok
  153. pick=''
  154. negativeflag=True
  155. while negativeflag==True:
  156. negativeflag=False
  157. pick = random.choice(elmts)
  158. href = pick.get_attribute('href')
  159. if href in blacklist:
  160. negativeflag=True
  161. print(href)
  162. webdriver.ActionChains(driver).move_to_element(pick).perform()
  163. webdriver.ActionChains(driver).move_to_element(pick).click().perform()
  164. time.sleep(10)
  165. #table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄','client':jsobj['cust']})
  166. except:
  167. print('exception')
  168. traceback.print_exc()
  169. driver.quit()
  170. # sys.exit()
  171. while True:
  172. try:# OLD TABLE NAME: seo.seo_jobs
  173. db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
  174. cursor=db.query("select json from public.seo_jobs_temp where cust='啟翔' and plan='形象SEO' and json like '%陳百欽%' order by random() limit 1")
  175. for c in cursor:
  176. js=json.loads(c['json'])
  177. prefix=js['prefix']
  178. postfix=js['postfix']
  179. domain=js['domain'][0]
  180. positive=js['positive']
  181. rnd=js['rnd']
  182. kw=''
  183. while '陳百欽' not in kw:
  184. kw=''
  185. kw1=random.choice(positive)
  186. kw2=random.choice(rnd)
  187. kw=prefix+" "+kw2+" "+kw1
  188. code='03'
  189. print(kw)
  190. run_once({'domain':domain,'kw':kw, 'cust':'啟翔'})
  191. if driver is not None:
  192. try:
  193. driver.quit()
  194. except:
  195. pass
  196. cursor=None
  197. driver=None
  198. db.close()
  199. print("Completed")
  200. time.sleep(61)
  201. except:
  202. traceback.print_exc()
  203. print("Execution Error")
  204. if driver is not None:
  205. try:
  206. driver.quit()
  207. except:
  208. pass
  209. cursor=None
  210. driver=None
  211. db.close()
  212. time.sleep(20)