gen_seo.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.support import expected_conditions as EC
  10. import dataset
  11. from selenium import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.common.keys import Keys
  14. from selenium.webdriver.chrome.service import Service
  15. import json
  16. import random
  17. import time
  18. import datetime
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. import requests
  25. import pymysql
  26. import urllib.parse
  27. pymysql.install_as_MySQLdb()
  28. driver=None
  29. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  30. headers = {
  31. "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
  32. "Content-Type": "application/x-www-form-urlencoded"
  33. }
  34. def send_msg(kw):
  35. params = {"message":kw}
  36. r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  37. blacklist = ['https://www.chinatimes.com/realtimenews/20220613003142-260402']
  38. def re_get_webdriver():
  39. global port
  40. global driver
  41. result=[]
  42. if driver is not None:
  43. print('closing....')
  44. driver.quit()
  45. os.system('killall chrome')
  46. print('quit....')
  47. driver=None
  48. try:
  49. options = webdriver.ChromeOptions()
  50. # options.add_argument("user-agent=%s" % user_agent)
  51. options.add_argument('--headless')
  52. options.add_argument("--incognito")
  53. driver = webdriver.Chrome(options=options)
  54. driver.delete_all_cookies()
  55. driver.set_window_size(1400,1000)
  56. except:
  57. traceback.print_exc()
  58. driver=None
  59. return None
  60. def run_once(jsobj):
  61. table=db['nda_log']
  62. print(jsobj)
  63. global driver
  64. # i=random.randint(0,9)
  65. i=100
  66. if driver is None:
  67. time.sleep(8)
  68. options = webdriver.ChromeOptions()
  69. options.add_argument('--headless')
  70. # options.add_argument("--user-agent=" +user_agent)
  71. options.add_argument("--incognito")
  72. options.add_argument('--disable-dev-shm-usage')
  73. driver = webdriver.Chrome(options=options)
  74. if driver is None:
  75. return
  76. try:
  77. kw=jsobj['kw']
  78. if jsobj.get('domain') is None:
  79. exclude=jsobj['exclude']
  80. domain=None
  81. else:
  82. domain=jsobj['domain']
  83. exclude=None
  84. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  85. driver.get(googleurl)
  86. time.sleep(6)
  87. print(driver.current_url)
  88. if 'sorry' in driver.current_url:
  89. print("URL Error: Caught")
  90. return
  91. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  92. # time.sleep(1)
  93. # elmt.send_keys(kw)
  94. # elmt.send_keys(Keys.ENTER)
  95. # time.sleep(6)
  96. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
  97. numresults=len(elmts)
  98. print('搜尋結果數量',numresults)
  99. if numresults==0:
  100. send_msg('stop working...')
  101. sys.exit()
  102. idx=1
  103. found=False
  104. test_lst=[]
  105. txt_dict={}
  106. for elmt in elmts:
  107. href=elmt.get_attribute('href')
  108. txt=elmt.text
  109. if len(txt)>10:
  110. if domain is not None:
  111. if domain in href:
  112. print('found....')
  113. print('clicked....')
  114. print(href)
  115. print("ranking", idx)
  116. found=True
  117. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  118. # elmt.click()
  119. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  120. table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})
  121. time.sleep(5)
  122. page_height = driver.execute_script("return document.body.scrollHeight")
  123. scroll_step = page_height // 4
  124. current_height = 0
  125. while current_height < page_height:
  126. driver.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
  127. time.sleep(3)
  128. current_height += scroll_step
  129. time.sleep(10)
  130. break
  131. else:
  132. ex=False
  133. for ee in exclude:
  134. if ee in href:
  135. ex=True
  136. if not ex:
  137. test_lst.append(elmt)
  138. txt_dict[elmt]=txt
  139. idx+=1
  140. if exclude is not None:
  141. print('exclude')
  142. elmt=random.choice(test_lst[5:])
  143. print(elmt)
  144. print(txt_dict[elmt])
  145. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  146. elmt.click()
  147. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  148. time.sleep(5)
  149. if not found: #don't waste resources, pick a random link as long as it is ok
  150. pick=''
  151. negativeflag=True
  152. while negativeflag==True:
  153. negativeflag=False
  154. pick = random.choice(elmts)
  155. href = pick.get_attribute('href')
  156. if href in blacklist:
  157. negativeflag=True
  158. webdriver.ActionChains(driver).move_to_element(pick).perform()
  159. webdriver.ActionChains(driver).move_to_element(pick).click().perform()
  160. #table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄','client':jsobj['cust']})
  161. except:
  162. print('exception')
  163. traceback.print_exc()
  164. driver.quit()
  165. # sys.exit()
  166. while True:
  167. try:
  168. cursor=db.query('select json from seo.seo_jobs where cust="啟翔" and plan="形象SEO" and json like "%陳百欽%" order by rand() limit 1')
  169. for c in cursor:
  170. js=json.loads(c['json'])
  171. prefix=js['prefix']
  172. postfix=js['postfix']
  173. domain=js['domain'][0]
  174. positive=js['positive']
  175. rnd=js['rnd']
  176. kw=''
  177. while '陳百欽' not in kw:
  178. kw=''
  179. kw1=random.choice(positive)
  180. kw2=random.choice(rnd)
  181. kw=kw1+" "+prefix+" "+kw2
  182. code='03'
  183. run_once({'domain':domain,'kw':kw, 'cust':'啟翔'})
  184. time.sleep(61)
  185. cursor=None
  186. driver=None
  187. except:
  188. traceback.print_exc()
  189. print("Execution Error")
  190. time.sleep(20)