gen_seo.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.support import expected_conditions as EC
  10. import dataset
  11. from selenium import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.common.keys import Keys
  14. from selenium.webdriver.chrome.service import Service
  15. import json
  16. import random
  17. import time
  18. import datetime
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. import requests
  25. import pymysql
  26. import urllib.parse
  27. pymysql.install_as_MySQLdb()
  28. driver=None
  29. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  30. headers = {
  31. "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
  32. "Content-Type": "application/x-www-form-urlencoded"
  33. }
  34. def send_msg(kw):
  35. params = {"message":kw}
  36. r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  37. blacklist = ['https://www.chinatimes.com/realtimenews/20220613003142-260402']
  38. def re_get_webdriver():
  39. global port
  40. global driver
  41. result=[]
  42. if driver is not None:
  43. print('closing....')
  44. driver.quit()
  45. os.system('killall chrome')
  46. print('quit....')
  47. driver=None
  48. try:
  49. options = webdriver.ChromeOptions()
  50. # options.add_argument("user-agent=%s" % user_agent)
  51. options.add_argument('--headless')
  52. options.add_argument("--incognito")
  53. driver = webdriver.Chrome(options=options)
  54. driver.delete_all_cookies()
  55. driver.set_window_size(1400,1000)
  56. except:
  57. traceback.print_exc()
  58. driver=None
  59. return None
  60. def run_once(jsobj):
  61. table=db['nda_log']
  62. print(jsobj)
  63. global driver
  64. # i=random.randint(0,9)
  65. i=100
  66. if driver is None:
  67. time.sleep(8)
  68. re_get_webdriver()
  69. if driver is None:
  70. return
  71. try:
  72. kw=jsobj['kw']
  73. if jsobj.get('domain') is None:
  74. exclude=jsobj['exclude']
  75. domain=None
  76. else:
  77. domain=jsobj['domain']
  78. exclude=None
  79. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  80. driver.get(googleurl)
  81. time.sleep(6)
  82. print(driver.current_url)
  83. if 'sorry' in driver.current_url:
  84. print("URL Error: Caught")
  85. return
  86. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  87. # time.sleep(1)
  88. # elmt.send_keys(kw)
  89. # elmt.send_keys(Keys.ENTER)
  90. # time.sleep(6)
  91. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
  92. numresults=len(elmts)
  93. print('搜尋結果數量',numresults)
  94. if numresults==0:
  95. send_msg('stop working...')
  96. sys.exit()
  97. idx=1
  98. found=False
  99. test_lst=[]
  100. txt_dict={}
  101. for elmt in elmts:
  102. href=elmt.get_attribute('href')
  103. txt=elmt.text
  104. if len(txt)>10:
  105. if domain is not None:
  106. if domain in href:
  107. print('found....')
  108. print('clicked....')
  109. print(href)
  110. print("ranking", idx)
  111. found=True
  112. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  113. # elmt.click()
  114. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  115. table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})
  116. time.sleep(5)
  117. page_height = driver.execute_script("return document.body.scrollHeight")
  118. scroll_step = page_height // 4
  119. current_height = 0
  120. while current_height < page_height:
  121. driver.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
  122. time.sleep(3)
  123. current_height += scroll_step
  124. time.sleep(10)
  125. break
  126. else:
  127. ex=False
  128. for ee in exclude:
  129. if ee in href:
  130. ex=True
  131. if not ex:
  132. test_lst.append(elmt)
  133. txt_dict[elmt]=txt
  134. idx+=1
  135. if exclude is not None:
  136. print('exclude')
  137. elmt=random.choice(test_lst[5:])
  138. print(elmt)
  139. print(txt_dict[elmt])
  140. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  141. elmt.click()
  142. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  143. time.sleep(5)
  144. if not found: #don't waste resources, pick a random link as long as it is ok
  145. pick=''
  146. negativeflag=True
  147. while negativeflag==True:
  148. negativeflag=False
  149. pick = random.choice(elmts)
  150. href = pick.get_attribute('href')
  151. if href in blacklist:
  152. negativeflag=True
  153. webdriver.ActionChains(driver).move_to_element(pick).perform()
  154. webdriver.ActionChains(driver).move_to_element(pick).click().perform()
  155. #table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄','client':jsobj['cust']})
  156. except:
  157. print('exception')
  158. traceback.print_exc()
  159. driver.quit()
  160. # sys.exit()
  161. while True:
  162. try:
  163. cursor=db.query('select json from seo.seo_jobs where cust="啟翔" and plan="形象SEO" and json like "%陳百欽%" order by rand() limit 1')
  164. for c in cursor:
  165. js=json.loads(c['json'])
  166. prefix=js['prefix']
  167. postfix=js['postfix']
  168. domain=js['domain'][0]
  169. positive=js['positive']
  170. rnd=js['rnd']
  171. kw=''
  172. while '陳百欽' not in kw:
  173. kw=''
  174. kw1=random.choice(positive)
  175. kw2=random.choice(rnd)
  176. kw=kw1+" "+prefix+" "+kw2
  177. code='03'
  178. run_once({'domain':domain,'kw':kw, 'cust':'啟翔'})
  179. time.sleep(61)
  180. cursor=None
  181. driver=None
  182. except:
  183. traceback.print_exc()
  184. print("Execution Error")
  185. time.sleep(20)