# gen_seo2b.py (10.0 KB)

  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.support import expected_conditions as EC
  10. import dataset
  11. from selenium import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.common.keys import Keys
  14. from selenium.webdriver.chrome.service import Service
  15. import json
  16. import random
  17. import time
  18. import datetime
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. import requests
  25. import pymysql
  26. import urllib.parse
  27. import multiprocessing
  28. pymysql.install_as_MySQLdb()
  29. from userAgentRandomizer import userAgents
  30. driver=None
  31. driverclosed = 0
  32. headers = {
  33. "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
  34. "Content-Type": "application/x-www-form-urlencoded"
  35. }
  36. def send_msg(kw):
  37. params = {"message":kw}
  38. r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  39. blacklist = ['https://www.chinatimes.com/realtimenews/20220613003142-260402','https://ipo168.pixnet.net/blog/post/207626239-%E5%95%9F%E7%BF%94%E8%BC%95%E9%87%91%E5%B1%AC%E7%A7%91%E6%8A%80%E8%82%A1%E7%A5%A8%E6%98%AF%E9%80%99%E6%A8%A3%E7%9A%84%E5%85%AC%E5%8F%B8%21%21%E6%8A%95%E8%B3%87%E5%89%8D%E8%A6%81','https://latest.mediatagtw.com/article/%e5%95%9f%e7%bf%94%e8%bc%95%e9%87%91%e5%b1%ac%e7%a7%91%e6%8a%80%e8%82%a1%e4%bb%bd%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8#gsc.tab=0']
  40. def re_get_webdriver():
  41. global port
  42. global driver
  43. result=[]
  44. if driver is not None:
  45. print('closing....')
  46. driver.quit()
  47. os.system('killall chrome')
  48. print('quit....')
  49. driver=None
  50. try:
  51. options = webdriver.ChromeOptions()
  52. # options.add_argument("user-agent=%s" % user_agent)
  53. options.add_argument('--headless')
  54. options.add_argument("--incognito")
  55. driver = webdriver.Chrome(options=options)
  56. driver.delete_all_cookies()
  57. driver.set_window_size(1400,1000)
  58. except:
  59. traceback.print_exc()
  60. driver=None
  61. return None
  62. def getDriver():
  63. ua=userAgents().random()
  64. options = webdriver.ChromeOptions()
  65. #print(ua)
  66. #options.add_argument("user-agent="+ua)
  67. options.add_argument('--headless')
  68. options.add_argument('--incognito')
  69. options.add_argument('--no-sandbox')
  70. driver=webdriver.Chrome(options=options)
  71. driver.set_window_size(1400,1000)
  72. return driver
  73. def run_once(jsobj,db):
  74. table=db['nda_log']
  75. print(jsobj)
  76. global driver
  77. driverclosed=0
  78. # i=random.randint(0,9)
  79. i=100
  80. driver=getDriver()
  81. try:
  82. kw=jsobj['kw']
  83. if jsobj.get('domain') is None:
  84. exclude=jsobj['exclude']
  85. domain=None
  86. else:
  87. domain=jsobj['domain']
  88. exclude=None
  89. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  90. driver.get(googleurl)
  91. time.sleep(6)
  92. print(driver.current_url)
  93. if 'sorry' in driver.current_url:
  94. print("URL Error: Caught")
  95. driver.quit()
  96. driverclosed=1
  97. return
  98. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  99. # time.sleep(1)
  100. # elmt.send_keys(kw)
  101. # elmt.send_keys(Keys.ENTER)
  102. # time.sleep(6)
  103. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
  104. numresults=len(elmts)
  105. print('搜尋結果數量',numresults)
  106. if numresults==0:
  107. driver.quit()
  108. driverclosed=1
  109. return
  110. idx=1
  111. found=False
  112. test_lst=[]
  113. txt_dict={}
  114. for elmt in elmts:
  115. href=elmt.get_attribute('href')
  116. txt=elmt.text
  117. if len(txt)>10:
  118. if domain is not None:
  119. if domain in href and href not in blacklist:
  120. print('found....')
  121. print('clicked....')
  122. print(href)
  123. print("ranking", idx)
  124. found=True
  125. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  126. # elmt.click()
  127. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  128. table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})
  129. time.sleep(5)
  130. page_height = driver.execute_script("return document.body.scrollHeight")
  131. scroll_step = page_height // 4
  132. current_height = 0
  133. while current_height < page_height:
  134. driver.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
  135. time.sleep(3)
  136. current_height += scroll_step
  137. time.sleep(10)
  138. break
  139. else:
  140. ex=False
  141. for ee in exclude:
  142. if ee in href:
  143. ex=True
  144. if not ex:
  145. test_lst.append(elmt)
  146. txt_dict[elmt]=txt
  147. idx+=1
  148. if exclude is not None:
  149. print('exclude')
  150. elmt=random.choice(test_lst[5:])
  151. print(elmt)
  152. print(txt_dict[elmt])
  153. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  154. elmt.click()
  155. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  156. time.sleep(5)
  157. if not found: #don't waste resources, pick a random link as long as it is ok
  158. attempt=0
  159. pick=''
  160. '''
  161. negativeflag=True
  162. while negativeflag==True:
  163. attempt+=1
  164. negativeflag=False
  165. pick = random.choice(elmts)
  166. href = pick.get_attribute('href')
  167. if href in blacklist:
  168. negativeflag=True
  169. ''''''try:
  170. content = pick.find_element(By.XPATH, "//em[@class='VwiC3b yXK7lf lyLwlc yDYNvb W8l4ac lEBKkf']/").text
  171. print(content)
  172. if "陳百欽" not in content:
  173. Exception
  174. except:
  175. print("Not Found")
  176. negativeFlag = True''''''
  177. if attempt==100:
  178. print("Action Terminated")
  179. break
  180. print(href)
  181. webdriver.ActionChains(driver).move_to_element(pick).perform()
  182. webdriver.ActionChains(driver).move_to_element(pick).click().perform()
  183. time.sleep(10)
  184. #table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄','client':jsobj['cust']})
  185. '''
  186. except:
  187. print('exception')
  188. traceback.print_exc()
  189. driver.quit()
  190. driverclosed=1
  191. # sys.exit()
  192. def exe():
  193. try: # OLD TABLE NAME: seo.seo_jobs
  194. db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
  195. '''cursor=db.query("select json from public.seo_jobs_temp where cust='啟翔' and plan='形象SEO' and json like '%陳百欽%' and (json like '%chinatimes.com%') order by random() limit 1")
  196. for c in cursor:
  197. js=json.loads(c['json'])
  198. prefix=js['prefix']
  199. postfix=js['postfix']
  200. domain=js['domain'][0]
  201. positive=js['positive']
  202. rnd=js['rnd']
  203. kw=''
  204. while '陳百欽' not in kw:
  205. kw=''
  206. kw1=random.choice(positive)
  207. kw2=random.choice(rnd)
  208. kw=prefix+" "+kw2+" "+kw1
  209. code='03'
  210. '''
  211. kwlist = ['創新園區','產業聚落','桃園新屋','研發中心','航太工業','節能減碳','中小企業','高端市場','經營哲學','防疫門','Bellavita','IKEA','馬達','家具','歐美','家具家飾','歐美日','台北101','鋁材帷幕牆','金屬合金','鋁產品製程','台灣鋁業市占率','外銷市場','緬甸設廠','產能需求','EMBA','汽車應用','高端價值','W Hotel','北歐家具','日本家具品牌','醫療產業','循環經濟','歐美日訂單','藍海策略','重圍突破','航太產業','LED','綠色','台灣人才','國際市場競爭力','東協','產業升級','光電屋頂','優勢國際綠能公司','台北小巨蛋','大安森林公園','太陽能發電站','綠能環保科技園區','發電設備']
  212. domain='chinatimes.com'
  213. kw=random.choice(kwlist) + ' 陳百欽'
  214. run_once({'domain':domain,'kw':kw, 'cust':'啟翔'},db)
  215. try:
  216. if driver is not None:
  217. try:
  218. driver.quit()
  219. except:
  220. pass
  221. except:
  222. pass
  223. cursor=None
  224. driver=None
  225. db.close()
  226. print("Completed")
  227. time.sleep(61)
  228. except:
  229. traceback.print_exc()
  230. print("Execution Error")
  231. try:
  232. if driver is not None:
  233. try:
  234. driver.quit()
  235. except:
  236. pass
  237. except:
  238. pass
  239. cursor=None
  240. driver=None
  241. db.close()
  242. time.sleep(20)
  243. def cleanup():
  244. if driverclosed == 0:
  245. try:
  246. driver.quit()
  247. except:
  248. pass
  249. if __name__ == '__main__':
  250. runcount=1
  251. while True:
  252. print("Run "+ str(runcount))
  253. start_time = time.time()
  254. p = multiprocessing.Process(target=exe)
  255. p.start()
  256. p.join(120)
  257. if p.is_alive():
  258. print("Overtime")
  259. p.kill()
  260. cleanup()
  261. p.join()
  262. duration = time.time()-start_time
  263. print("Runs: " + str(runcount) + " | Duration: " + str(duration))
  264. runcount+=1