gen_seo.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium import webdriver
  6. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  7. import time
  8. import os
  9. from selenium.webdriver.support.ui import WebDriverWait
  10. from selenium.webdriver.common.by import By
  11. from selenium.webdriver.support import expected_conditions as EC
  12. import dataset
  13. from selenium.webdriver.common.keys import Keys
  14. import json
  15. import random
  16. import time
  17. import redis
  18. import sys
  19. import codecs
  20. import random
  21. import os
  22. import time
  23. from userAgentRandomizer import userAgents
  24. import requests
  25. driver=None
  26. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  27. headers = {
  28. "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
  29. "Content-Type": "application/x-www-form-urlencoded"
  30. }
  31. def send_msg(kw):
  32. params = {"message":kw}
  33. r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  34. def re_get_webdriver():
  35. global port
  36. global driver
  37. result=[]
  38. if driver is not None:
  39. print('closing....')
  40. driver.quit()
  41. os.system('killall chrome')
  42. print('quit....')
  43. driver=None
  44. try:
  45. ua = userAgents()
  46. user_agent = ua.random()
  47. options = webdriver.ChromeOptions()
  48. options.add_argument("--no-sandbox")
  49. options.add_argument("--disable-dev-shm-usage")
  50. options.add_argument("--headless")
  51. options.add_argument('--remote-debugging-port='+port)
  52. # options.add_experimental_option("debuggerAddress", '127.0.0.1:9922')
  53. print(user_agent)
  54. # options.add_argument("--user-agent=" +user_agent)
  55. options.add_argument("--incognito")
  56. driver=None
  57. try:
  58. # driver = webdriver.Chrome(options=options)
  59. if os.name=='nt':
  60. driver = webdriver.Chrome(executable_path='C:/portable/webdriver/chrome102/chromedriver.exe',options=options)
  61. else:
  62. driver = webdriver.Chrome(executable_path='/root/drivers/102/chromedriver',options=options)
  63. except:
  64. # driver.quit()
  65. # os.system('pkill -f ')
  66. # os.system('kill %d' % os.getpid())
  67. traceback.print_exc()
  68. sys.exit()
  69. return
  70. driver.set_window_size(1400,1000)
  71. return
  72. except:
  73. traceback.print_exc()
  74. driver=None
  75. return None
  76. def run_once(jsobj):
  77. table=db['rank_detection']
  78. print(jsobj)
  79. global driver
  80. # i=random.randint(0,9)
  81. i=100
  82. if driver is None:
  83. time.sleep(8)
  84. re_get_webdriver()
  85. if driver is None:
  86. return
  87. try:
  88. kw=jsobj['kw']
  89. if jsobj.get('domain') is None:
  90. exclude=jsobj['exclude']
  91. domain=None
  92. else:
  93. domain=jsobj['domain']
  94. exclude=None
  95. # driver.get('https://www.google.com?num=100')
  96. driver.get('https://www.google.com?num=20')
  97. time.sleep(3)
  98. print(driver.current_url)
  99. elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  100. time.sleep(1)
  101. elmt.send_keys(kw)
  102. elmt.send_keys(Keys.ENTER)
  103. time.sleep(6)
  104. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
  105. numresults=len(elmts)
  106. print('搜尋結果數量',numresults)
  107. if numresults==0:
  108. send_msg('stop working...')
  109. sys.exit()
  110. idx=1
  111. found=False
  112. test_lst=[]
  113. txt_dict={}
  114. for elmt in elmts:
  115. href=elmt.get_attribute('href')
  116. txt=elmt.text
  117. if len(txt)>10:
  118. if domain is not None:
  119. for d in domain:
  120. if d in href:
  121. print('found....')
  122. print('clicked....')
  123. print(href)
  124. print(txt)
  125. print("ranking", idx)
  126. found=True
  127. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  128. elmt.click()
  129. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  130. table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt})
  131. time.sleep(6)
  132. return
  133. else:
  134. ex=False
  135. for ee in exclude:
  136. if ee in href:
  137. ex=True
  138. if not ex:
  139. test_lst.append(elmt)
  140. txt_dict[elmt]=txt
  141. idx+=1
  142. if exclude is not None:
  143. print('exclude')
  144. elmt=random.choice(test_lst[5:])
  145. print(elmt)
  146. print(txt_dict[elmt])
  147. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  148. # elmt.click()
  149. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  150. time.sleep(5)
  151. if not found:
  152. table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄'})
  153. except:
  154. print('exception')
  155. traceback.print_exc()
  156. driver.quit()
  157. sys.exit()
  158. par1=sys.argv[1]
  159. port=sys.argv[2]
  160. #kws=['職籃','PLG','高雄','鋼鐵人','內幕','中資','股東','姊夫','中國','老賴','香港','無極','原始股東','外資','董事長','股權結構','高雄人','黑人','陳建州','職籃聯盟','球團','球團高層','香港無極','張憲銘','吳同喬','監察人']
  161. kws=['金融', '人才', '國際接軌','國際','投資金童','投資','金童','對沖基金','香港','外資','原始股東','職籃','PLG','職籃聯盟','球團','台灣女婿','抹紅','保守','港元','美國','升息','戰爭','通膨','亞洲','亞洲投資金童']
  162. positive=['引新聞','亞洲最強對沖基金','亞洲投資金童','年底前投資須保守','對沖基金創始人錢濤','升息','職籃夢','innews','66474','生活消費 網友熱搜','懷孕','亞洲對沖基金','攤證據','證據','台灣女婿','通膨','喊冤','亞洲投資金童','創始人','年底前','自由財經','美國升息','兼執行官','個人因素','經濟通','LTN','奇摩','金融巨鱷','投資績效','掌門人','亮眼成績','在台生根','孕妻待產','長住台灣','兼首席執行官','無極資本提供','新冠疫情反覆','戰事膠著','華爾街日報','國際接軌','本地券商','台灣金融人才','彭博社','路透社','量化投資團隊','量化投資業務','人工智慧','演算法','大數據分析','解決方案','全球化']
  163. #positive=['錢濤','亞洲最強對沖基金','亞洲投資金童','年底前投資須保守','對沖基金創始人錢濤','錢濤 升息','錢濤 職籃夢','引新聞 錢濤']
  164. os.system('docker container restart '+par1)
  165. kw=random.choice(kws)
  166. #time.sleep(9)
  167. #run_once({'domain':'ettoday.net','kw':'錢濤'})
  168. #run_once({'exclude':['moreptt.com','ptt.cc','tnews.cc','mirrormedia.mg','newtalk.tw','pourquoi.tw','match.net.tw','freshweekly.tw','z-upload.facebook.com','udn.com'],'kw':kw+' 錢濤'})
  169. domain=['yahoo.com','ettoday.net','tvbs.com.tw','sina.com.tw','ltn.com.tw','owlting.com','ctee.com.tw']
  170. #domain=random.choice(domains)
  171. #p=random.choice(positive)
  172. #run_once({'domain':domain,'kw':p})
  173. #pairs=[{'domain':'innews.com.tw','kw':'錢濤 引新聞'},{'domain':'innews.com.tw','kw':'innews 錢濤'},{'domain':'innews.com.tw','kw':'錢濤 66474'},{'domain':'innews.com.tw','kw':'錢濤 生活消費 網友熱搜'},{'domain':'yahoo.com','kw':'錢濤 懷孕'},{'domain':'yahoo.com','kw':'亞洲對沖基金 錢濤'},
  174. #{'domain':'ltn.com.tw','kw':'科技人才 錢濤'}]
  175. #p=random.choice(pairs)
  176. p=random.choice(positive)
  177. run_once({'domain':domain,'kw':p+" 錢濤"})
  178. #run_once({'domain':domain,'kw':kw+' 錢濤'})