gen_seo2.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. # import redis
  2. import time
  3. import traceback
  4. # import json
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.support import expected_conditions as EC
  10. import dataset
  11. from selenium import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.common.keys import Keys
  14. from selenium.webdriver.chrome.service import Service
  15. import json
  16. import random
  17. import time
  18. import redis
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. from userAgentRandomizer import userAgents
  25. import requests
  26. import pymysql
  27. pymysql.install_as_MySQLdb()
  28. driver = None
  29. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  30. headers = {
  31. "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
  32. "Content-Type": "application/x-www-form-urlencoded"
  33. }
  34. def send_msg(kw):
  35. params = {"message": kw}
  36. r = requests.post("https://notify-api.line.me/api/notify", headers=headers, params=params)
  37. def re_get_webdriver():
  38. global port
  39. global driver
  40. result = []
  41. if driver is not None:
  42. print('closing....')
  43. driver.quit()
  44. os.system('killall chrome')
  45. print('quit....')
  46. driver = None
  47. try:
  48. s = Service('/root/driver/chromedriver')
  49. options = webdriver.ChromeOptions()
  50. options.add_argument("--no-sandbox")
  51. options.add_argument("--disable-dev-shm-usage")
  52. options.add_argument("--headless")
  53. options.add_argument('--remote-debugging-port=9222')
  54. options.add_experimental_option("debuggerAddress", '127.0.0.1:9927')
  55. options.add_argument("--incognito")
  56. r = redis.Redis(host='db.ptt.cx', port=6379, db=2, password='choozmo9')
  57. data = r.get('google_proxy')
  58. jstext = data.decode('utf-8')
  59. jsobj = json.loads(jstext)
  60. proxy = random.choice(jsobj)
  61. change_ip_list = ['--proxy-server=%s' % proxy, "--proxy-server=socks5://127.0.0.1:9050",
  62. "--proxy-server=socks5://192.53.174.202:8180"]
  63. change_ip = random.choice(change_ip_list)
  64. options.add_argument(change_ip)
  65. print('使用代理ip', change_ip)
  66. driver.delete_all_cookies()
  67. driver = webdriver.Chrome(service=s, options=options)
  68. driver.set_window_size(1400, 1000)
  69. except:
  70. traceback.print_exc()
  71. driver = None
  72. return None
  73. def run_once(jsobj):
  74. table = db['rank_detection']
  75. print(jsobj)
  76. global driver
  77. # i=random.randint(0,9)
  78. i = 100
  79. if driver is None:
  80. time.sleep(8)
  81. re_get_webdriver()
  82. if driver is None:
  83. return
  84. try:
  85. kw = jsobj['kw']
  86. if jsobj.get('domain') is None:
  87. exclude = jsobj['exclude']
  88. domain = None
  89. else:
  90. domain = jsobj['domain']
  91. exclude = None
  92. # driver.get('https://www.google.com?num=100')
  93. driver.get('https://www.google.com?num=20')
  94. time.sleep(3)
  95. print(driver.current_url)
  96. elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  97. time.sleep(1)
  98. elmt.send_keys(kw)
  99. elmt.send_keys(Keys.ENTER)
  100. time.sleep(6)
  101. print(driver.current_url)
  102. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
  103. numresults = len(elmts)
  104. print('搜尋結果數量', numresults)
  105. if numresults == 0:
  106. send_msg('stop working...')
  107. sys.exit()
  108. idx = 1
  109. found = False
  110. test_lst = []
  111. txt_dict = {}
  112. for elmt in elmts:
  113. href = elmt.get_attribute('href')
  114. txt = elmt.text
  115. if len(txt) > 10:
  116. if domain is not None:
  117. if domain in href:
  118. print('found....')
  119. print('clicked....')
  120. print(href)
  121. print(txt)
  122. print("ranking", idx)
  123. found = True
  124. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  125. elmt.click()
  126. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  127. table.insert({'ranking': idx, 'kw': kw, 'results': numresults, 'url': href, 'title': txt})
  128. time.sleep(6)
  129. break
  130. else:
  131. ex = False
  132. for ee in exclude:
  133. if ee in href:
  134. ex = True
  135. if not ex:
  136. test_lst.append(elmt)
  137. txt_dict[elmt] = txt
  138. idx += 1
  139. if exclude is not None:
  140. print('exclude')
  141. elmt = random.choice(test_lst[5:])
  142. print(elmt)
  143. print(txt_dict[elmt])
  144. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  145. elmt.click()
  146. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  147. time.sleep(5)
  148. if not found:
  149. table.insert({'ranking': -1, 'kw': kw, 'results': numresults, 'url': '', 'title': '未收錄'})
  150. except:
  151. print('exception')
  152. traceback.print_exc()
  153. driver.quit()
  154. sys.exit()
  155. db.close()
  156. # par1=sys.argv[1]
  157. # port=sys.argv[2]
  158. # kws=['職籃','PLG','高雄','鋼鐵人','內幕','中資','股東','姊夫','中國','老賴','香港','無極','原始股東','外資','董事長','股權結構','高雄人','黑人','陳建州','職籃聯盟','球團','球團高層','香港無極','張憲銘','吳同喬','監察人']
  159. kws = ['金融', '人才', '國際接軌', '國際', '投資金童', '投資', '金童', '對沖基金', '香港', '外資', '原始股東', '職籃', 'PLG', '職籃聯盟', '球團', '台灣女婿',
  160. '抹紅', '保守', '港元', '美國', '升息', '戰爭', '通膨', '亞洲', '亞洲投資金童']
  161. positive = ['錢濤', '錢濤 升息', '錢濤 職籃夢']
  162. os.system('docker container restart tiny6')
  163. kw = random.choice(kws)
  164. # time.sleep(9)
  165. # run_once({'domain':'ettoday.net','kw':'錢濤'})
  166. # run_once({'exclude':['moreptt.com','ptt.cc','tnews.cc','mirrormedia.mg','newtalk.tw','pourquoi.tw','match.net.tw','freshweekly.tw','z-upload.facebook.com','udn.com'],'kw':kw+' 錢濤'})
  167. domains = ['yahoo.com', 'ettoday.net', 'tvbs.com.tw', 'sina.com.tw', 'ltn.com.tw', 'owlting.com', 'ctee.com.tw']
  168. domain = random.choice(domains)
  169. p = random.choice(positive)
  170. # run_once({'domain':domain,'kw':p})
  171. run_once({'domain': 'ettoday.net', 'kw': p})
  172. # run_once({'domain':domain,'kw':kw+' 錢濤'})