gen_seo2.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
import json
import os
import random
import sys
import time
import traceback

import dataset
import redis
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Merge conflict resolved by keeping the imports from both sides.
from userAgentRandomizer import userAgents

#import pymysql
#pymysql.install_as_MySQLdb()
  20. driver = None
  21. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  22. headers = {
  23. "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
  24. "Content-Type": "application/x-www-form-urlencoded"
  25. }
  26. def send_msg(kw):
  27. params = {"message": kw}
  28. r = requests.post("https://notify-api.line.me/api/notify", headers=headers, params=params)
  29. def re_get_webdriver():
  30. global port
  31. global driver
  32. result = []
  33. if driver is not None:
  34. print('closing....')
  35. driver.quit()
  36. os.system('killall chrome')
  37. print('quit....')
  38. driver = None
  39. try:
  40. s = Service('/root/driver/chromedriver102')
  41. options = webdriver.ChromeOptions()
  42. options.add_argument("--disable-dev-shm-usage")
  43. options.add_argument("--headless")
  44. options.add_argument('--remote-debugging-port=9222')
  45. options.add_experimental_option("debuggerAddress", '127.0.0.1:9927')
  46. options.add_argument("--incognito")
  47. r = redis.Redis(host='db.ptt.cx', port=6379, db=2, password='choozmo9')
  48. data = r.get('google_proxy')
  49. jstext = data.decode('utf-8')
  50. jsobj = json.loads(jstext)
  51. proxy = random.choice(jsobj)
  52. change_ip_list = ['--proxy-server=%s' % proxy, "--proxy-server=socks5://127.0.0.1:9050",
  53. "--proxy-server=socks5://192.53.174.202:8180"]
  54. change_ip = random.choice(change_ip_list)
  55. options.add_argument(change_ip)
  56. print('使用代理ip', change_ip)
  57. driver = webdriver.Chrome(options=options,service=s)
  58. #driver.delete_all_cookies()
  59. driver.set_window_size(1400, 1000)
  60. except:
  61. traceback.print_exc()
  62. driver = None
  63. return None
  64. def run_once(jsobj):
  65. table = db['rank_detection']
  66. print(jsobj)
  67. global driver
  68. # i=random.randint(0,9)
  69. i = 100
  70. if driver is None:
  71. time.sleep(8)
  72. re_get_webdriver()
  73. if driver is None:
  74. return
  75. try:
  76. kw = jsobj['kw']
  77. if jsobj.get('domain') is None:
  78. exclude = jsobj['exclude']
  79. domain = None
  80. else:
  81. domain = jsobj['domain']
  82. exclude = None
  83. # driver.get('https://www.google.com?num=100')
  84. driver.get('https://www.google.com?num=20')
  85. time.sleep(3)
  86. print(driver.current_url)
  87. elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  88. time.sleep(1)
  89. elmt.send_keys(kw)
  90. elmt.send_keys(Keys.ENTER)
  91. time.sleep(6)
  92. print(driver.current_url)
  93. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
  94. numresults = len(elmts)
  95. print('搜尋結果數量', numresults)
  96. if numresults == 0:
  97. send_msg('stop working...')
  98. sys.exit()
  99. idx = 1
  100. found = False
  101. test_lst = []
  102. txt_dict = {}
  103. for elmt in elmts:
  104. href = elmt.get_attribute('href')
  105. txt = elmt.text
  106. if len(txt) > 10:
  107. if domain is not None:
  108. if domain in href:
  109. print('found....')
  110. print('clicked....')
  111. print(href)
  112. print(txt)
  113. print("ranking", idx)
  114. found = True
  115. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  116. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  117. table.insert({'ranking': idx, 'kw': kw, 'results': numresults, 'url': href, 'title': txt})
  118. time.sleep(6)
  119. break
  120. else:
  121. ex = False
  122. for ee in exclude:
  123. if ee in href:
  124. ex = True
  125. if not ex:
  126. test_lst.append(elmt)
  127. txt_dict[elmt] = txt
  128. idx += 1
  129. if exclude is not None:
  130. print('exclude')
  131. elmt = random.choice(test_lst[5:])
  132. print(elmt)
  133. print(txt_dict[elmt])
  134. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  135. elmt.click()
  136. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  137. time.sleep(5)
  138. if not found:
  139. table.insert({'ranking': -1, 'kw': kw, 'results': numresults, 'url': '', 'title': '未收錄'})
  140. except:
  141. print('exception')
  142. traceback.print_exc()
  143. sys.exit()
  144. driver.quit()
  145. db.close()
  146. # par1=sys.argv[1]
  147. # port=sys.argv[2]
  148. # kws=['職籃','PLG','高雄','鋼鐵人','內幕','中資','股東','姊夫','中國','老賴','香港','無極','原始股東','外資','董事長','股權結構','高雄人','黑人','陳建州','職籃聯盟','球團','球團高層','香港無極','張憲銘','吳同喬','監察人']
  149. kws = ['金融', '人才', '國際接軌', '國際', '投資金童', '投資', '金童', '對沖基金', '香港', '外資', '原始股東', '職籃', 'PLG', '職籃聯盟', '球團', '台灣女婿',
  150. '抹紅', '保守', '港元', '美國', '升息', '戰爭', '通膨', '亞洲', '亞洲投資金童']
  151. positive = ['錢濤','亞洲投資金童', '錢濤 職籃夢']
  152. os.system('docker container restart tiny6')
  153. kw = random.choice(kws)
  154. # time.sleep(9)
  155. # run_once({'domain':'ettoday.net','kw':'錢濤'})
  156. # run_once({'exclude':['moreptt.com','ptt.cc','tnews.cc','mirrormedia.mg','newtalk.tw','pourquoi.tw','match.net.tw','freshweekly.tw','z-upload.facebook.com','udn.com'],'kw':kw+' 錢濤'})
  157. domains = ['yahoo.com', 'ettoday.net', 'tvbs.com.tw', 'sina.com.tw', 'ltn.com.tw', 'owlting.com', 'ctee.com.tw']
  158. domain = random.choice(domains)
  159. p = random.choice(positive)
  160. # run_once({'domain':domain,'kw':p})
  161. run_once({'domain': 'ettoday.net', 'kw': p})
  162. # run_once({'domain':domain,'kw':kw+' 錢濤'})