click_negative.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. # import redis
  2. import time
  3. import traceback
  4. # import json
  5. from selenium import webdriver
  6. from selenium.webdriver.chrome.service import Service
  7. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  8. import time
  9. # import urllib
  10. import os
  11. from selenium.webdriver.support.ui import WebDriverWait
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support import expected_conditions as EC
  14. import dataset
  15. from selenium.webdriver.common.keys import Keys
  16. import json
  17. import random
  18. import time
  19. # import redis
  20. import sys
  21. import codecs
  22. import random
  23. import datetime
  24. import os
  25. import time
  26. import requests
  27. import urllib.parse
  28. import ast
  29. driver = None
  30. db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
  31. # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  32. # headers = {
  33. # "Authorization": "Bearer " + "ygIurYIfWgHj6HrQjOnVGh4rjXajZkeHuBYe12v8nTN",
  34. # "Content-Type": "application/x-www-form-urlencoded"
  35. headers = {
  36. "Authorization": "Bearer " + "OZDcq7sVKwr3F6YNLtBF3LuIgpa4Ql9eAnBWeD7sHTJ",
  37. "Content-Type": "application/x-www-form-urlencoded"
  38. }
  39. def send_msg(kw):
  40. params = {"message": kw}
  41. print('通知結果', params)
  42. r = requests.post("https://notify-api.line.me/api/notify", headers=headers, params=params)
  43. def re_get_webdriver():
  44. global port
  45. global driver
  46. global portnum
  47. global is_docker
  48. result = []
  49. if driver is not None:
  50. print('closing....')
  51. driver.quit()
  52. print('quit....')
  53. driver = None
  54. try:
  55. s = Service('C:\/Users\/s1301\/Downloads\/130\/chromedriver-win32\/chromedriver.exe')
  56. options = webdriver.ChromeOptions()
  57. options.add_argument('--headless')
  58. # options.add_argument("--user-agent=" + "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19")
  59. options.add_argument("--incognito")
  60. driver = webdriver.Chrome(options=options, service=s)
  61. driver.delete_all_cookies()
  62. driver.set_window_size(1400, 1000)
  63. except:
  64. traceback.print_exc()
  65. driver = None
  66. return None
  67. return driver
  68. def run_once(jsobj):
  69. table = db['seo_jobs_ranking']
  70. history = db['seo_search_history']
  71. nda_log = db['nda_log']
  72. delete_kw = db['delete_kw']
  73. seo = db['seo']
  74. print(jsobj)
  75. neg_word = ast.literal_eval(jsobj['neg_word'])
  76. print('這裏',neg_word)
  77. i = 100
  78. while True:
  79. driver = re_get_webdriver()
  80. print('re_get_webdriver')
  81. if driver is not None:
  82. break
  83. time.sleep(3)
  84. try:
  85. kw = jsobj['kw']
  86. domain = jsobj['domain']
  87. # googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  88. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw&tbm=vid&tbs=vd:m'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  89. driver.get(googleurl)
  90. time.sleep(6)
  91. print(driver.current_url)
  92. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  93. # time.sleep(1)
  94. # elmt.send_keys(kw)
  95. # elmt.send_keys(Keys.ENTER)
  96. # time.sleep(6)
  97. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
  98. # elmts = driver.find_elements(By.XPATH, "//div[@class='xe8e1b']//a")
  99. numresults = len(elmts)
  100. print('搜尋結果數量', numresults)
  101. if numresults == 0:
  102. print(driver.current_url)
  103. print(driver.title)
  104. sys.exit()
  105. # time.sleep(9999)
  106. idx = 1
  107. found = 0
  108. test_lst = []
  109. clickelmt = None
  110. neg_count = 0
  111. neg_total = 0
  112. clickidx = 0
  113. clickhref = ''
  114. clicktitle = ''
  115. for elmt in elmts:
  116. href = elmt.get_attribute('href')
  117. # print(href)
  118. txt = elmt.text
  119. history.insert({'ranking': idx, 'kw': kw, 'results': numresults, 'url': href, 'title': txt,'dt':datetime.datetime.now()})
  120. # if '坑殺' in txt or '侵占' in txt or '判決書' in txt or '強佔' in txt or '掏空' in txt or '送達公告' in txt or '違反勞動'in txt:
  121. # neg_count+=1
  122. # neg_total+=idx
  123. # print('分數',neg_total, neg_count)
  124. for i in neg_word:
  125. if i in txt:
  126. neg_count += 1
  127. neg_total += idx
  128. break
  129. # print('分數',neg_total, neg_count)
  130. if domain in href:
  131. print('found....')
  132. print(href)
  133. print(txt)
  134. print("ranking", idx)
  135. found = True
  136. clickelmt = elmt
  137. clickidx = idx
  138. clickhref = href
  139. clicktitle = txt
  140. found = 1
  141. else:
  142. if found == 1:
  143. not_found = 0
  144. else:
  145. not_found = 1
  146. idx += 1
  147. if not_found == 1:
  148. print('未收錄')
  149. nda_log.insert({'ranking': -1, 'kw': kw, 'results': numresults, 'url': href, 'title': '未收錄','dt': datetime.datetime.now(), 'client': jsobj['client']})
  150. seo.delete(kw=kw, domain=domain)
  151. delete_kw.insert({'kw':kw,'domain':domain,'cust':jsobj['client'], 'dt':datetime.datetime.now()})
  152. msg_1 = '未收錄:'+kw+' '+domain
  153. msg_2 = jsobj['delete_kw_count']
  154. send_msg(msg_1 + "\n" + str(msg_2))
  155. else:
  156. nda_log.insert({'ranking': clickidx, 'kw': kw, 'results': numresults, 'url': clickhref, 'title': clicktitle,'dt': datetime.datetime.now(), 'client': jsobj['client'], 'type':''})
  157. webdriver.ActionChains(driver).move_to_element(clickelmt).perform()
  158. webdriver.ActionChains(driver).move_to_element(clickelmt).click().perform()
  159. print('clicked....')
  160. time.sleep(5)
  161. if neg_count == 0:
  162. negstr = 0
  163. else:
  164. negstr = neg_total / neg_count
  165. print(negstr)
  166. if negstr > 0 and negstr < 21:
  167. print('警示字')
  168. msg_1 = '警示字:' + kw
  169. msg_2 = jsobj['delete_kw_count']
  170. send_msg(msg_1 + "\n" + str(msg_2))
  171. seo.delete(kw=kw, domain=domain)
  172. delete_kw.insert({'kw': kw, 'domain': domain, 'cust': jsobj['client'],'dt':datetime.datetime.now()})
  173. table.insert(
  174. {'ranking': clickidx, 'kw': kw, 'results': numresults, 'url': domain, 'title': clicktitle, 'avg_neg': negstr,
  175. 'dt': datetime.datetime.now()})
  176. except:
  177. traceback.print_exc()
  178. print('exception')
  179. traceback.print_exc()
  180. # db.close()
  181. driver.quit()
  182. while True:
  183. # cursor = db.query("select * from public.seo where cust='百威' and type is NULL order by random() limit 1")
  184. cursor = db.query("select * from public.seo where cust='驊揚' order by random() limit 1")
  185. # cursor = db.query("select * from public.seo where id=627")
  186. cursor_n = db.query("select * from public.neg_word where client='驊揚'")
  187. cursor_d = db.query("select * from public.delete_kw where now()::date = dt::date")
  188. for c in cursor:
  189. kw = c['kw']
  190. domain = c['domain']
  191. d = {'驊揚':0,'百威':0}
  192. for c in cursor_d:
  193. if c['cust'] in d.keys():
  194. d[c['cust']]+=1
  195. print(d)
  196. for c in cursor_n:
  197. neg_word = c['neg_word']
  198. run_once({'domain':domain,'kw':kw,'client':'驊揚','neg_word':neg_word,'delete_kw_count':d})
  199. # db.close()
  200. print('等待下次執行')
  201. time.sleep(80)