click_negative.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. # import redis
  2. import time
  3. import traceback
  4. # import json
  5. from selenium import webdriver
  6. from selenium.webdriver.chrome.service import Service
  7. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  8. import time
  9. # import urllib
  10. import os
  11. from selenium.webdriver.support.ui import WebDriverWait
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support import expected_conditions as EC
  14. import dataset
  15. from selenium.webdriver.common.keys import Keys
  16. import json
  17. import random
  18. import time
  19. # import redis
  20. import sys
  21. import codecs
  22. import random
  23. import datetime
  24. import os
  25. import time
  26. import requests
  27. import urllib.parse
  28. import ast
  29. driver = None
  30. db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
  31. # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  32. def re_get_webdriver():
  33. global port
  34. global driver
  35. global portnum
  36. global is_docker
  37. result = []
  38. if driver is not None:
  39. print('closing....')
  40. driver.quit()
  41. print('quit....')
  42. driver = None
  43. try:
  44. s = Service('/Users/mac/Downloads/119/chromedriver')
  45. options = webdriver.ChromeOptions()
  46. options.add_argument('--headless')
  47. # options.add_argument("--user-agent=" + "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19")
  48. options.add_argument("--incognito")
  49. driver = webdriver.Chrome(
  50. options=options, service=s)
  51. driver.delete_all_cookies()
  52. driver.set_window_size(1400, 1000)
  53. except:
  54. traceback.print_exc()
  55. driver = None
  56. return None
  57. return driver
  58. def run_once(jsobj):
  59. table = db['seo_jobs_ranking']
  60. history = db['seo_search_history']
  61. nda_log = db['nda_log']
  62. print(jsobj)
  63. neg_word = ast.literal_eval(jsobj['neg_word'])
  64. print('這裏',neg_word)
  65. i = 100
  66. while True:
  67. driver = re_get_webdriver()
  68. print('re_get_webdriver')
  69. if driver is not None:
  70. break
  71. time.sleep(3)
  72. try:
  73. kw = jsobj['kw']
  74. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  75. driver.get(googleurl)
  76. time.sleep(6)
  77. print(driver.current_url)
  78. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  79. # time.sleep(1)
  80. # elmt.send_keys(kw)
  81. # elmt.send_keys(Keys.ENTER)
  82. # time.sleep(6)
  83. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
  84. numresults = len(elmts)
  85. print('搜尋結果數量', numresults)
  86. if numresults == 0:
  87. print(driver.current_url)
  88. print(driver.title)
  89. sys.exit()
  90. # time.sleep(9999)
  91. idx = 1
  92. found = False
  93. test_lst = []
  94. clickelmt = None
  95. neg_count = 0
  96. neg_total = 0
  97. clickidx = 0
  98. clickhref = ''
  99. clicktitle = ''
  100. for elmt in elmts:
  101. href = elmt.get_attribute('href')
  102. txt = elmt.text
  103. history.insert({'ranking': idx, 'kw': kw, 'results': numresults, 'url': href, 'title': txt,'dt':datetime.datetime.now()})
  104. # if '坑殺' in txt or '侵占' in txt or '判決書' in txt or '強佔' in txt or '掏空' in txt or '送達公告' in txt or '違反勞動'in txt:
  105. # neg_count+=1
  106. # neg_total+=idx
  107. # print('分數',neg_total, neg_count)
  108. for i in neg_word:
  109. print(i)
  110. if i in txt:
  111. neg_count += 1
  112. neg_total += idx
  113. print('分數',neg_total, neg_count)
  114. if domain in href:
  115. print('found....')
  116. print(href)
  117. print(txt)
  118. print("ranking", idx)
  119. found = True
  120. clickelmt = elmt
  121. clickidx = idx
  122. clickhref = href
  123. clicktitle = txt
  124. nda_log.insert({'ranking': idx, 'kw': kw, 'results': numresults, 'url': href, 'title': txt,'dt': datetime.datetime.now(), 'client': jsobj['client']})
  125. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  126. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  127. print('clicked....')
  128. time.sleep(5)
  129. if neg_count == 0:
  130. negstr = '0'
  131. else:
  132. negstr = str(neg_total / neg_count)
  133. print(' negative: ' + negstr)
  134. table.insert({'ranking': clickidx, 'kw': kw, 'results': numresults, 'url': domain, 'title': clicktitle,
  135. 'avg_neg': negstr, 'dt': datetime.datetime.now()})
  136. db.close()
  137. break
  138. else:
  139. nda_log.insert({'ranking': -1, 'kw': kw, 'results': numresults, 'url': href, 'title': '未收錄','dt': datetime.datetime.now(), 'client': jsobj['client']})
  140. idx += 1
  141. db.close()
  142. except:
  143. traceback.print_exc()
  144. print('exception')
  145. traceback.print_exc()
  146. db.close()
  147. driver.quit()
  148. cursor = db.query("select cust, json from public.seo_job where cust='信義房屋' order by random() limit 1")
  149. cursor_n = db.query("select * from public.neg_word where client='信義房屋'")
  150. for c in cursor:
  151. js_string = c['json']
  152. js = json.loads(js_string)
  153. prefix=js['prefix']
  154. postfix=js['postfix']
  155. domain=js['domain'][0]
  156. positive=js['positive']
  157. rnd=js['rnd']
  158. kw1=random.choice(positive)
  159. kw2=random.choice(rnd)
  160. # kw=kw1+" "+prefix+" "+kw2
  161. kw = prefix + " " + kw1
  162. for c in cursor_n:
  163. neg_word = c['neg_word']
  164. while True:
  165. run_once({'domain':domain,'kw':'信義 房屋','client':'信義房屋','neg_word':neg_word})
  166. print('等待下次執行')
  167. time.sleep(80)