# click_choozmo.py
  # Module setup for the keyword-click script: imports, launch logging, and
  # (in Docker mode) a fresh Selenium container.
  #
  # Imports that the original file had commented out:
  # import redis
  # import urllib
  import codecs
  import datetime
  import json
  import os
  import platform
  import random
  import socket
  import sys
  import time
  import traceback

  import dataset
  import requests
  from selenium import webdriver
  from selenium.webdriver.common.by import By
  from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  from selenium.webdriver.common.keys import Keys
  from selenium.webdriver.support import expected_conditions as EC
  from selenium.webdriver.support.ui import WebDriverWait

  # Shared Selenium session; created and replaced by re_get_webdriver().
  driver = None
  # Name of the Selenium container that is stopped, removed and re-created below.
  dockername = 'p4444'
  # True: drive Chrome inside the Docker container; False: start a local Chrome.
  is_docker = True
  # is_docker = False

  # NOTE(review): the database credentials are embedded in this URL; consider
  # loading them from the environment or a secrets store instead.
  db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
  table = db['prog_launch']
  unamestr = str(platform.uname())
  # Log this launch: host description, script file name and current time.
  table.insert({'uname': unamestr, 'progname': os.path.basename(__file__), 'dt': datetime.datetime.now()})
  # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')

  if is_docker:
      # Publish the container's port 4444 on a random host port.
      portnum = random.randint(4444, 4555)
      print(portnum)
      # Best effort: stop/rm fail harmlessly when no such container exists,
      # so their exit statuses are deliberately ignored.
      os.system('docker container stop ' + dockername)
      time.sleep(0.5)
      os.system('docker container rm ' + dockername)
      time.sleep(0.5)
      run_status = os.system('docker run -d -p ' + str(portnum) + ':4444 --shm-size=2g --name ' + dockername + ' --dns 168.95.1.1 selenium/standalone-chrome:103.0')
      if run_status != 0:
          # Surface the failure here; the script still continues as before.
          print('docker run failed with status', run_status)
      # Give the container time to start before the first connection attempt.
      time.sleep(7)
  def re_get_webdriver():
      """Close the current browser session, if any, and start a new one.

      Reads the module globals ``is_docker`` and ``portnum`` and rebinds the
      module global ``driver``.  In Docker mode the session is a headless
      Chrome reached through the Selenium server at
      ``http://127.0.0.1:<portnum>/wd/hub``; otherwise a local Chrome is
      started.

      Returns:
          The new WebDriver (also stored in the global ``driver``), or
          ``None`` when the session could not be created.
      """
      global driver

      if driver is not None:
          print('closing....')
          try:
              driver.quit()
          except Exception:
              # A dead session must not block the creation of a new one;
              # without this guard the stale driver would stay in the global
              # and every later call would fail at the same place.
              traceback.print_exc()
          print('quit....')
          driver = None

      try:
          options = webdriver.ChromeOptions()
          options.add_argument("--no-sandbox")
          if is_docker:
              options.add_argument("--headless")
          options.add_argument("--incognito")
          # options.add_argument('--proxy-server=socks5://172.104.92.245:14900')
          # Mobile emulation (360x640 Nexus 5 profile) was already disabled in
          # the original; its unused settings dict has been dropped.

          if is_docker:
              try:
                  driver = webdriver.Remote(
                      command_executor='http://127.0.0.1:' + str(portnum) + '/wd/hub',
                      options=options)
              except Exception:
                  traceback.print_exc()
                  # NOTE(review): this blocks for roughly 2.8 hours before the
                  # failure is reported; kept as in the original, where it
                  # looks like a deliberate hold -- confirm.
                  time.sleep(9999)
                  return None
              return driver

          try:
              driver = webdriver.Chrome(options=options)
          except Exception:
              traceback.print_exc()
              return None
          return driver
      except Exception:
          traceback.print_exc()
          driver = None
          return None
  # Title fragments that mark a search result as negative coverage.
  _NEGATIVE_TERMS = ('坑殺', '侵占', '判決書', '強佔', '掏空', '送達公告', '違反勞動')
  # Google home page, requesting up to 100 results per page.
  _GOOGLE_URL = 'https://www.google.com?num=100'


  def _acquire_driver():
      """Call re_get_webdriver() every 3 seconds until it returns a session."""
      while True:
          session = re_get_webdriver()
          print('re_get_webdriver')
          if session is not None:
              return session
          time.sleep(3)


  def _click(session, element):
      """Move the pointer onto ``element`` and then click it."""
      webdriver.ActionChains(session).move_to_element(element).perform()
      webdriver.ActionChains(session).move_to_element(element).click().perform()


  def _is_negative(title):
      """Return True when ``title`` contains any of the negative terms."""
      return any(term in title for term in _NEGATIVE_TERMS)


  def run_once(jsobj):
      """Search Google for one keyword, log the results and click one of them.

      Every result link is written to the ``seo_search_history`` table.  Then
      one of two modes applies:

      * ``jsobj['domain']`` is set: the result whose URL contains one of the
        given fragments is clicked and its rank is written to
        ``seo_jobs_ranking``; when nothing matches, a rank of -1 is written.
      * ``jsobj['domain']`` is missing or None: a random result whose URL does
        not contain ``jsobj['exclude']`` is clicked.

      Only results whose title is longer than 10 characters are considered
      for clicking.

      Args:
          jsobj: dict with key ``'kw'`` (search text) and either ``'domain'``
              (list of URL fragments) or ``'exclude'`` (URL fragment).

      Returns:
          None.  On success the browser session is left open.  On any error
          the traceback is printed, the session is closed and the function
          returns after a 5 second pause.

      Raises:
          KeyError: if ``jsobj`` has no ``'kw'`` entry (raised before a
              browser session is requested).
      """
      ranking_table = db['seo_jobs_ranking']
      history = db['seo_search_history']
      print(jsobj)
      kw = jsobj['kw']

      driver = _acquire_driver()
      try:
          domain = jsobj.get('domain')
          exclude = jsobj['exclude'] if domain is None else None

          driver.get(_GOOGLE_URL)
          time.sleep(1)
          # Reading current_url fails when the session has died; in that case
          # start a new session and load the page again.
          while True:
              try:
                  print(driver.current_url)
                  break
              except Exception:
                  traceback.print_exc()
                  driver = _acquire_driver()
                  time.sleep(3)
                  driver.get(_GOOGLE_URL)
                  time.sleep(3)
          time.sleep(3)

          search_box = driver.find_element(By.XPATH, "//textarea[@name='q']")
          time.sleep(1)
          search_box.send_keys(kw)
          search_box.send_keys(Keys.ENTER)
          time.sleep(3)

          links = driver.find_elements(By.XPATH, "//a[@jsname='UWckNb']")
          numresults = len(links)
          print('搜尋結果數量', numresults)
          if numresults == 0:
              print(driver.current_url)
              print(driver.title)
              # The original called sys.exit() here, but its bare `except:`
              # swallowed SystemExit, so the run really ended through the
              # failure path below.  Raising an ordinary error keeps that
              # observable behaviour and makes it explicit.
              raise RuntimeError('search returned no results')

          candidates = []   # exclude mode: results eligible for a random click
          target = None     # domain mode: (element, rank, url, title) to click
          neg_count = 0
          neg_total = 0
          for rank, link in enumerate(links, start=1):
              href = link.get_attribute('href')
              title = link.text
              history.insert({'ranking': rank, 'kw': kw, 'results': numresults, 'url': href, 'title': title})
              if _is_negative(title):
                  neg_count += 1
                  neg_total += rank
              if len(title) <= 10:
                  continue
              # A link without an href simply never matches.
              url_text = href or ''
              if domain is not None:
                  # The caller's list is no longer shuffled in place: the
                  # order of the fragments never influenced the outcome.
                  if any(fragment in url_text for fragment in domain):
                      print('found....')
                      print('clicked....')
                      print(href)
                      print(title)
                      print("ranking", rank)
                      # NOTE(review): a later match replaces an earlier one,
                      # so the lowest-ranked match is the one clicked and
                      # recorded -- confirm this is intended.
                      target = (link, rank, href, title)
              elif exclude not in url_text:
                  candidates.append(link)

          if exclude is not None:
              print('exclude')
              if candidates:
                  chosen = random.choice(candidates)
                  print(chosen)
                  _click(driver, chosen)
                  time.sleep(5)
              else:
                  # random.choice() raises IndexError on an empty list.
                  print('no eligible result to click')

          # Average rank of the negative results, as text ('0' when none).
          negstr = '0' if neg_count == 0 else str(neg_total / neg_count)
          print(' negative: ' + negstr)

          if target is None:
              # NOTE(review): exclude mode never sets a target, so it always
              # writes this "not indexed" row -- confirm this is intended.
              ranking_table.insert({'ranking': -1, 'kw': kw, 'results': numresults, 'url': '', 'title': '未收錄'})
          else:
              link, rank, href, title = target
              _click(driver, link)
              print('clicked...')
              ranking_table.insert({'ranking': rank, 'kw': kw, 'results': numresults, 'url': href, 'title': title, 'avg_neg': negstr})
              time.sleep(6)
              print('sleep 6')
      except Exception:
          traceback.print_exc()
          print('exception')
          # Closing a broken session can fail too; that must not escape and
          # hide the error printed above.
          try:
              driver.quit()
          except Exception:
              traceback.print_exc()
          time.sleep(5)
  # Mode switch: the value 6 enables the hard-coded manual job further down;
  # any other value runs the job drawn from the database.
  # r = random.randint(0, 7)
  # r = 6
  r = 999

  # JNOTE: keyword click
  related = ''

  # Draw one random job from the job table.
  cursor = db.query('SELECT cust,plan,prefix,domain,kw,positive FROM public.seo_jobs order by random() limit 1')
  job = next(iter(cursor), None)
  if job is None:
      # Without a row none of the job variables below would exist and the
      # script would stop with a NameError; fail with a clear message instead.
      sys.exit('no rows found in public.seo_jobs')

  cust = job['cust']
  kw = job['kw']
  plan = job['plan']
  # A NULL prefix would break the string concatenation below.
  prefix = job['prefix'] or ''
  # SECURITY NOTE(review): eval() executes whatever text is stored in these
  # two columns.  If they only ever hold list literals, ast.literal_eval would
  # parse them without executing code -- confirm and switch.
  domain = eval(job['domain'])
  positive = eval(job['positive'])

  if r == 6:
      # Manual override of the database job.  `positive` still comes from the
      # database row: every hand-written `positive` override the original kept
      # here was commented out, as were two alternative `kw` values.
      cust = 'CHOOZMO'
      # plan = '補lost'
      plan = '文章'
      prefix = ''
      postfix = ''
      domain = ['contact_94111/%E9%99%B3%E7%99%BE%E6%AC%BD-401a08c4f3f5']
      kw = '陳百欽 森林資源'

  # Other keywords left as notes in the original: '朱英凱', '琢隱設計'.
  # A disabled curl call that tested a SOCKS5 proxy also sat here.

  # Search text: "<prefix> <keyword> <random positive suffix>".
  newkw = prefix + " " + kw + ' ' + random.choice(positive)
  print(newkw)
  # newkw = kw
  run_once({'domain': domain, 'kw': newkw})