hhh_use4.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium import webdriver
  6. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  7. import time
  8. import urllib
  9. import os
  10. from selenium.webdriver.support.ui import WebDriverWait
  11. from selenium.webdriver.common.by import By
  12. from selenium.webdriver.support import expected_conditions as EC
  13. import dataset
  14. from selenium.webdriver.common.keys import Keys
  15. import json
  16. import random
  17. import time
  18. import redis
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. import requests
  25. import datetime
  26. driver=None
  27. import pymysql
  28. pymysql.install_as_MySQLdb()
  29. from fake_useragent import UserAgent
  30. ua = UserAgent()
  31. #proxy_enabled=True
  32. # proxy_enabled=False
  33. # # https://youtu.be/cR2M5Khgxvc
  34. # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  35. # glog_table=db['general_log']
  36. def re_get_webdriver():
  37. # global port
  38. global driver
  39. global portnum
  40. # os.system('killall chrome')
  41. result=[]
  42. # if driver is not None:
  43. # print('closing....')
  44. # driver.quit()
  45. # print('quit....')
  46. # driver=None
  47. # os.system()
  48. options = webdriver.ChromeOptions()
  49. options.add_argument("--user-agent=" +ua.random)
  50. options.add_argument("--no-sandbox")
  51. options.add_argument("--headless")
  52. options.add_argument("--incognito")
  53. driver = webdriver.Remote(
  54. command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  55. options=options)
  56. return driver
  57. # try:
  58. # options = webdriver.ChromeOptions()
  59. # options.add_argument("--no-sandbox")
  60. # options.add_argument("--headless")
  61. # options.add_argument("--incognito")
  62. # # if proxy_enabled:
  63. # # options.add_argument('--proxy-server=socks5://172.104.92.245:14900')
  64. # try:
  65. # driver = webdriver.Remote(
  66. # command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  67. # options=options)
  68. # except:
  69. # traceback.print_exc()
  70. # return None
  71. # return driver
  72. # except:
  73. # traceback.print_exc()
  74. # driver=None
  75. # return None
  76. # return driver
  77. def run_once():
  78. global count
  79. global bok
  80. global portnum
  81. # global glog_table
  82. # table=db['nda_log']
  83. # print(jsobj)
  84. # kw=jsobj['kw']
  85. # options = webdriver.ChromeOptions()
  86. # options.add_argument("--no-sandbox")
  87. # options.add_argument("--headless")
  88. # options.add_argument("--incognito")
  89. # driver = webdriver.Remote(
  90. # command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  91. # options=options)
  92. # if driver is not None:
  93. # break
  94. ettoday_url_list=[]
  95. #db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  96. db = dataset.connect('mysql://choozmo:pAssw0rd@172.105.194.225:3306/seo?charset=utf8mb4')
  97. cursor=db.query('SELECT * FROM columnids order by rand()')
  98. for c in cursor:
  99. #lst.append('https://www.hhh.com.tw/columns/detail/'+str(c['cid'])+'/')
  100. #lst.append('https://m.hhh.com.tw/columns/detail/'+str(c['cid'])+'/')
  101. #ettoday_url_list.append('https://m.hhh.com.tw/HHH_NEW/columns_detail/'+str(c['cid'])+'.php')
  102. ettoday_url_list.append('https://hhh.com.tw/HHH_NEW/columns_detail/'+str(c['cid'])+'.php?utm_source=choozmo&utm_medium=banner&utm_campaign=choozmo')
  103. # try:
  104. for i in ettoday_url_list:
  105. try:
  106. driver=re_get_webdriver()
  107. except:
  108. print('driver broken')
  109. portnum=random.randint(1399,1599)
  110. print(portnum)
  111. os.system('docker container stop p9916')
  112. time.sleep(5)
  113. os.system('docker container rm p9916')
  114. time.sleep(5)
  115. os.system('docker run -d -p '+str(portnum)+':4444 --name p9916 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')
  116. bok +=1
  117. count=0
  118. time.sleep(5)
  119. driver=re_get_webdriver()
  120. time.sleep(3)
  121. try:
  122. driver.get(i)
  123. time.sleep(3)
  124. #html = driver.page_source
  125. #with open('log1.txt', 'a+', encoding='UTF-8') as f:
  126. #f.write(html)
  127. #f.write("="*25)
  128. #elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[3]/p[1]/a')
  129. #webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  130. #time.sleep(0.5)
  131. #webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  132. print("cick!",i)
  133. count+=1
  134. print("count_time:",count,';broken_time:',bok)
  135. # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
  136. time.sleep(random.randint(3,5))
  137. #driver.close()
  138. driver.quit()
  139. except Exception as e:
  140. #print(e)
  141. #with open('log1.txt', 'a+', encoding='UTF-8') as f:
  142. #f.write(e.msg)
  143. #f.write(e.args)
  144. #driver.close()
  145. try:
  146. driver.quit()
  147. except:
  148. print('no have driver')
  149. print("wrong",i,';broken_time:',bok)
  150. time.sleep(5)
  151. # except:
  152. # print('wrong for:',i)
  153. # kw=jsobj['kw']
  154. # if jsobj.get('domain') is None:
  155. # exclude=jsobj['exclude']
  156. # domain=None
  157. # else:
  158. # domain=jsobj['domain']
  159. # exclude=None
  160. # driver.get('https://www.google.com?num=100')
  161. # time.sleep(17)
  162. # while True:
  163. # try:
  164. # print(driver.current_url)
  165. # break
  166. # except:
  167. # traceback.print_exc()
  168. # driver=re_get_webdriver()
  169. # time.sleep(3)
  170. # driver.get('https://www.google.com?num=100')
  171. # time.sleep(3)
  172. # time.sleep(3)
  173. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  174. # time.sleep(1)
  175. # elmt.send_keys(kw)
  176. # elmt.send_keys(Keys.ENTER)
  177. # time.sleep(6)
  178. # elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
  179. # numresults=len(elmts)
  180. # # time.sleep(9999)
  181. # print('搜尋結果數量',numresults)
  182. # if numresults==0:
  183. # print(driver.current_url)
  184. # print(driver.title)
  185. # sys.exit()
  186. # idx=1
  187. # found=False
  188. # test_lst=[]
  189. # for elmt in elmts:
  190. # href=elmt.get_attribute('href')
  191. # txt=elmt.text
  192. # if len(txt)>10:
  193. # if domain is not None:
  194. # for d in domain:
  195. # if d in href:
  196. # print('found....')
  197. # print('clicked....')
  198. # print(href)
  199. # print(txt)
  200. # print("ranking", idx)
  201. # found=True
  202. # webdriver.ActionChains(driver).move_to_element(elmt).perform()
  203. # # elmt.click()
  204. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  205. # table.insert({'kw':kw,'ranking':idx,'title':txt,'url':href,'dt':datetime.datetime.now(),'result':numresults,'client':'64G'})
  206. # time.sleep(6)
  207. # return
  208. # else:
  209. # if exclude not in href:
  210. # test_lst.append(elmt)
  211. # idx+=1
  212. # if exclude is not None:
  213. # print('exclude')
  214. # elmt=random.choice(test_lst)
  215. # print(elmt)
  216. # webdriver.ActionChains(driver).move_to_element(elmt).perform()
  217. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  218. # time.sleep(5)
  219. # if not found:
  220. # table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄'})
  221. # except:
  222. # traceback.print_exc()
  223. # print('exception')
  224. # traceback.print_exc()
  225. # time.sleep(5)
  226. # r=random.randint(0,27)
  227. # r=26
  228. # cursor=db.query('select json from seo_jobs where cust="KNIGHT" and plan="形象SEO" order by rand() limit 1')
  229. # for c in cursor:
  230. # js=json.loads(c['json'])
  231. # prefix=js['prefix']
  232. # postfix=js['postfix']
  233. # domain=js['domain'][0]
  234. # positive=js['positive']
  235. # rnd=js['rnd']
  236. portnum=random.randint(1399,1599)
  237. print(portnum)
  238. os.system('docker container stop p9916')
  239. time.sleep(5)
  240. os.system('docker container rm p9916')
  241. time.sleep(5)
  242. os.system('docker run -d -p '+str(portnum)+':4444 --name p9916 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')
  243. bok = 0
  244. count=0
  245. time.sleep(5)
  246. while True:
  247. # run_once()
  248. # time.sleep(10)
  249. try:
  250. run_once()
  251. except:
  252. bok+=1
  253. print('broken')
  254. time.sleep(5)
  255. # kw=random.choice(positive)
  256. # kw2=random.choice(rnd)
  257. # count=0
  258. # while True:
  259. # try:
  260. # run_once({'domain':domain,'kw':prefix+" "+kw+" "+kw2})
  261. # count+=1
  262. # except:
  263. # continue
  264. # print('中場休息 次數',count)
  265. # time.sleep(random.randint(120,150))