# etoday_use2.py

  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium import webdriver
  6. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  7. import time
  8. import urllib
  9. import os
  10. from selenium.webdriver.support.ui import WebDriverWait
  11. from selenium.webdriver.common.by import By
  12. from selenium.webdriver.support import expected_conditions as EC
  13. import dataset
  14. from selenium.webdriver.common.keys import Keys
  15. import json
  16. import random
  17. import time
  18. import redis
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. import requests
  25. import datetime
  26. driver=None
  27. from fake_useragent import UserAgent
  28. ua = UserAgent()
  29. #proxy_enabled=True
  30. # proxy_enabled=False
  31. # # https://youtu.be/cR2M5Khgxvc
  32. # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  33. # glog_table=db['general_log']
  34. def re_get_webdriver():
  35. # global port
  36. global driver
  37. global portnum
  38. # os.system('killall chrome')
  39. result=[]
  40. # if driver is not None:
  41. # print('closing....')
  42. # driver.quit()
  43. # print('quit....')
  44. # driver=None
  45. # os.system()
  46. options = webdriver.ChromeOptions()
  47. options.add_argument("--user-agent=" +ua.random)
  48. options.add_argument("--no-sandbox")
  49. options.add_argument("--headless")
  50. options.add_argument("--incognito")
  51. driver = webdriver.Remote(
  52. command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  53. options=options)
  54. return driver
  55. # try:
  56. # options = webdriver.ChromeOptions()
  57. # options.add_argument("--no-sandbox")
  58. # options.add_argument("--headless")
  59. # options.add_argument("--incognito")
  60. # # if proxy_enabled:
  61. # # options.add_argument('--proxy-server=socks5://172.104.92.245:14900')
  62. # try:
  63. # driver = webdriver.Remote(
  64. # command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  65. # options=options)
  66. # except:
  67. # traceback.print_exc()
  68. # return None
  69. # return driver
  70. # except:
  71. # traceback.print_exc()
  72. # driver=None
  73. # return None
  74. # return driver
  75. def run_once():
  76. global count
  77. global bok
  78. global portnum
  79. # global glog_table
  80. # table=db['nda_log']
  81. # print(jsobj)
  82. # kw=jsobj['kw']
  83. # options = webdriver.ChromeOptions()
  84. # options.add_argument("--no-sandbox")
  85. # options.add_argument("--headless")
  86. # options.add_argument("--incognito")
  87. # driver = webdriver.Remote(
  88. # command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  89. # options=options)
  90. # if driver is not None:
  91. # break
  92. ettoday_url_list = [
  93. 'https://house.ettoday.net/news/1586609',
  94. 'https://house.ettoday.net/news/1586604',
  95. 'https://house.ettoday.net/news/1597942',
  96. 'https://house.ettoday.net/news/1597936',
  97. 'https://house.ettoday.net/news/1675455',
  98. 'https://house.ettoday.net/news/1701065',
  99. 'https://house.ettoday.net/news/1700425',
  100. 'https://house.ettoday.net/news/1492047',
  101. 'https://house.ettoday.net/news/1492167',
  102. 'https://house.ettoday.net/news/1492288',
  103. 'https://house.ettoday.net/news/1492178',
  104. 'https://house.ettoday.net/news/1492229',
  105. 'https://house.ettoday.net/news/1492134',
  106. 'https://house.ettoday.net/news/1492240',
  107. 'https://house.ettoday.net/news/1492161',
  108. 'https://house.ettoday.net/news/1492168',
  109. 'https://house.ettoday.net/news/1492217',
  110. ]
  111. # try:
  112. for i in ettoday_url_list:
  113. try:
  114. driver=re_get_webdriver()
  115. except:
  116. print('driver broken')
  117. portnum=random.randint(6444,6555)
  118. print(portnum)
  119. os.system('docker container stop p8818')
  120. time.sleep(5)
  121. os.system('docker container rm p8818')
  122. time.sleep(5)
  123. os.system('docker run -d -p '+str(portnum)+':4444 --name p8818 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:114.0')
  124. #os.system('docker run -d -p '+str(portnum)+':4444 --name p8818 --dns 168.95.1.1 selenium/standalone-chrome:104.0')
  125. bok +=1
  126. count=0
  127. time.sleep(5)
  128. driver=re_get_webdriver()
  129. time.sleep(3)
  130. try:
  131. driver.get(i)
  132. time.sleep(3)
  133. #html = driver.page_source
  134. #with open('log1.txt', 'a+', encoding='UTF-8') as f:
  135. #f.write(html)
  136. #f.write("="*25)
  137. elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[4]/p[1]/a')
  138. webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  139. time.sleep(3)
  140. webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  141. print("cick!",i)
  142. count+=1
  143. print("count_time:",count,';broken_time:',bok)
  144. # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
  145. time.sleep(random.randint(3,7))
  146. #driver.close()
  147. driver.quit()
  148. except Exception as e:
  149. #print(e)
  150. #with open('log1.txt', 'a+', encoding='UTF-8') as f:
  151. #f.write(e.msg)
  152. #f.write(e.args)
  153. #driver.close()
  154. try:
  155. driver.quit()
  156. except:
  157. print('no have driver')
  158. print("wrong",i,';broken_time:',bok)
  159. time.sleep(5)
  160. # except:
  161. # print('wrong for:',i)
  162. # kw=jsobj['kw']
  163. # if jsobj.get('domain') is None:
  164. # exclude=jsobj['exclude']
  165. # domain=None
  166. # else:
  167. # domain=jsobj['domain']
  168. # exclude=None
  169. # driver.get('https://www.google.com?num=100')
  170. # time.sleep(17)
  171. # while True:
  172. # try:
  173. # print(driver.current_url)
  174. # break
  175. # except:
  176. # traceback.print_exc()
  177. # driver=re_get_webdriver()
  178. # time.sleep(3)
  179. # driver.get('https://www.google.com?num=100')
  180. # time.sleep(3)
  181. # time.sleep(3)
  182. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  183. # time.sleep(1)
  184. # elmt.send_keys(kw)
  185. # elmt.send_keys(Keys.ENTER)
  186. # time.sleep(6)
  187. # elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
  188. # numresults=len(elmts)
  189. # # time.sleep(9999)
  190. # print('搜尋結果數量',numresults)
  191. # if numresults==0:
  192. # print(driver.current_url)
  193. # print(driver.title)
  194. # sys.exit()
  195. # idx=1
  196. # found=False
  197. # test_lst=[]
  198. # for elmt in elmts:
  199. # href=elmt.get_attribute('href')
  200. # txt=elmt.text
  201. # if len(txt)>10:
  202. # if domain is not None:
  203. # for d in domain:
  204. # if d in href:
  205. # print('found....')
  206. # print('clicked....')
  207. # print(href)
  208. # print(txt)
  209. # print("ranking", idx)
  210. # found=True
  211. # webdriver.ActionChains(driver).move_to_element(elmt).perform()
  212. # # elmt.click()
  213. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  214. # table.insert({'kw':kw,'ranking':idx,'title':txt,'url':href,'dt':datetime.datetime.now(),'result':numresults,'client':'64G'})
  215. # time.sleep(6)
  216. # return
  217. # else:
  218. # if exclude not in href:
  219. # test_lst.append(elmt)
  220. # idx+=1
  221. # if exclude is not None:
  222. # print('exclude')
  223. # elmt=random.choice(test_lst)
  224. # print(elmt)
  225. # webdriver.ActionChains(driver).move_to_element(elmt).perform()
  226. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  227. # time.sleep(5)
  228. # if not found:
  229. # table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄'})
  230. # except:
  231. # traceback.print_exc()
  232. # print('exception')
  233. # traceback.print_exc()
  234. # time.sleep(5)
  235. # r=random.randint(0,27)
  236. # r=26
  237. # cursor=db.query('select json from seo_jobs where cust="KNIGHT" and plan="形象SEO" order by rand() limit 1')
  238. # for c in cursor:
  239. # js=json.loads(c['json'])
  240. # prefix=js['prefix']
  241. # postfix=js['postfix']
  242. # domain=js['domain'][0]
  243. # positive=js['positive']
  244. # rnd=js['rnd']
  245. portnum=random.randint(6444,6555)
  246. print(portnum)
  247. os.system('docker container stop p8818')
  248. time.sleep(5)
  249. os.system('docker container rm p8818')
  250. time.sleep(5)
  251. os.system('docker run -d -p '+str(portnum)+':4444 --name p8818 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:114.0')
  252. #os.system('docker run -d -p '+str(portnum)+':4444 --name p8818 --dns 168.95.1.1 selenium/standalone-chrome:114.0')
  253. bok = 0
  254. count=0
  255. time.sleep(5)
  256. while True:
  257. # run_once()
  258. # time.sleep(10)
  259. try:
  260. run_once()
  261. except:
  262. bok+=1
  263. print('broken')
  264. time.sleep(5)
  265. # kw=random.choice(positive)
  266. # kw2=random.choice(rnd)
  267. # count=0
  268. # while True:
  269. # try:
  270. # run_once({'domain':domain,'kw':prefix+" "+kw+" "+kw2})
  271. # count+=1
  272. # except:
  273. # continue
  274. # print('中場休息 次數',count)
  275. # time.sleep(random.randint(120,150))