etoday_use4.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium import webdriver
  6. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  7. import time
  8. import urllib
  9. import os
  10. from selenium.webdriver.support.ui import WebDriverWait
  11. from selenium.webdriver.common.by import By
  12. from selenium.webdriver.support import expected_conditions as EC
  13. import dataset
  14. from selenium.webdriver.common.keys import Keys
  15. import json
  16. import random
  17. import time
  18. import redis
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. import requests
  25. import datetime
  26. driver=None
  27. from fake_useragent import UserAgent
  28. ua = UserAgent()
  29. #proxy_enabled=True
  30. # proxy_enabled=False
  31. # # https://youtu.be/cR2M5Khgxvc
  32. # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  33. # glog_table=db['general_log']
  34. def re_get_webdriver():
  35. # global port
  36. global driver
  37. global portnum
  38. # os.system('killall chrome')
  39. result=[]
  40. # if driver is not None:
  41. # print('closing....')
  42. # driver.quit()
  43. # print('quit....')
  44. # driver=None
  45. # os.system()
  46. options = webdriver.ChromeOptions()
  47. options.add_argument("--user-agent=" +ua.random)
  48. options.add_argument("--no-sandbox")
  49. options.add_argument("--headless")
  50. options.add_argument("--incognito")
  51. driver = webdriver.Remote(
  52. command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  53. options=options)
  54. return driver
  55. # try:
  56. # options = webdriver.ChromeOptions()
  57. # options.add_argument("--no-sandbox")
  58. # options.add_argument("--headless")
  59. # options.add_argument("--incognito")
  60. # # if proxy_enabled:
  61. # # options.add_argument('--proxy-server=socks5://172.104.92.245:14900')
  62. # try:
  63. # driver = webdriver.Remote(
  64. # command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  65. # options=options)
  66. # except:
  67. # traceback.print_exc()
  68. # return None
  69. # return driver
  70. # except:
  71. # traceback.print_exc()
  72. # driver=None
  73. # return None
  74. # return driver
  75. def run_once():
  76. global count
  77. global portnum
  78. global bok
  79. # global glog_table
  80. # table=db['nda_log']
  81. # print(jsobj)
  82. # kw=jsobj['kw']
  83. # options = webdriver.ChromeOptions()
  84. # options.add_argument("--no-sandbox")
  85. # options.add_argument("--headless")
  86. # options.add_argument("--incognito")
  87. # driver = webdriver.Remote(
  88. # command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  89. # options=options)
  90. # if driver is not None:
  91. # break
  92. ettoday_url_list = [
  93. 'https://house.ettoday.net/news/1586609',
  94. 'https://house.ettoday.net/news/1586604',
  95. 'https://house.ettoday.net/news/1597942',
  96. 'https://house.ettoday.net/news/1597936',
  97. 'https://house.ettoday.net/news/1675455',
  98. 'https://house.ettoday.net/news/1701065',
  99. 'https://house.ettoday.net/news/1700425',
  100. 'https://house.ettoday.net/news/1492047',
  101. 'https://house.ettoday.net/news/1492167',
  102. 'https://house.ettoday.net/news/1492288',
  103. 'https://house.ettoday.net/news/1492178',
  104. 'https://house.ettoday.net/news/1492229',
  105. 'https://house.ettoday.net/news/1492134',
  106. 'https://house.ettoday.net/news/1492240',
  107. 'https://house.ettoday.net/news/1492161',
  108. 'https://house.ettoday.net/news/1492168',
  109. 'https://house.ettoday.net/news/1492217']
  110. # try:
  111. for i in ettoday_url_list:
  112. try:
  113. driver=re_get_webdriver()
  114. except:
  115. portnum=random.randint(7555,7666)
  116. print(portnum)
  117. os.system('docker container stop p8816')
  118. time.sleep(5)
  119. os.system('docker container rm p8816')
  120. time.sleep(5)
  121. os.system('docker run -d -p '+str(portnum)+':4444 --name p8816 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:111.0')
  122. bok += 1
  123. count=0
  124. time.sleep(5)
  125. driver=re_get_webdriver()
  126. time.sleep(3)
  127. try:
  128. driver.get(i)
  129. time.sleep(3)
  130. elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[4]/p[1]/a')
  131. webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  132. time.sleep(3)
  133. webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  134. print("cick!",i)
  135. count+=1
  136. print("count_time:",count,';borken_time:',bok)
  137. # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
  138. time.sleep(random.randint(3,7))
  139. #driver.close()
  140. driver.quit()
  141. except:
  142. #driver.close()
  143. try:
  144. driver.quit()
  145. except:
  146. print('no have driver')
  147. print("wrong",i,';borken_time:',bok)
  148. time.sleep(5)
  149. # except:
  150. # print('wrong for:',i)
  151. # kw=jsobj['kw']
  152. # if jsobj.get('domain') is None:
  153. # exclude=jsobj['exclude']
  154. # domain=None
  155. # else:
  156. # domain=jsobj['domain']
  157. # exclude=None
  158. # driver.get('https://www.google.com?num=100')
  159. # time.sleep(17)
  160. # while True:
  161. # try:
  162. # print(driver.current_url)
  163. # break
  164. # except:
  165. # traceback.print_exc()
  166. # driver=re_get_webdriver()
  167. # time.sleep(3)
  168. # driver.get('https://www.google.com?num=100')
  169. # time.sleep(3)
  170. # time.sleep(3)
  171. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  172. # time.sleep(1)
  173. # elmt.send_keys(kw)
  174. # elmt.send_keys(Keys.ENTER)
  175. # time.sleep(6)
  176. # elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
  177. # numresults=len(elmts)
  178. # # time.sleep(9999)
  179. # print('搜尋結果數量',numresults)
  180. # if numresults==0:
  181. # print(driver.current_url)
  182. # print(driver.title)
  183. # sys.exit()
  184. # idx=1
  185. # found=False
  186. # test_lst=[]
  187. # for elmt in elmts:
  188. # href=elmt.get_attribute('href')
  189. # txt=elmt.text
  190. # if len(txt)>10:
  191. # if domain is not None:
  192. # for d in domain:
  193. # if d in href:
  194. # print('found....')
  195. # print('clicked....')
  196. # print(href)
  197. # print(txt)
  198. # print("ranking", idx)
  199. # found=True
  200. # webdriver.ActionChains(driver).move_to_element(elmt).perform()
  201. # # elmt.click()
  202. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  203. # table.insert({'kw':kw,'ranking':idx,'title':txt,'url':href,'dt':datetime.datetime.now(),'result':numresults,'client':'64G'})
  204. # time.sleep(6)
  205. # return
  206. # else:
  207. # if exclude not in href:
  208. # test_lst.append(elmt)
  209. # idx+=1
  210. # if exclude is not None:
  211. # print('exclude')
  212. # elmt=random.choice(test_lst)
  213. # print(elmt)
  214. # webdriver.ActionChains(driver).move_to_element(elmt).perform()
  215. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  216. # time.sleep(5)
  217. # if not found:
  218. # table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄'})
  219. # except:
  220. # traceback.print_exc()
  221. # print('exception')
  222. # traceback.print_exc()
  223. # time.sleep(5)
  224. # r=random.randint(0,27)
  225. # r=26
  226. # cursor=db.query('select json from seo_jobs where cust="KNIGHT" and plan="形象SEO" order by rand() limit 1')
  227. # for c in cursor:
  228. # js=json.loads(c['json'])
  229. # prefix=js['prefix']
  230. # postfix=js['postfix']
  231. # domain=js['domain'][0]
  232. # positive=js['positive']
  233. # rnd=js['rnd']
  234. portnum=random.randint(7555,7666)
  235. print(portnum)
  236. os.system('docker container stop p8816')
  237. time.sleep(5)
  238. os.system('docker container rm p8816')
  239. time.sleep(5)
  240. os.system('docker run -d -p '+str(portnum)+':4444 --name p8816 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:111.0')
  241. bok = 0
  242. count=0
  243. time.sleep(5)
  244. while True:
  245. # run_once()
  246. # time.sleep(10)
  247. try:
  248. run_once()
  249. except:
  250. bok+=1
  251. print('broken')
  252. time.sleep(5)
  253. # kw=random.choice(positive)
  254. # kw2=random.choice(rnd)
  255. # count=0
  256. # while True:
  257. # try:
  258. # run_once({'domain':domain,'kw':prefix+" "+kw+" "+kw2})
  259. # count+=1
  260. # except:
  261. # continue
  262. # print('中場休息 次數',count)
  263. # time.sleep(random.randint(120,150))