yahoo_use3.py 16 KB


  1. import time
  2. from datetime import datetime
  3. import json
  4. from selenium import webdriver
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. import urllib.parse
  9. from selenium.webdriver.support.ui import WebDriverWait
  10. from selenium.webdriver.common.by import By
  11. from selenium.webdriver.support import expected_conditions as EC
  12. import codecs
  13. import random
  14. import requests
  15. import dataset
  16. import traceback
  17. import sys
  18. from selenium.webdriver.common.keys import Keys
  19. import timeit
  20. import socket
  21. import random
  22. import re
  23. # import requests
  24. from fake_useragent import UserAgent
  25. ua = UserAgent()
  26. def re_get_webdriver():
  27. # global port
  28. global driver
  29. global portnum
  30. # os.system('killall chrome')
  31. result=[]
  32. # if driver is not None:
  33. # print('closing....')
  34. # driver.quit()
  35. # print('quit....')
  36. # driver=None
  37. # os.system()
  38. options = webdriver.ChromeOptions()
  39. options.add_argument("--user-agent=" +ua.random)
  40. options.add_argument("--no-sandbox")
  41. options.add_argument("--headless")
  42. options.add_argument("--incognito")
  43. driver = webdriver.Remote(
  44. command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  45. options=options)
  46. return driver
  47. # headers = {'user-agent': ua.chrome}
  48. # r = requests.get('https://house.ettoday.net/news/1492047', headers=headers)
  49. # print(r.text)
  50. # options.binary_location = ('C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe')
  51. # driverPath = './chromedriver.exe'
  52. # driver = webdriver.Firefox()
  53. # driver.get('https://google.com')
  54. # ettoday_url_list = ['https://house.ettoday.net/news/1492047',
  55. # 'https://house.ettoday.net/news/1492167',
  56. # 'https://house.ettoday.net/news/1492288',
  57. # 'https://house.ettoday.net/news/1492178',
  58. # 'https://house.ettoday.net/news/1492229',
  59. # 'https://house.ettoday.net/news/1492134',
  60. # 'https://house.ettoday.net/news/1492240',
  61. # 'https://house.ettoday.net/news/1492161',
  62. # 'https://house.ettoday.net/news/1492168',
  63. # 'https://house.ettoday.net/news/1492217']
  64. # for i in ettoday_url_list:
  65. # driver.get(i)
  66. # time.sleep(3)
  67. # elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[3]/p[1]/a')
  68. # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  69. # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  70. # print("cick:",i)
  71. # # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
  72. # time.sleep(random.randint(3,7))
  73. # driver.quit()
  74. # query='幸福空間'
  75. # elmt.send_keys(query)
  76. # elmt.send_keys(Keys.ENTER)
  77. # time.sleep(1)
  78. # time.sleep(1)
  79. def run_once():
  80. global count
  81. global bok
  82. global portnum
  83. yahoo_url_list = [
  84. 'https://house.yahoo.com.tw/%E9%9B%8D%E5%AE%B9%E9%9B%85%E7%B7%BB-%E5%84%AA%E9%9B%85%E5%81%87%E6%9C%9F-%E6%96%B0%E5%8F%A4%E5%85%B8-31%E5%9D%AA-020000499.html',
  85. 'https://house.yahoo.com.tw/%E6%96%B0%E7%94%9F%E9%AD%85%E5%8A%9B-%E8%AD%9C%E5%AF%AB%E5%B9%B8%E7%A6%8F%E5%9C%93%E8%88%9E%E6%9B%B2-%E5%8C%97%E6%AD%90%E9%A2%A8-35%E5%9D%AA-020000759.html',
  86. 'https://house.yahoo.com.tw/%E7%AF%89-%E6%96%B9%E8%B3%AA%E7%B0%A1%E5%85%89%E5%AF%93-%E4%BA%AB%E5%8F%97%E6%81%AC%E9%9D%9C%E6%BA%AB%E9%A6%A8%E6%97%A5%E5%B8%B8-%E4%BA%BA%E6%96%87%E9%A3%AF%E5%BA%97%E9%A2%A8-45%E5%9D%AA-020000682.html',
  87. 'https://house.yahoo.com.tw/sheer-%E7%B4%94%E7%B2%B9-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000325.html',
  88. 'https://house.yahoo.com.tw/%E8%AE%8A%E5%BD%A2%E8%88%87%E7%B5%84%E5%90%88-%E8%A4%87%E5%90%88%E5%BC%8F%E7%9A%84%E7%A9%BA%E9%96%93%E8%A8%AD%E8%A8%88-%E4%B8%AD-020000869.html',
  89. 'https://house.yahoo.com.tw/%E8%A7%A3%E6%94%BE%E6%8B%98%E7%A6%81%E5%BF%83%E9%9D%88-%E8%B6%85%E8%84%AB%E7%8B%82%E6%83%B3%E9%80%8F%E5%A4%A9%E5%8E%9D-020000093.html',
  90. 'https://house.yahoo.com.tw/%E8%A6%AA%E5%AD%90%E6%96%99%E7%90%86%E7%9B%B4%E6%92%AD%E4%B8%BB%E7%9A%84%E5%AE%B6-%E5%BE%AE%E7%BE%8E%E5%BC%8F%E8%A8%AD%E8%A8%88-50%E5%9D%AA-020000607.html',
  91. 'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html',
  92. 'https://house.yahoo.com.tw/%E7%8E%A9%E5%91%B3%E7%B3%BB%E7%B5%B1%E6%9D%BF-%E5%BF%AB%E9%80%9F%E6%88%90%E5%AE%B6%E7%B0%A1%E7%B4%84%E7%8F%BE%E4%BB%A3%E9%A2%A8-35%E5%9D%AA-020000199.html',
  93. 'https://house.yahoo.com.tw/%E4%BB%A5%E5%9C%93%E5%BD%A2%E7%AC%A6%E7%A2%BC-%E5%BD%A2%E5%A1%91%E6%81%A2%E5%BC%98%E5%A5%A2%E7%BE%8E%E8%87%BB%E9%82%B8-%E5%A5%A2%E8%8F%AF%E9%A2%A8-42%E5%9D%AA-020000780.html',
  94. 'https://house.yahoo.com.tw/%E7%B4%99%E9%9B%95-%E7%8F%BE%E4%BB%A3%E9%A2%A8-30%E5%9D%AA-020000034.html',
  95. 'https://house.yahoo.com.tw/%E6%8C%91%E9%AB%98%E6%97%A5%E7%B3%BB%E8%BE%A6%E5%85%AC%E7%A9%BA%E9%96%93-%E7%B5%90%E5%90%88%E4%BE%98%E5%AF%82%E8%88%87%E8%87%AA%E7%84%B6%E7%9A%84%E7%B0%A1%E7%B4%84%E7%BE%8E%E5%AD%B8-230%E5%9D%AA-020000517.html',
  96. 'https://house.yahoo.com.tw/35%E5%9D%AA%E8%80%81%E5%B1%8B%E5%A5%BD%E5%B1%8B%E6%B3%81-%E9%80%B2%E5%8C%96%E8%B3%AA%E6%84%9F%E6%A9%9F%E8%83%BD%E5%AE%85-%E7%8F%BE%E4%BB%A3%E9%A2%A8-020000438.html',
  97. 'https://house.yahoo.com.tw/%E6%BA%AB%E8%98%8A%E9%9F%B6%E5%85%89-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000337.html',
  98. 'https://house.yahoo.com.tw/home-%E6%B7%B7%E6%90%AD%E9%A2%A8-020000440.html',
  99. 'https://house.yahoo.com.tw/%E6%B8%B2%E6%9F%93%E6%9D%B1%E6%96%B9%E8%B3%AA%E9%9F%BB-%E4%BA%A4%E7%B9%94%E7%8F%BE%E4%BB%A3%E6%99%AF%E7%B7%BB-70%E5%9D%AA-020000667.html',
  100. 'https://house.yahoo.com.tw/%E5%A4%A7%E8%86%BD%E8%B7%B3%E8%84%AB%E6%85%A3%E5%B8%B8%E6%80%9D%E7%B6%AD-35%E5%9D%AA%E8%80%81%E6%88%BF%E6%BC%94%E7%B9%B9%E9%A0%82%E7%B4%9A%E9%A3%AF%E5%BA%97%E8%B3%AA%E6%84%9F%E6%9C%83%E6%89%80-020000172.html',
  101. 'https://house.yahoo.com.tw/%E9%9D%88%E5%B7%A7%E5%85%89%E6%BD%A4-%E6%81%AC%E8%AC%90%E5%AE%B6%E5%B1%8B-%E7%B6%93%E5%85%B8%E5%8C%97%E6%AD%90%E9%A2%A8-8%E5%9D%AA-020000645.html',
  102. 'https://house.yahoo.com.tw/%E5%82%B3%E9%81%9E-%E6%BA%AB%E6%BD%A4%E7%94%9F%E6%B4%BB%E6%B0%A3%E6%81%AF-%E7%8F%BE%E4%BB%A3%E9%A2%A8-18%E5%9D%AA-020000206.html',
  103. 'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html'
  104. ]
  105. for i in yahoo_url_list:
  106. try:
  107. try:
  108. driver = re_get_webdriver()
  109. except:
  110. print('driver_bok')
  111. portnum=random.randint(3777,3999)
  112. print(portnum)
  113. os.system('docker container stop p8809')
  114. time.sleep(5)
  115. os.system('docker container rm p8809')
  116. time.sleep(5)
  117. os.system('docker run -d -p '+str(portnum)+':4444 --name p8809 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')
  118. count=0
  119. bok+=1
  120. time.sleep(5)
  121. driver = re_get_webdriver()
  122. driver.get(i)
  123. time.sleep(5)
  124. #elmt_next = driver.find_element(By.XPATH, '//*[@id="maincontainer"]/main/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/div/div[1]/a')
  125. elmt_next = driver.find_element(By.XPATH, '/html/body/div[3]/div/main/div/div[1]/div/div/div/div/article/header/div[1]/a')
  126. webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  127. webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  128. print("cick!")
  129. count+=1
  130. print('click_all_time:',count,';broken_time:',bok)
  131. # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
  132. time.sleep(random.randint(3,7))
  133. #driver.close()
  134. driver.quit()
  135. except:
  136. #driver.close()
  137. try:
  138. driver.quit()
  139. except:
  140. print('no have driver')
  141. print(i,'error',';broken_time:',bok)
  142. time.sleep(10)
  143. portnum=random.randint(3777,3999)
  144. print(portnum)
  145. os.system('docker container stop p8809')
  146. time.sleep(5)
  147. os.system('docker container rm p8809')
  148. time.sleep(5)
  149. os.system('docker run -d -p '+str(portnum)+':4444 --name p8809 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')
  150. count=0
  151. bok=0
  152. time.sleep(5)
  153. while True:
  154. # run_once()
  155. # time.sleep(10)
  156. try:
  157. run_once()
  158. except:
  159. bok+=1
  160. print('broken')
  161. time.sleep(5)
  162. # elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
  163. # domain = 'hhh.com.tw'
  164. # idx=1
  165. # ranking=-1
  166. # domain_in_link = 0
  167. # print (len(elmts))
  168. # # driver.save_screenshot('c:/tmp/test.png')
  169. # n=0
  170. # for el in elmts:
  171. # n+=1
  172. # href=el.get_attribute('href')
  173. # txt=el.text
  174. # # print(txt)
  175. # if len(txt)>10:
  176. # if domain in href:
  177. # domain_in_link += 1
  178. # print('clicked....')
  179. # print('href:',href)
  180. # print('txt:',txt)
  181. # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a')
  182. # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  183. # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  184. # time.sleep(2)
  185. # elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
  186. # domain = 'hhh.com.tw'
  187. # idx=1
  188. # ranking=-1
  189. # domain_in_link = 0
  190. # print (len(elmts))
  191. # # driver.save_screenshot('c:/tmp/test.png')
  192. # n=0
  193. # for el in elmts:
  194. # n+=1
  195. # href=el.get_attribute('href')
  196. # txt=el.text
  197. # # print(txt)
  198. # if len(txt)>10:
  199. # if domain in href:
  200. # domain_in_link += 1
  201. # print('clicked....')
  202. # print('href:',href)
  203. # print('txt:',txt)
  204. # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]')
  205. # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  206. # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  207. # time.sleep(5)
  208. # for i in range(20):
  209. # try:
  210. # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]')
  211. # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  212. # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  213. # time.sleep(5)
  214. # except:
  215. # time.sleep(200)
  216. # webdriver.ActionChains(driver).move_to_element(el).click().perform()
  217. # add_tabs = [7,9,11,13,15,7,9,11,13,15,7,9,11,13,15,7,9,11,13,15]
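  218. # db = dataset.connect('mysql://<user>:<password>@db.ptt.cx:3306/seo?charset=utf8mb4')  # credentials redacted: load them from the environment and rotate the previously committed password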
  219. # driver=None
  220. # headers = {
  221. # "Authorization": "Bearer " + "<LINE_NOTIFY_TOKEN>",  # token redacted: load it from the environment and revoke the previously committed value
  222. # "Content-Type": "application/x-www-form-urlencoded"
  223. # }
  224. # sleepoffset = 0
  225. # def send_msg(kw):
  226. # params = {"message": "處理關鍵字: "+kw}
  227. # r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  228. # def empty_query(q):
  229. # global driver
  230. # googleurl='https://www.google.com/search?q='+urllib.parse.quote(q)
  231. # driver.get(googleurl)
  232. # time.sleep(3)
  233. # def process_query(domain, target_domain, brands, query):
  234. # print(query)
  235. # sleepoffset = 0
  236. # global driver
  237. # if query == "艾立思" and "index" in target_domain:
  238. # driver.get('https://www.google.com/search?num=100&q=艾立思&rlz=1C1ONGR_zh-TWTW997TW997&ei=zjdUY_DBG9Lm-Abpgq84&start=0&sa=N&filter=0&ved=2ahUKEwjw4KeEvfT6AhVSM94KHWnBCwcQ8tMDegQIARAQ&cshid=1666463754367857&biw=1368&bih=761&dpr=2')
  239. # time.sleep(4)
  240. # else:
  241. # driver.get('https://www.google.com?num=100')
  242. # time.sleep(3)
  243. # print(driver.current_url)
  244. # # elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  245. # # ABOVE METHOD IS DEPRECATED STARTING SELENIUM 4.3.0, USE THIS
  246. # #
  247. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  248. # time.sleep(1)
  249. # elmt.send_keys(query)
  250. # elmt.send_keys(Keys.ENTER)
  251. # idx=1
  252. # ranking=-1
  253. # domain_in_link = 0
  254. # googleurl = driver.current_url
  255. # print(driver.current_url)
  256. # if "sorry" in googleurl:
  257. # return 444
  258. # elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a")
  259. # print (len(elmts))
  260. # # driver.save_screenshot('c:/tmp/test.png')
  261. # n=0
  262. # for el in elmts:
  263. # n+=1
  264. # href=el.get_attribute('href')
  265. # txt=el.text
  266. # if len(txt)>10:
  267. # if domain in href:
  268. # domain_in_link += 1
  269. # print('clicked....')
  270. # print(href)
  271. # print(txt)
  272. # if query == "艾立思" and "index" in target_domain and href != "https://hhh.com.tw/brand-index.php?brand_id=211":
  273. # print("wrong site")
  274. # continue
  275. # webdriver.ActionChains(driver).move_to_element(el).perform()
  276. # webdriver.ActionChains(driver).move_to_element(el).click().perform()
  277. # print("Rank: " + str(n))
  278. # time.sleep(15)
  279. # ''' unused
  280. # new_windows_count = add_tabs[random.randint(0,19)]
  281. # print(str(new_windows_count) + " new tabs")
  282. # for i in range (0,new_windows_count):
  283. # print("Tab " + str(i+1))
  284. # #original_window = driver.current_window_handle
  285. # #driver.switch_to.new_window('window')
  286. # #driver.get(href)
  287. # sleepoffset += 12
  288. # driver.execute_script('window.open("'+href+'","_blank");')
  289. # driver.execute_script("window.scrollTo(0, 600)")
  290. # time.sleep(15)
  291. # #driver.close()
  292. # #driver.switch_to.window(original_window)
  293. # if domain in target_domain:
  294. # print("Target link found")
  295. # time_stamp = datetime.fromtimestamp(time.time())
  296. # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
  297. # db['click_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "url": href, "content": txt, "extra_windows": '0'})
  298. # '''
  299. # break
  300. # '''if domain in target_domain:
  301. # print("Target domain found")
  302. # time_stamp = datetime.fromtimestamp(time.time())
  303. # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
  304. # db['query_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "googleurl": googleurl, "element_count": len(elmts), "domain_in_link_count": domain_in_link})
  305. # '''
  306. # print(domain_in_link)
  307. # return 200
  308. # def run_once(domain, target_domain, brands, query):
  309. # global driver
  310. # result=[]
  311. # options = webdriver.ChromeOptions()
  312. # options.add_argument('--headless')
  313. # # options.add_argument("--user-agent=" +user_agent)
  314. # options.add_argument("--incognito")
  315. # options.add_argument('--no-sandbox')
  316. # options.add_argument('--disable-dev-shm-usage')
  317. # driver = webdriver.Chrome(
  318. # options=options)
  319. # driver.delete_all_cookies()
  320. # driver.set_window_size(1400,1000)
  321. # statuscode = process_query(domain, target_domain, brands, query)
  322. # driver.quit()
  323. # return statuscode
  324. # #execution starts here
  325. # def execute(domain, target_domain, brands, query_list):
  326. # print("Ctrl+C or Ctrl+Z to stop.")
  327. # statuscode = 0
  328. # st = timeit.default_timer()
  329. # try:
  330. # statuscode = run_once(domain, target_domain, brands, random.choice(query_list))
  331. # except:
  332. # traceback.print_exc()
  333. # timetaken = timeit.default_timer()-st
  334. # print("Time taken: " + str(timetaken))
  335. # print("Process returned with " + str(statuscode))
  336. # if statuscode == 444:
  337. # print("You have been caught!!!")
  338. # #notify("Clickbot " + brands[domain] + " has been caught by Google and will terminate. IP: ")
  339. # extrasleep = 0
  340. # if(timetaken < 50):
  341. # extrasleep = 50 - timetaken
  342. # print("Ctrl+C or Ctrl+Z to stop now.")
  343. # print("You have " + str(10 + extrasleep) + " seconds.")
  344. # time.sleep(10 + extrasleep)
  345. # return statuscode