yahoo_use.py 14 KB


  1. import time
  2. from datetime import datetime
  3. import json
  4. from selenium import webdriver
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. import urllib.parse
  9. from selenium.webdriver.support.ui import WebDriverWait
  10. from selenium.webdriver.common.by import By
  11. from selenium.webdriver.support import expected_conditions as EC
  12. import codecs
  13. import random
  14. import requests
  15. import dataset
  16. import traceback
  17. import sys
  18. from selenium.webdriver.common.keys import Keys
  19. import timeit
  20. import socket
  21. import random
  22. import re
  23. # import requests
  24. from fake_useragent import UserAgent
# Shared fake_useragent generator. re_get_webdriver() reads `ua.random` to
# build the --user-agent argument of every new browser session.
ua = UserAgent()
  26. def re_get_webdriver():
  27. # global port
  28. global driver
  29. global portnum
  30. # os.system('killall chrome')
  31. result=[]
  32. # if driver is not None:
  33. # print('closing....')
  34. # driver.quit()
  35. # print('quit....')
  36. # driver=None
  37. # os.system()
  38. options = webdriver.ChromeOptions()
  39. options.add_argument("--user-agent=" +ua.random)
  40. options.add_argument("--no-sandbox")
  41. options.add_argument("--headless")
  42. options.add_argument("--incognito")
  43. driver = webdriver.Remote(
  44. command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  45. options=options)
  46. return driver
  47. # headers = {'user-agent': ua.chrome}
  48. # r = requests.get('https://house.ettoday.net/news/1492047', headers=headers)
  49. # print(r.text)
  50. # options.binary_location = ('C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe')
  51. # driverPath = './chromedriver.exe'
  52. # driver = webdriver.Firefox()
  53. # driver.get('https://google.com')
  54. # ettoday_url_list = ['https://house.ettoday.net/news/1492047',
  55. # 'https://house.ettoday.net/news/1492167',
  56. # 'https://house.ettoday.net/news/1492288',
  57. # 'https://house.ettoday.net/news/1492178',
  58. # 'https://house.ettoday.net/news/1492229',
  59. # 'https://house.ettoday.net/news/1492134',
  60. # 'https://house.ettoday.net/news/1492240',
  61. # 'https://house.ettoday.net/news/1492161',
  62. # 'https://house.ettoday.net/news/1492168',
  63. # 'https://house.ettoday.net/news/1492217']
  64. # for i in ettoday_url_list:
  65. # driver.get(i)
  66. # time.sleep(3)
  67. # elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[3]/p[1]/a')
  68. # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  69. # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  70. # print("cick:",i)
  71. # # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
  72. # time.sleep(random.randint(3,7))
  73. # driver.quit()
  74. # query='幸福空間'
  75. # elmt.send_keys(query)
  76. # elmt.send_keys(Keys.ENTER)
  77. # time.sleep(1)
  78. # time.sleep(1)
  79. def run_once():
  80. global count
  81. global bok
  82. global portnum
  83. yahoo_url_list = [
  84. 'https://house.yahoo.com.tw/%E9%9B%8D%E5%AE%B9%E9%9B%85%E7%B7%BB-%E5%84%AA%E9%9B%85%E5%81%87%E6%9C%9F-%E6%96%B0%E5%8F%A4%E5%85%B8-31%E5%9D%AA-020000499.html',
  85. 'https://house.yahoo.com.tw/%E6%96%B0%E7%94%9F%E9%AD%85%E5%8A%9B-%E8%AD%9C%E5%AF%AB%E5%B9%B8%E7%A6%8F%E5%9C%93%E8%88%9E%E6%9B%B2-%E5%8C%97%E6%AD%90%E9%A2%A8-35%E5%9D%AA-020000759.html',
  86. 'https://house.yahoo.com.tw/20%E5%B9%B4%E8%80%81%E5%AE%85%E9%87%8D%E7%94%9F-%E7%BE%8E%E5%BC%8F%E4%BD%8E%E5%A5%A2%E6%9C%89%E5%AE%B6%E7%9A%84%E6%BA%AB%E5%BA%A6-106%E5%9D%AA-020000087.html',
  87. 'https://house.yahoo.com.tw/sheer-%E7%B4%94%E7%B2%B9-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000325.html',
  88. 'https://house.yahoo.com.tw/%E8%AE%8A%E5%BD%A2%E8%88%87%E7%B5%84%E5%90%88-%E8%A4%87%E5%90%88%E5%BC%8F%E7%9A%84%E7%A9%BA%E9%96%93%E8%A8%AD%E8%A8%88-%E4%B8%AD-020000869.html',
  89. 'https://house.yahoo.com.tw/%E8%A7%A3%E6%94%BE%E6%8B%98%E7%A6%81%E5%BF%83%E9%9D%88-%E8%B6%85%E8%84%AB%E7%8B%82%E6%83%B3%E9%80%8F%E5%A4%A9%E5%8E%9D-020000093.html',
  90. 'https://house.yahoo.com.tw/%E8%A6%AA%E5%AD%90%E6%96%99%E7%90%86%E7%9B%B4%E6%92%AD%E4%B8%BB%E7%9A%84%E5%AE%B6-%E5%BE%AE%E7%BE%8E%E5%BC%8F%E8%A8%AD%E8%A8%88-50%E5%9D%AA-020000607.html',
  91. 'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html',
  92. 'https://house.yahoo.com.tw/%E7%8E%A9%E5%91%B3%E7%B3%BB%E7%B5%B1%E6%9D%BF-%E5%BF%AB%E9%80%9F%E6%88%90%E5%AE%B6%E7%B0%A1%E7%B4%84%E7%8F%BE%E4%BB%A3%E9%A2%A8-35%E5%9D%AA-020000199.html',
  93. 'https://house.yahoo.com.tw/%E4%BB%A5%E5%9C%93%E5%BD%A2%E7%AC%A6%E7%A2%BC-%E5%BD%A2%E5%A1%91%E6%81%A2%E5%BC%98%E5%A5%A2%E7%BE%8E%E8%87%BB%E9%82%B8-%E5%A5%A2%E8%8F%AF%E9%A2%A8-42%E5%9D%AA-020000780.html']
  94. for i in yahoo_url_list:
  95. try:
  96. try:
  97. driver = re_get_webdriver()
  98. except:
  99. print('driver_bok')
  100. portnum=random.randint(4555,4666)
  101. print(portnum)
  102. os.system('docker container stop p8809')
  103. time.sleep(5)
  104. os.system('docker container rm p8809')
  105. time.sleep(5)
  106. os.system('docker run -d -p '+str(portnum)+':4444 --name p8809 --dns 168.95.1.1 selenium/standalone-chrome:106.0')
  107. count=0
  108. bok+=1
  109. time.sleep(5)
  110. driver = re_get_webdriver()
  111. driver.get(i)
  112. time.sleep(5)
  113. elmt_next = driver.find_element(By.XPATH, '//*[@id="maincontainer"]/main/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/div/div[1]/a')
  114. webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  115. webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  116. print("cick!")
  117. count+=1
  118. print('click_all_time:',count,';broken_time:',bok)
  119. # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
  120. time.sleep(random.randint(3,7))
  121. driver.quit()
  122. except:
  123. driver.quit()
  124. print(i,'error',';broken_time:',bok)
  125. time.sleep(10)
  126. portnum=random.randint(4555,4666)
  127. print(portnum)
  128. os.system('docker container stop p8809')
  129. time.sleep(5)
  130. os.system('docker container rm p8809')
  131. time.sleep(5)
  132. os.system('docker run -d -p '+str(portnum)+':4444 --name p8809 --dns 168.95.1.1 selenium/standalone-chrome:106.0')
  133. count=0
  134. bok=0
  135. time.sleep(5)
  136. while True:
  137. # run_once()
  138. # time.sleep(10)
  139. run_once()
  140. # elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
  141. # domain = 'hhh.com.tw'
  142. # idx=1
  143. # ranking=-1
  144. # domain_in_link = 0
  145. # print (len(elmts))
  146. # # driver.save_screenshot('c:/tmp/test.png')
  147. # n=0
  148. # for el in elmts:
  149. # n+=1
  150. # href=el.get_attribute('href')
  151. # txt=el.text
  152. # # print(txt)
  153. # if len(txt)>10:
  154. # if domain in href:
  155. # domain_in_link += 1
  156. # print('clicked....')
  157. # print('href:',href)
  158. # print('txt:',txt)
  159. # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a')
  160. # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  161. # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  162. # time.sleep(2)
  163. # elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
  164. # domain = 'hhh.com.tw'
  165. # idx=1
  166. # ranking=-1
  167. # domain_in_link = 0
  168. # print (len(elmts))
  169. # # driver.save_screenshot('c:/tmp/test.png')
  170. # n=0
  171. # for el in elmts:
  172. # n+=1
  173. # href=el.get_attribute('href')
  174. # txt=el.text
  175. # # print(txt)
  176. # if len(txt)>10:
  177. # if domain in href:
  178. # domain_in_link += 1
  179. # print('clicked....')
  180. # print('href:',href)
  181. # print('txt:',txt)
  182. # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]')
  183. # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  184. # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  185. # time.sleep(5)
  186. # for i in range(20):
  187. # try:
  188. # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]')
  189. # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
  190. # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
  191. # time.sleep(5)
  192. # except:
  193. # time.sleep(200)
  194. # webdriver.ActionChains(driver).move_to_element(el).click().perform()
  195. # add_tabs = [7,9,11,13,15,7,9,11,13,15,7,9,11,13,15,7,9,11,13,15]
  196. # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  197. # driver=None
  198. # headers = {
  199. # "Authorization": "Bearer " + "t35vhZtWNgvDNWHc3DJh0OKll3mcB9GvC8K2EAkBug2",
  200. # "Content-Type": "application/x-www-form-urlencoded"
  201. # }
  202. # sleepoffset = 0
  203. # def send_msg(kw):
  204. # params = {"message": "處理關鍵字: "+kw}
  205. # r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  206. # def empty_query(q):
  207. # global driver
  208. # googleurl='https://www.google.com/search?q='+urllib.parse.quote(q)
  209. # driver.get(googleurl)
  210. # time.sleep(3)
  211. # def process_query(domain, target_domain, brands, query):
  212. # print(query)
  213. # sleepoffset = 0
  214. # global driver
  215. # if query == "艾立思" and "index" in target_domain:
  216. # driver.get('https://www.google.com/search?num=100&q=艾立思&rlz=1C1ONGR_zh-TWTW997TW997&ei=zjdUY_DBG9Lm-Abpgq84&start=0&sa=N&filter=0&ved=2ahUKEwjw4KeEvfT6AhVSM94KHWnBCwcQ8tMDegQIARAQ&cshid=1666463754367857&biw=1368&bih=761&dpr=2')
  217. # time.sleep(4)
  218. # else:
  219. # driver.get('https://www.google.com?num=100')
  220. # time.sleep(3)
  221. # print(driver.current_url)
  222. # # elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  223. # # ABOVE METHOD IS DEPRECATED STARTING SELENIUM 4.3.0, USE THIS
  224. # #
  225. # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  226. # time.sleep(1)
  227. # elmt.send_keys(query)
  228. # elmt.send_keys(Keys.ENTER)
  229. # idx=1
  230. # ranking=-1
  231. # domain_in_link = 0
  232. # googleurl = driver.current_url
  233. # print(driver.current_url)
  234. # if "sorry" in googleurl:
  235. # return 444
  236. # elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a")
  237. # print (len(elmts))
  238. # # driver.save_screenshot('c:/tmp/test.png')
  239. # n=0
  240. # for el in elmts:
  241. # n+=1
  242. # href=el.get_attribute('href')
  243. # txt=el.text
  244. # if len(txt)>10:
  245. # if domain in href:
  246. # domain_in_link += 1
  247. # print('clicked....')
  248. # print(href)
  249. # print(txt)
  250. # if query == "艾立思" and "index" in target_domain and href != "https://hhh.com.tw/brand-index.php?brand_id=211":
  251. # print("wrong site")
  252. # continue
  253. # webdriver.ActionChains(driver).move_to_element(el).perform()
  254. # webdriver.ActionChains(driver).move_to_element(el).click().perform()
  255. # print("Rank: " + str(n))
  256. # time.sleep(15)
  257. # ''' unused
  258. # new_windows_count = add_tabs[random.randint(0,19)]
  259. # print(str(new_windows_count) + " new tabs")
  260. # for i in range (0,new_windows_count):
  261. # print("Tab " + str(i+1))
  262. # #original_window = driver.current_window_handle
  263. # #driver.switch_to.new_window('window')
  264. # #driver.get(href)
  265. # sleepoffset += 12
  266. # driver.execute_script('window.open("'+href+'","_blank");')
  267. # driver.execute_script("window.scrollTo(0, 600)")
  268. # time.sleep(15)
  269. # #driver.close()
  270. # #driver.switch_to.window(original_window)
  271. # if domain in target_domain:
  272. # print("Target link found")
  273. # time_stamp = datetime.fromtimestamp(time.time())
  274. # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
  275. # db['click_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "url": href, "content": txt, "extra_windows": '0'})
  276. # '''
  277. # break
  278. # '''if domain in target_domain:
  279. # print("Target domain found")
  280. # time_stamp = datetime.fromtimestamp(time.time())
  281. # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
  282. # db['query_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "googleurl": googleurl, "element_count": len(elmts), "domain_in_link_count": domain_in_link})
  283. # '''
  284. # print(domain_in_link)
  285. # return 200
  286. # def run_once(domain, target_domain, brands, query):
  287. # global driver
  288. # result=[]
  289. # options = webdriver.ChromeOptions()
  290. # options.add_argument('--headless')
  291. # # options.add_argument("--user-agent=" +user_agent)
  292. # options.add_argument("--incognito")
  293. # options.add_argument('--no-sandbox')
  294. # options.add_argument('--disable-dev-shm-usage')
  295. # driver = webdriver.Chrome(
  296. # options=options)
  297. # driver.delete_all_cookies()
  298. # driver.set_window_size(1400,1000)
  299. # statuscode = process_query(domain, target_domain, brands, query)
  300. # driver.quit()
  301. # return statuscode
  302. # #execution starts here
  303. # def execute(domain, target_domain, brands, query_list):
  304. # print("Ctrl+C or Ctrl+Z to stop.")
  305. # statuscode = 0
  306. # st = timeit.default_timer()
  307. # try:
  308. # statuscode = run_once(domain, target_domain, brands, random.choice(query_list))
  309. # except:
  310. # traceback.print_exc()
  311. # timetaken = timeit.default_timer()-st
  312. # print("Time taken: " + str(timetaken))
  313. # print("Process returned with " + str(statuscode))
  314. # if statuscode == 444:
  315. # print("You have been caught!!!")
  316. # #notify("Clickbot " + brands[domain] + " has been caught by Google and will terminate. IP: ")
  317. # extrasleep = 0
  318. # if(timetaken < 50):
  319. # extrasleep = 50 - timetaken
  320. # print("Ctrl+C or Ctrl+Z to stop now.")
  321. # print("You have " + str(10 + extrasleep) + " seconds.")
  322. # time.sleep(10 + extrasleep)
  323. # return statuscode