import codecs
import json
import os
import random
import re
import socket
import sys
import time
import timeit
import traceback
import urllib.parse
from datetime import datetime

import dataset
import requests
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Supplies a random User-Agent string for every new browser session.
ua = UserAgent()

# Host port of the local Selenium endpoint used by re_get_webdriver().
# It stays None until some caller assigns a real port number.
portnum = None


def re_get_webdriver():
    """Open a new remote Chrome session and return it.

    The session is requested from ``http://127.0.0.1:<portnum>/wd/hub`` with a
    random User-Agent and the ``--no-sandbox``, ``--headless`` and
    ``--incognito`` Chrome arguments. The new session is also stored in the
    module-level ``driver`` global.

    Returns:
        The ``webdriver.Remote`` instance that was just created.

    Raises:
        RuntimeError: If the module-level ``portnum`` has not been assigned.
    """
    global driver

    # Fail with a clear message instead of building a URL around a missing port.
    if portnum is None:
        raise RuntimeError('portnum is not set; no Selenium endpoint to connect to')

    options = webdriver.ChromeOptions()
    for argument in (
        "--user-agent=" + ua.random,
        "--no-sandbox",
        "--headless",
        "--incognito",
    ):
        options.add_argument(argument)

    driver = webdriver.Remote(
        command_executor=f'http://127.0.0.1:{portnum}/wd/hub',
        options=options,
    )
    return driver


# NOTE(review): disabled legacy code, kept as found.
# headers = {'user-agent': ua.chrome}
# r = requests.get('https://house.ettoday.net/news/1492047', headers=headers)
# print(r.text)
# options.binary_location = ('C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe')
# driverPath = './chromedriver.exe'
# driver = webdriver.Firefox()
# driver.get('https://google.com')
# ettoday_url_list = ['https://house.ettoday.net/news/1492047',
#                     'https://house.ettoday.net/news/1492167',
#                     'https://house.ettoday.net/news/1492288',
#                     'https://house.ettoday.net/news/1492178',
#                     'https://house.ettoday.net/news/1492229',
#                     'https://house.ettoday.net/news/1492134',
#                     'https://house.ettoday.net/news/1492240',
#                     'https://house.ettoday.net/news/1492161',
#                     'https://house.ettoday.net/news/1492168',
#                     'https://house.ettoday.net/news/1492217']
# for i in ettoday_url_list:
#     driver.get(i)
# NOTE(review): disabled legacy code, kept as found.
#     time.sleep(3)
#     elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[3]/p[1]/a')
#     webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
#     webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
#     print("cick:",i)
#     # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
#     time.sleep(random.randint(3,7))
# driver.quit()
# query='幸福空間'
# elmt.send_keys(query)
# elmt.send_keys(Keys.ENTER)
# time.sleep(1)
# time.sleep(1)

# Progress counters printed by run_once(). They must exist before the first
# call because run_once() increments them through ``global``.
count = 0  # clicks performed since the last container restart
bok = 0    # times a failed driver creation was answered with a restart


def _restart_selenium_container():
    """Recreate the ``p8809`` Selenium container on a new random host port.

    Picks a port in 4555..4666, stores it in the module-level ``portnum``,
    prints it, then stops, removes and re-runs the container, pausing five
    seconds after each docker command.
    """
    global portnum

    portnum = random.randint(4555, 4666)
    print(portnum)
    os.system('docker container stop p8809')
    time.sleep(5)
    os.system('docker container rm p8809')
    time.sleep(5)
    os.system(f'docker run -d -p {portnum}:4444 --name p8809 --dns 168.95.1.1 selenium/standalone-chrome:106.0')
    time.sleep(5)


def _quit_quietly(driver):
    """Quit *driver* if there is one, reporting (not raising) any failure."""
    if driver is None:
        return
    try:
        driver.quit()
    except Exception:
        # Best effort: the session may already be gone, and the caller is
        # about to restart the container anyway.
        print('driver quit failed')


def run_once():
    """Visit every URL in the built-in list once and click its target link.

    For each URL a fresh session is obtained from ``re_get_webdriver()``. If
    that fails, the Selenium container is restarted and session creation is
    retried once. The page is loaded, the link located by a fixed XPath is
    hovered and clicked, and the session is closed. Any failure while
    handling a URL is reported, the session is closed if it exists, and the
    container is restarted before moving on to the next URL.

    Updates the module-level ``count`` and ``bok`` counters. Returns None.
    """
    global count
    global bok

    yahoo_url_list = [
        'https://house.yahoo.com.tw/%E9%9B%8D%E5%AE%B9%E9%9B%85%E7%B7%BB-%E5%84%AA%E9%9B%85%E5%81%87%E6%9C%9F-%E6%96%B0%E5%8F%A4%E5%85%B8-31%E5%9D%AA-020000499.html',
        'https://house.yahoo.com.tw/%E6%96%B0%E7%94%9F%E9%AD%85%E5%8A%9B-%E8%AD%9C%E5%AF%AB%E5%B9%B8%E7%A6%8F%E5%9C%93%E8%88%9E%E6%9B%B2-%E5%8C%97%E6%AD%90%E9%A2%A8-35%E5%9D%AA-020000759.html',
        'https://house.yahoo.com.tw/20%E5%B9%B4%E8%80%81%E5%AE%85%E9%87%8D%E7%94%9F-%E7%BE%8E%E5%BC%8F%E4%BD%8E%E5%A5%A2%E6%9C%89%E5%AE%B6%E7%9A%84%E6%BA%AB%E5%BA%A6-106%E5%9D%AA-020000087.html',
        'https://house.yahoo.com.tw/sheer-%E7%B4%94%E7%B2%B9-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000325.html',
        'https://house.yahoo.com.tw/%E8%AE%8A%E5%BD%A2%E8%88%87%E7%B5%84%E5%90%88-%E8%A4%87%E5%90%88%E5%BC%8F%E7%9A%84%E7%A9%BA%E9%96%93%E8%A8%AD%E8%A8%88-%E4%B8%AD-020000869.html',
        'https://house.yahoo.com.tw/%E8%A7%A3%E6%94%BE%E6%8B%98%E7%A6%81%E5%BF%83%E9%9D%88-%E8%B6%85%E8%84%AB%E7%8B%82%E6%83%B3%E9%80%8F%E5%A4%A9%E5%8E%9D-020000093.html',
        'https://house.yahoo.com.tw/%E8%A6%AA%E5%AD%90%E6%96%99%E7%90%86%E7%9B%B4%E6%92%AD%E4%B8%BB%E7%9A%84%E5%AE%B6-%E5%BE%AE%E7%BE%8E%E5%BC%8F%E8%A8%AD%E8%A8%88-50%E5%9D%AA-020000607.html',
        'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html',
        'https://house.yahoo.com.tw/%E7%8E%A9%E5%91%B3%E7%B3%BB%E7%B5%B1%E6%9D%BF-%E5%BF%AB%E9%80%9F%E6%88%90%E5%AE%B6%E7%B0%A1%E7%B4%84%E7%8F%BE%E4%BB%A3%E9%A2%A8-35%E5%9D%AA-020000199.html',
        'https://house.yahoo.com.tw/%E4%BB%A5%E5%9C%93%E5%BD%A2%E7%AC%A6%E7%A2%BC-%E5%BD%A2%E5%A1%91%E6%81%A2%E5%BC%98%E5%A5%A2%E7%BE%8E%E8%87%BB%E9%82%B8-%E5%A5%A2%E8%8F%AF%E9%A2%A8-42%E5%9D%AA-020000780.html',
    ]
    link_xpath = '//*[@id="maincontainer"]/main/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/div/div[1]/a'

    for url in yahoo_url_list:
        # Bound up front so the error path can tell "no session yet" apart
        # from "session exists and must be closed".
        driver = None
        try:
            try:
                driver = re_get_webdriver()
            except Exception:
                # Includes the very first call, when no port was chosen yet.
                print('driver_bok')
                _restart_selenium_container()
                count = 0
                bok += 1
                driver = re_get_webdriver()

            driver.get(url)
            time.sleep(5)
            elmt_next = driver.find_element(By.XPATH, link_xpath)
            # Hover first, then hover-and-click, as two separate action chains.
            webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
            webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
            print("cick!")
            count += 1
            print('click_all_time:', count, ';broken_time:', bok)
            time.sleep(random.randint(3, 7))
            driver.quit()
        except Exception:
            # ``Exception`` rather than a bare except, so Ctrl+C and
            # SystemExit still stop the script.
            traceback.print_exc()
            _quit_quietly(driver)
            print(url, 'error', ';broken_time:', bok)
            time.sleep(10)
            _restart_selenium_container()
            count = 0
            # NOTE(review): this path resets ``bok`` while the driver-creation
            # path increments it; kept as found - confirm this is intended.
            bok = 0


if __name__ == "__main__":
    # Cycle through the URL list forever; stop with Ctrl+C.
    while True:
        run_once()


# NOTE(review): disabled legacy code, kept as found.
# elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
# domain = 'hhh.com.tw'
# idx=1
# ranking=-1
# domain_in_link = 0
# print (len(elmts))
# # driver.save_screenshot('c:/tmp/test.png')
# n=0
# for el in elmts:
#     n+=1
#     href=el.get_attribute('href')
txt=el.text # # print(txt) # if len(txt)>10: # if domain in href: # domain_in_link += 1 # print('clicked....') # print('href:',href) # print('txt:',txt) # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a') # webdriver.ActionChains(driver).move_to_element(elmt_next).perform() # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform() # time.sleep(2) # elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a') # domain = 'hhh.com.tw' # idx=1 # ranking=-1 # domain_in_link = 0 # print (len(elmts)) # # driver.save_screenshot('c:/tmp/test.png') # n=0 # for el in elmts: # n+=1 # href=el.get_attribute('href') # txt=el.text # # print(txt) # if len(txt)>10: # if domain in href: # domain_in_link += 1 # print('clicked....') # print('href:',href) # print('txt:',txt) # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]') # webdriver.ActionChains(driver).move_to_element(elmt_next).perform() # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform() # time.sleep(5) # for i in range(20): # try: # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]') # webdriver.ActionChains(driver).move_to_element(elmt_next).perform() # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform() # time.sleep(5) # except: # time.sleep(200) # webdriver.ActionChains(driver).move_to_element(el).click().perform() # add_tabs = [7,9,11,13,15,7,9,11,13,15,7,9,11,13,15,7,9,11,13,15] # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4') # driver=None # headers = { # "Authorization": "Bearer " + "t35vhZtWNgvDNWHc3DJh0OKll3mcB9GvC8K2EAkBug2", # "Content-Type": "application/x-www-form-urlencoded" # } # sleepoffset = 0 # def send_msg(kw): # params = {"message": "處理關鍵字: "+kw} # r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params) # def empty_query(q): # global driver # 
googleurl='https://www.google.com/search?q='+urllib.parse.quote(q) # driver.get(googleurl) # time.sleep(3) # def process_query(domain, target_domain, brands, query): # print(query) # sleepoffset = 0 # global driver # if query == "艾立思" and "index" in target_domain: # driver.get('https://www.google.com/search?num=100&q=艾立思&rlz=1C1ONGR_zh-TWTW997TW997&ei=zjdUY_DBG9Lm-Abpgq84&start=0&sa=N&filter=0&ved=2ahUKEwjw4KeEvfT6AhVSM94KHWnBCwcQ8tMDegQIARAQ&cshid=1666463754367857&biw=1368&bih=761&dpr=2') # time.sleep(4) # else: # driver.get('https://www.google.com?num=100') # time.sleep(3) # print(driver.current_url) # # elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a") # # ABOVE METHOD IS DEPRECATED STARTING SELENIUM 4.3.0, USE THIS # # # elmt = driver.find_element(By.XPATH, "//input[@name='q']") # time.sleep(1) # elmt.send_keys(query) # elmt.send_keys(Keys.ENTER) # idx=1 # ranking=-1 # domain_in_link = 0 # googleurl = driver.current_url # print(driver.current_url) # if "sorry" in googleurl: # return 444 # elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a") # print (len(elmts)) # # driver.save_screenshot('c:/tmp/test.png') # n=0 # for el in elmts: # n+=1 # href=el.get_attribute('href') # txt=el.text # if len(txt)>10: # if domain in href: # domain_in_link += 1 # print('clicked....') # print(href) # print(txt) # if query == "艾立思" and "index" in target_domain and href != "https://hhh.com.tw/brand-index.php?brand_id=211": # print("wrong site") # continue # webdriver.ActionChains(driver).move_to_element(el).perform() # webdriver.ActionChains(driver).move_to_element(el).click().perform() # print("Rank: " + str(n)) # time.sleep(15) # ''' unused # new_windows_count = add_tabs[random.randint(0,19)] # print(str(new_windows_count) + " new tabs") # for i in range (0,new_windows_count): # print("Tab " + str(i+1)) # #original_window = driver.current_window_handle # #driver.switch_to.new_window('window') # #driver.get(href) # sleepoffset += 12 # 
driver.execute_script('window.open("'+href+'","_blank");') # driver.execute_script("window.scrollTo(0, 600)") # time.sleep(15) # #driver.close() # #driver.switch_to.window(original_window) # if domain in target_domain: # print("Target link found") # time_stamp = datetime.fromtimestamp(time.time()) # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S") # db['click_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "url": href, "content": txt, "extra_windows": '0'}) # ''' # break # '''if domain in target_domain: # print("Target domain found") # time_stamp = datetime.fromtimestamp(time.time()) # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S") # db['query_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "googleurl": googleurl, "element_count": len(elmts), "domain_in_link_count": domain_in_link}) # ''' # print(domain_in_link) # return 200 # def run_once(domain, target_domain, brands, query): # global driver # result=[] # options = webdriver.ChromeOptions() # options.add_argument('--headless') # # options.add_argument("--user-agent=" +user_agent) # options.add_argument("--incognito") # options.add_argument('--no-sandbox') # options.add_argument('--disable-dev-shm-usage') # driver = webdriver.Chrome( # options=options) # driver.delete_all_cookies() # driver.set_window_size(1400,1000) # statuscode = process_query(domain, target_domain, brands, query) # driver.quit() # return statuscode # #execution starts here # def execute(domain, target_domain, brands, query_list): # print("Ctrl+C or Ctrl+Z to stop.") # statuscode = 0 # st = timeit.default_timer() # try: # statuscode = run_once(domain, target_domain, brands, random.choice(query_list)) # except: # traceback.print_exc() # timetaken = timeit.default_timer()-st # print("Time taken: " + str(timetaken)) # print("Process returned with " + str(statuscode)) # if statuscode == 444: # print("You have been caught!!!") # 
#notify("Clickbot " + brands[domain] + " has been caught by Google and will terminate. IP: ") # extrasleep = 0 # if(timetaken < 50): # extrasleep = 50 - timetaken # print("Ctrl+C or Ctrl+Z to stop now.") # print("You have " + str(10 + extrasleep) + " seconds.") # time.sleep(10 + extrasleep) # return statuscode