import codecs
import json
import os
import random
import re
import socket
import sys
import time
import timeit
import traceback
import urllib.parse
from datetime import datetime

import dataset
import requests
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Shared source of random User-Agent strings; one is drawn per browser session.
ua = UserAgent()


def re_get_webdriver():
    """Open a new headless Chrome session on the local Selenium server.

    The server address is ``http://127.0.0.1:<portnum>/wd/hub`` where
    ``portnum`` is a module-level global that the caller must have assigned
    beforehand; if it has not been assigned, building the URL raises
    ``NameError``.

    Returns:
        The newly created ``webdriver.Remote`` session. The same object is
        also stored in the module-level ``driver`` global.

    Raises:
        NameError: if the global ``portnum`` has never been assigned.
        Exception: whatever ``webdriver.Remote`` raises when the session
            cannot be created.
    """
    global driver

    # Disabled legacy cleanup, kept for reference:
    # global port
    # os.system('killall chrome')
    # if driver is not None:
    #     print('closing....')
    #     driver.quit()
    #     print('quit....')
    #     driver=None
    # os.system()

    options = webdriver.ChromeOptions()
    # A different User-Agent string is drawn for every session.
    options.add_argument("--user-agent=" + ua.random)
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--incognito")

    driver = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(portnum) + '/wd/hub',
        options=options)
    return driver


# --- Disabled legacy experiment (ettoday pages), kept for reference ---------
# headers = {'user-agent': ua.chrome}
# r = requests.get('https://house.ettoday.net/news/1492047', headers=headers)
# print(r.text)
# options.binary_location = ('C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe')
# driverPath = './chromedriver.exe'
# driver = webdriver.Firefox()
# driver.get('https://google.com')
# ettoday_url_list = ['https://house.ettoday.net/news/1492047',
#                     'https://house.ettoday.net/news/1492167',
#                     'https://house.ettoday.net/news/1492288',
#                     'https://house.ettoday.net/news/1492178',
#                     'https://house.ettoday.net/news/1492229',
#                     'https://house.ettoday.net/news/1492134',
#                     'https://house.ettoday.net/news/1492240',
#                     'https://house.ettoday.net/news/1492161',
#                     'https://house.ettoday.net/news/1492168',
#                     'https://house.ettoday.net/news/1492217']
# for i in ettoday_url_list:
#     driver.get(i)
# --- Disabled legacy experiment (tail of the commented-out ettoday loop) ----
#     time.sleep(3)
#     elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[3]/p[1]/a')
#     webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
#     webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
#     print("cick:",i)
#     # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
#     time.sleep(random.randint(3,7))
# driver.quit()
# query='幸福空間'
# elmt.send_keys(query)
# elmt.send_keys(Keys.ENTER)
# time.sleep(1)
# time.sleep(1)


# Pages visited, in order, by every run_once() pass. The ...020000334.html
# entry appears twice in the original list and is kept that way.
_YAHOO_URLS = (
    'https://house.yahoo.com.tw/%E9%9B%8D%E5%AE%B9%E9%9B%85%E7%B7%BB-%E5%84%AA%E9%9B%85%E5%81%87%E6%9C%9F-%E6%96%B0%E5%8F%A4%E5%85%B8-31%E5%9D%AA-020000499.html',
    'https://house.yahoo.com.tw/%E6%96%B0%E7%94%9F%E9%AD%85%E5%8A%9B-%E8%AD%9C%E5%AF%AB%E5%B9%B8%E7%A6%8F%E5%9C%93%E8%88%9E%E6%9B%B2-%E5%8C%97%E6%AD%90%E9%A2%A8-35%E5%9D%AA-020000759.html',
    'https://house.yahoo.com.tw/%E7%AF%89-%E6%96%B9%E8%B3%AA%E7%B0%A1%E5%85%89%E5%AF%93-%E4%BA%AB%E5%8F%97%E6%81%AC%E9%9D%9C%E6%BA%AB%E9%A6%A8%E6%97%A5%E5%B8%B8-%E4%BA%BA%E6%96%87%E9%A3%AF%E5%BA%97%E9%A2%A8-45%E5%9D%AA-020000682.html',
    'https://house.yahoo.com.tw/sheer-%E7%B4%94%E7%B2%B9-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000325.html',
    'https://house.yahoo.com.tw/%E8%AE%8A%E5%BD%A2%E8%88%87%E7%B5%84%E5%90%88-%E8%A4%87%E5%90%88%E5%BC%8F%E7%9A%84%E7%A9%BA%E9%96%93%E8%A8%AD%E8%A8%88-%E4%B8%AD-020000869.html',
    'https://house.yahoo.com.tw/%E8%A7%A3%E6%94%BE%E6%8B%98%E7%A6%81%E5%BF%83%E9%9D%88-%E8%B6%85%E8%84%AB%E7%8B%82%E6%83%B3%E9%80%8F%E5%A4%A9%E5%8E%9D-020000093.html',
    'https://house.yahoo.com.tw/%E8%A6%AA%E5%AD%90%E6%96%99%E7%90%86%E7%9B%B4%E6%92%AD%E4%B8%BB%E7%9A%84%E5%AE%B6-%E5%BE%AE%E7%BE%8E%E5%BC%8F%E8%A8%AD%E8%A8%88-50%E5%9D%AA-020000607.html',
    'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html',
    'https://house.yahoo.com.tw/%E7%8E%A9%E5%91%B3%E7%B3%BB%E7%B5%B1%E6%9D%BF-%E5%BF%AB%E9%80%9F%E6%88%90%E5%AE%B6%E7%B0%A1%E7%B4%84%E7%8F%BE%E4%BB%A3%E9%A2%A8-35%E5%9D%AA-020000199.html',
    'https://house.yahoo.com.tw/%E4%BB%A5%E5%9C%93%E5%BD%A2%E7%AC%A6%E7%A2%BC-%E5%BD%A2%E5%A1%91%E6%81%A2%E5%BC%98%E5%A5%A2%E7%BE%8E%E8%87%BB%E9%82%B8-%E5%A5%A2%E8%8F%AF%E9%A2%A8-42%E5%9D%AA-020000780.html',
    'https://house.yahoo.com.tw/%E7%B4%99%E9%9B%95-%E7%8F%BE%E4%BB%A3%E9%A2%A8-30%E5%9D%AA-020000034.html',
    'https://house.yahoo.com.tw/%E6%8C%91%E9%AB%98%E6%97%A5%E7%B3%BB%E8%BE%A6%E5%85%AC%E7%A9%BA%E9%96%93-%E7%B5%90%E5%90%88%E4%BE%98%E5%AF%82%E8%88%87%E8%87%AA%E7%84%B6%E7%9A%84%E7%B0%A1%E7%B4%84%E7%BE%8E%E5%AD%B8-230%E5%9D%AA-020000517.html',
    'https://house.yahoo.com.tw/35%E5%9D%AA%E8%80%81%E5%B1%8B%E5%A5%BD%E5%B1%8B%E6%B3%81-%E9%80%B2%E5%8C%96%E8%B3%AA%E6%84%9F%E6%A9%9F%E8%83%BD%E5%AE%85-%E7%8F%BE%E4%BB%A3%E9%A2%A8-020000438.html',
    'https://house.yahoo.com.tw/%E6%BA%AB%E8%98%8A%E9%9F%B6%E5%85%89-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000337.html',
    'https://house.yahoo.com.tw/home-%E6%B7%B7%E6%90%AD%E9%A2%A8-020000440.html',
    'https://house.yahoo.com.tw/%E6%B8%B2%E6%9F%93%E6%9D%B1%E6%96%B9%E8%B3%AA%E9%9F%BB-%E4%BA%A4%E7%B9%94%E7%8F%BE%E4%BB%A3%E6%99%AF%E7%B7%BB-70%E5%9D%AA-020000667.html',
    'https://house.yahoo.com.tw/%E5%A4%A7%E8%86%BD%E8%B7%B3%E8%84%AB%E6%85%A3%E5%B8%B8%E6%80%9D%E7%B6%AD-35%E5%9D%AA%E8%80%81%E6%88%BF%E6%BC%94%E7%B9%B9%E9%A0%82%E7%B4%9A%E9%A3%AF%E5%BA%97%E8%B3%AA%E6%84%9F%E6%9C%83%E6%89%80-020000172.html',
    'https://house.yahoo.com.tw/%E9%9D%88%E5%B7%A7%E5%85%89%E6%BD%A4-%E6%81%AC%E8%AC%90%E5%AE%B6%E5%B1%8B-%E7%B6%93%E5%85%B8%E5%8C%97%E6%AD%90%E9%A2%A8-8%E5%9D%AA-020000645.html',
    'https://house.yahoo.com.tw/%E5%82%B3%E9%81%9E-%E6%BA%AB%E6%BD%A4%E7%94%9F%E6%B4%BB%E6%B0%A3%E6%81%AF-%E7%8F%BE%E4%BB%A3%E9%A2%A8-18%E5%9D%AA-020000206.html',
    'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html',
)

# Link that gets clicked on every visited page.
# Previous locator, kept for reference:
#   '//*[@id="maincontainer"]/main/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/div/div[1]/a'
_TARGET_LINK_XPATH = '/html/body/div[3]/div/main/div/div[1]/div/div/div/div/article/header/div[1]/a'


def _restart_selenium_container():
    """Replace the Selenium container with a fresh one on a new random port.

    Picks a port in 3777..3999, stores it in the global ``portnum`` (read by
    ``re_get_webdriver``), then stops, removes and re-creates the Docker
    container named ``p8809``. Exit codes of the shell commands are ignored,
    exactly as before.
    """
    global portnum
    portnum = random.randint(3777, 3999)
    print(portnum)
    os.system('docker container stop p8809')
    time.sleep(5)
    os.system('docker container rm p8809')
    time.sleep(5)
    os.system('docker run -d -p ' + str(portnum) + ':4444 --name p8809 '
              '--shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')


def _connect_with_restart():
    """Return a new browser session, restarting the container once on failure.

    A failed first attempt resets ``count`` and increments ``bok`` before the
    single retry; an error raised by the retry propagates to the caller.
    """
    global count
    global bok
    try:
        return re_get_webdriver()
    except Exception:
        print('driver_bok')
        _restart_selenium_container()
        count = 0
        bok += 1
        time.sleep(5)
        return re_get_webdriver()


def _quit_quietly(session):
    """Best-effort shutdown of ``session``; never raises an ``Exception``."""
    if session is None:
        print('no have driver')
        return
    try:
        session.quit()
    except Exception:
        print('no have driver')


def run_once():
    """Visit every URL in the list once and click the target link on each.

    For each URL a fresh browser session is opened, the page is loaded, the
    link at ``_TARGET_LINK_XPATH`` is hovered and clicked, and the session is
    closed. Any ``Exception`` during one URL is reported, the session is
    closed on a best-effort basis, the Selenium container is restarted on a
    new port, and the loop moves on to the next URL.

    Only ``Exception`` is caught (the original used bare ``except:``), so
    ``KeyboardInterrupt`` and ``SystemExit`` now stop the script instead of
    triggering another container restart.

    Globals updated: ``count`` (successful clicks since the last reconnect
    failure), ``bok`` (failure counter), ``portnum`` (via the restart helper).
    """
    global count
    global bok

    for url in _YAHOO_URLS:
        # Track this iteration's session explicitly so the error path never
        # touches an unbound name or a session closed by an earlier iteration.
        driver = None
        try:
            driver = _connect_with_restart()
            driver.get(url)
            time.sleep(5)
            elmt_next = driver.find_element(By.XPATH, _TARGET_LINK_XPATH)
            # Hover first, then hover-and-click, as two separate action chains.
            webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
            webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
            print("cick!")
            count += 1
            print('click_all_time:', count, ';broken_time:', bok)
            time.sleep(random.randint(3, 7))
            driver.quit()
        except Exception:
            _quit_quietly(driver)
            print(url, 'error', ';broken_time:', bok)
            time.sleep(10)
            _restart_selenium_container()


# Counters shared with run_once() through ``global`` statements.
count = 0
bok = 0
# NOTE(review): no initial assignment of ``portnum`` is visible in this file,
# so the first re_get_webdriver() call presumably fails with NameError and
# run_once() falls into its restart path, which is what first starts the
# container. Confirm this is intended.
time.sleep(5)

while True:
    # Earlier, unguarded variant kept for reference:
    # run_once()
    # time.sleep(10)
    try:
        run_once()
    except Exception:
        # Only Exception is caught so that Ctrl+C can terminate the loop.
        bok += 1
        print('broken')
        time.sleep(5)
# elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a') # domain = 'hhh.com.tw' # idx=1 # ranking=-1 # domain_in_link = 0 # print (len(elmts)) # # driver.save_screenshot('c:/tmp/test.png') # n=0 # for el in elmts: # n+=1 # href=el.get_attribute('href') # txt=el.text # # print(txt) # if len(txt)>10: # if domain in href: # domain_in_link += 1 # print('clicked....') # print('href:',href) # print('txt:',txt) # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a') # webdriver.ActionChains(driver).move_to_element(elmt_next).perform() # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform() # time.sleep(2) # elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a') # domain = 'hhh.com.tw' # idx=1 # ranking=-1 # domain_in_link = 0 # print (len(elmts)) # # driver.save_screenshot('c:/tmp/test.png') # n=0 # for el in elmts: # n+=1 # href=el.get_attribute('href') # txt=el.text # # print(txt) # if len(txt)>10: # if domain in href: # domain_in_link += 1 # print('clicked....') # print('href:',href) # print('txt:',txt) # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]') # webdriver.ActionChains(driver).move_to_element(elmt_next).perform() # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform() # time.sleep(5) # for i in range(20): # try: # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]') # webdriver.ActionChains(driver).move_to_element(elmt_next).perform() # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform() # time.sleep(5) # except: # time.sleep(200) # webdriver.ActionChains(driver).move_to_element(el).click().perform() # add_tabs = [7,9,11,13,15,7,9,11,13,15,7,9,11,13,15,7,9,11,13,15] # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4') # driver=None # headers = { # "Authorization": "Bearer " + "t35vhZtWNgvDNWHc3DJh0OKll3mcB9GvC8K2EAkBug2", 
# "Content-Type": "application/x-www-form-urlencoded" # } # sleepoffset = 0 # def send_msg(kw): # params = {"message": "處理關鍵字: "+kw} # r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params) # def empty_query(q): # global driver # googleurl='https://www.google.com/search?q='+urllib.parse.quote(q) # driver.get(googleurl) # time.sleep(3) # def process_query(domain, target_domain, brands, query): # print(query) # sleepoffset = 0 # global driver # if query == "艾立思" and "index" in target_domain: # driver.get('https://www.google.com/search?num=100&q=艾立思&rlz=1C1ONGR_zh-TWTW997TW997&ei=zjdUY_DBG9Lm-Abpgq84&start=0&sa=N&filter=0&ved=2ahUKEwjw4KeEvfT6AhVSM94KHWnBCwcQ8tMDegQIARAQ&cshid=1666463754367857&biw=1368&bih=761&dpr=2') # time.sleep(4) # else: # driver.get('https://www.google.com?num=100') # time.sleep(3) # print(driver.current_url) # # elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a") # # ABOVE METHOD IS DEPRECATED STARTING SELENIUM 4.3.0, USE THIS # # # elmt = driver.find_element(By.XPATH, "//input[@name='q']") # time.sleep(1) # elmt.send_keys(query) # elmt.send_keys(Keys.ENTER) # idx=1 # ranking=-1 # domain_in_link = 0 # googleurl = driver.current_url # print(driver.current_url) # if "sorry" in googleurl: # return 444 # elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a") # print (len(elmts)) # # driver.save_screenshot('c:/tmp/test.png') # n=0 # for el in elmts: # n+=1 # href=el.get_attribute('href') # txt=el.text # if len(txt)>10: # if domain in href: # domain_in_link += 1 # print('clicked....') # print(href) # print(txt) # if query == "艾立思" and "index" in target_domain and href != "https://hhh.com.tw/brand-index.php?brand_id=211": # print("wrong site") # continue # webdriver.ActionChains(driver).move_to_element(el).perform() # webdriver.ActionChains(driver).move_to_element(el).click().perform() # print("Rank: " + str(n)) # time.sleep(15) # ''' unused # new_windows_count = add_tabs[random.randint(0,19)] # 
# print(str(new_windows_count) + " new tabs") # for i in range (0,new_windows_count): # print("Tab " + str(i+1)) # #original_window = driver.current_window_handle # #driver.switch_to.new_window('window') # #driver.get(href) # sleepoffset += 12 # driver.execute_script('window.open("'+href+'","_blank");') # driver.execute_script("window.scrollTo(0, 600)") # time.sleep(15) # #driver.close() # #driver.switch_to.window(original_window) # if domain in target_domain: # print("Target link found") # time_stamp = datetime.fromtimestamp(time.time()) # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S") # db['click_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "url": href, "content": txt, "extra_windows": '0'}) # ''' # break # '''if domain in target_domain: # print("Target domain found") # time_stamp = datetime.fromtimestamp(time.time()) # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S") # db['query_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "googleurl": googleurl, "element_count": len(elmts), "domain_in_link_count": domain_in_link}) # ''' # print(domain_in_link) # return 200 # def run_once(domain, target_domain, brands, query): # global driver # result=[] # options = webdriver.ChromeOptions() # options.add_argument('--headless') # # options.add_argument("--user-agent=" +user_agent) # options.add_argument("--incognito") # options.add_argument('--no-sandbox') # options.add_argument('--disable-dev-shm-usage') # driver = webdriver.Chrome( # options=options) # driver.delete_all_cookies() # driver.set_window_size(1400,1000) # statuscode = process_query(domain, target_domain, brands, query) # driver.quit() # return statuscode # #execution starts here # def execute(domain, target_domain, brands, query_list): # print("Ctrl+C or Ctrl+Z to stop.") # statuscode = 0 # st = timeit.default_timer() # try: # statuscode = run_once(domain, target_domain, brands, 
# random.choice(query_list)) # except: # traceback.print_exc() # timetaken = timeit.default_timer()-st # print("Time taken: " + str(timetaken)) # print("Process returned with " + str(statuscode)) # if statuscode == 444: # print("You have been caught!!!") # #notify("Clickbot " + brands[domain] + " has been caught by Google and will terminate. IP: ") # extrasleep = 0 # if(timetaken < 50): # extrasleep = 50 - timetaken # print("Ctrl+C or Ctrl+Z to stop now.") # print("You have " + str(10 + extrasleep) + " seconds.") # time.sleep(10 + extrasleep) # return statuscode