|
@@ -0,0 +1,377 @@
|
|
|
+import time
|
|
|
+from datetime import datetime
|
|
|
+import json
|
|
|
+from selenium import webdriver
|
|
|
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
+import time
|
|
|
+import os
|
|
|
+import urllib.parse
|
|
|
+from selenium.webdriver.support.ui import WebDriverWait
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+from selenium.webdriver.support import expected_conditions as EC
|
|
|
+import codecs
|
|
|
+import random
|
|
|
+import requests
|
|
|
+import dataset
|
|
|
+import traceback
|
|
|
+import sys
|
|
|
+from selenium.webdriver.common.keys import Keys
|
|
|
+import timeit
|
|
|
+import socket
|
|
|
+
|
|
|
+import random
|
|
|
+import re
|
|
|
+
|
|
|
+
|
|
|
+# import requests
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+from fake_useragent import UserAgent
|
|
|
+
|
|
|
ua = UserAgent()


def re_get_webdriver():
    """Open a new remote Chrome session on the local Selenium server.

    The hub URL is built from the module-level ``portnum``
    (``http://127.0.0.1:<portnum>/wd/hub``).  The new session is stored in the
    module-level ``driver`` and also returned.

    Returns:
        The ``webdriver.Remote`` instance that was just created.

    Raises:
        Nothing is caught here: any error raised while building the options
        or creating the remote session propagates to the caller.
    """
    global driver

    options = webdriver.ChromeOptions()
    # ``ua.random`` is evaluated on every call, so each session gets its own
    # user-agent string.
    options.add_argument("--user-agent=" + ua.random)
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--incognito")

    driver = webdriver.Remote(
        command_executor=f"http://127.0.0.1:{portnum}/wd/hub",
        options=options,
    )
    return driver
|
|
|
+# headers = {'user-agent': ua.chrome}
|
|
|
+# r = requests.get('https://house.ettoday.net/news/1492047', headers=headers)
|
|
|
+# print(r.text)
|
|
|
+
|
|
|
+
|
|
|
+# options.binary_location = ('C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe')
|
|
|
+# driverPath = './chromedriver.exe'
|
|
|
+
|
|
|
+# driver = webdriver.Firefox()
|
|
|
+# driver.get('https://google.com')
|
|
|
+
|
|
|
+# ettoday_url_list = ['https://house.ettoday.net/news/1492047',
|
|
|
+# 'https://house.ettoday.net/news/1492167',
|
|
|
+# 'https://house.ettoday.net/news/1492288',
|
|
|
+# 'https://house.ettoday.net/news/1492178',
|
|
|
+# 'https://house.ettoday.net/news/1492229',
|
|
|
+# 'https://house.ettoday.net/news/1492134',
|
|
|
+# 'https://house.ettoday.net/news/1492240',
|
|
|
+# 'https://house.ettoday.net/news/1492161',
|
|
|
+# 'https://house.ettoday.net/news/1492168',
|
|
|
+# 'https://house.ettoday.net/news/1492217']
|
|
|
+# for i in ettoday_url_list:
|
|
|
+
|
|
|
+# driver.get(i)
|
|
|
+# time.sleep(3)
|
|
|
+# elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[3]/p[1]/a')
|
|
|
+
|
|
|
+# webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
|
|
|
+# webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
|
|
|
+# print("cick:",i)
|
|
|
+# # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
|
|
|
+# time.sleep(random.randint(3,7))
|
|
|
+# driver.quit()
|
|
|
+
|
|
|
+# query='幸福空間'
|
|
|
+# elmt.send_keys(query)
|
|
|
+# elmt.send_keys(Keys.ENTER)
|
|
|
+# time.sleep(1)
|
|
|
+
|
|
|
+# time.sleep(1)
|
|
|
+
|
|
|
def _restart_selenium_container(port):
    """Recreate the ``p8809`` Selenium container, publishing it on *port*."""
    os.system('docker container stop p8809')
    time.sleep(5)
    os.system('docker container rm p8809')
    time.sleep(5)
    os.system('docker run -d -p ' + str(port) + ':4444 --name p8809 --dns 168.95.1.1 selenium/standalone-chrome:106.0')


def _quit_session(session):
    """Quit *session* if there is one; report (do not raise) a failed quit."""
    if session is None:
        return
    try:
        session.quit()
    except Exception:
        # A session that is already gone must not abort the caller's loop.
        traceback.print_exc()


def run_once():
    """Visit every URL in the built-in list once and click its target link.

    For each URL a fresh session is requested from ``re_get_webdriver()``.
    If that fails, a new random port is stored in the module-level
    ``portnum``, the Selenium container is recreated on it, ``count`` is reset
    to 0, ``bok`` is incremented, and the session is requested once more.

    Module-level state used: ``count`` (successful clicks since the last
    container restart), ``bok`` (number of container restarts) and
    ``portnum``.

    Any ``Exception`` raised while handling one URL is printed and followed by
    a 10 second pause; the loop then continues with the next URL.
    ``KeyboardInterrupt`` and ``SystemExit`` are *not* caught, so the script
    can be stopped; the open session is still quit on the way out.
    """
    global count
    global bok
    global portnum

    yahoo_url_list = [
        'https://house.yahoo.com.tw/%E9%9B%8D%E5%AE%B9%E9%9B%85%E7%B7%BB-%E5%84%AA%E9%9B%85%E5%81%87%E6%9C%9F-%E6%96%B0%E5%8F%A4%E5%85%B8-31%E5%9D%AA-020000499.html',
        'https://house.yahoo.com.tw/%E6%96%B0%E7%94%9F%E9%AD%85%E5%8A%9B-%E8%AD%9C%E5%AF%AB%E5%B9%B8%E7%A6%8F%E5%9C%93%E8%88%9E%E6%9B%B2-%E5%8C%97%E6%AD%90%E9%A2%A8-35%E5%9D%AA-020000759.html',
        'https://house.yahoo.com.tw/20%E5%B9%B4%E8%80%81%E5%AE%85%E9%87%8D%E7%94%9F-%E7%BE%8E%E5%BC%8F%E4%BD%8E%E5%A5%A2%E6%9C%89%E5%AE%B6%E7%9A%84%E6%BA%AB%E5%BA%A6-106%E5%9D%AA-020000087.html',
        'https://house.yahoo.com.tw/sheer-%E7%B4%94%E7%B2%B9-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000325.html',
        'https://house.yahoo.com.tw/%E8%AE%8A%E5%BD%A2%E8%88%87%E7%B5%84%E5%90%88-%E8%A4%87%E5%90%88%E5%BC%8F%E7%9A%84%E7%A9%BA%E9%96%93%E8%A8%AD%E8%A8%88-%E4%B8%AD-020000869.html',
        'https://house.yahoo.com.tw/%E8%A7%A3%E6%94%BE%E6%8B%98%E7%A6%81%E5%BF%83%E9%9D%88-%E8%B6%85%E8%84%AB%E7%8B%82%E6%83%B3%E9%80%8F%E5%A4%A9%E5%8E%9D-020000093.html',
        'https://house.yahoo.com.tw/%E8%A6%AA%E5%AD%90%E6%96%99%E7%90%86%E7%9B%B4%E6%92%AD%E4%B8%BB%E7%9A%84%E5%AE%B6-%E5%BE%AE%E7%BE%8E%E5%BC%8F%E8%A8%AD%E8%A8%88-50%E5%9D%AA-020000607.html',
        'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html',
        'https://house.yahoo.com.tw/%E7%8E%A9%E5%91%B3%E7%B3%BB%E7%B5%B1%E6%9D%BF-%E5%BF%AB%E9%80%9F%E6%88%90%E5%AE%B6%E7%B0%A1%E7%B4%84%E7%8F%BE%E4%BB%A3%E9%A2%A8-35%E5%9D%AA-020000199.html',
        'https://house.yahoo.com.tw/%E4%BB%A5%E5%9C%93%E5%BD%A2%E7%AC%A6%E7%A2%BC-%E5%BD%A2%E5%A1%91%E6%81%A2%E5%BC%98%E5%A5%A2%E7%BE%8E%E8%87%BB%E9%82%B8-%E5%A5%A2%E8%8F%AF%E9%A2%A8-42%E5%9D%AA-020000780.html',
    ]
    link_xpath = '//*[@id="maincontainer"]/main/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/div/div[1]/a'

    for url in yahoo_url_list:
        # Tracked explicitly so cleanup never touches an unbound or stale
        # session when session creation itself is what failed.
        session = None
        try:
            try:
                session = re_get_webdriver()
            except Exception:
                print('driver_bok')
                portnum = random.randint(4555, 4666)
                print(portnum)
                _restart_selenium_container(portnum)
                count = 0
                bok += 1
                time.sleep(5)
                session = re_get_webdriver()

            session.get(url)
            time.sleep(5)
            elmt_next = session.find_element(By.XPATH, link_xpath)

            webdriver.ActionChains(session).move_to_element(elmt_next).perform()
            webdriver.ActionChains(session).move_to_element(elmt_next).click().perform()
            print("cick!")
            count += 1
            print('click_all_time:', count, ';broken_time:', bok)
            time.sleep(random.randint(3, 7))
        except Exception:
            # Show what went wrong instead of discarding it.
            traceback.print_exc()
            _quit_session(session)
            session = None  # already closed; keeps ``finally`` from re-quitting
            print(url, 'error', ';broken_time:', bok)
            time.sleep(10)
        finally:
            # Runs on success and on KeyboardInterrupt/SystemExit as well.
            _quit_session(session)
|
|
|
# Counters that run_once() updates through ``global``:
# ``count`` = clicks since the last container restart, ``bok`` = restarts.
count = 0
bok = 0

# Host port on which the Selenium container's port 4444 is published.
portnum = random.randint(4555, 4666)
print(portnum)

# Recreate the ``p8809`` container on the chosen port, pausing 5 seconds
# after each docker command.
for _docker_cmd in (
    'docker container stop p8809',
    'docker container rm p8809',
    'docker run -d -p ' + str(portnum) + ':4444 --name p8809 --dns 168.95.1.1 selenium/standalone-chrome:106.0',
):
    os.system(_docker_cmd)
    time.sleep(5)

# Repeat the URL pass indefinitely.
while True:
    run_once()
|
|
|
+# elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
|
|
|
+# domain = 'hhh.com.tw'
|
|
|
+# idx=1
|
|
|
+# ranking=-1
|
|
|
+# domain_in_link = 0
|
|
|
+# print (len(elmts))
|
|
|
+# # driver.save_screenshot('c:/tmp/test.png')
|
|
|
+# n=0
|
|
|
+# for el in elmts:
|
|
|
+# n+=1
|
|
|
+# href=el.get_attribute('href')
|
|
|
+# txt=el.text
|
|
|
+# # print(txt)
|
|
|
+# if len(txt)>10:
|
|
|
+# if domain in href:
|
|
|
+# domain_in_link += 1
|
|
|
+# print('clicked....')
|
|
|
+# print('href:',href)
|
|
|
+# print('txt:',txt)
|
|
|
+# elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a')
|
|
|
+
|
|
|
+# webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
|
|
|
+# webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
|
|
|
+# time.sleep(2)
|
|
|
+# elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
|
|
|
+# domain = 'hhh.com.tw'
|
|
|
+# idx=1
|
|
|
+# ranking=-1
|
|
|
+# domain_in_link = 0
|
|
|
+# print (len(elmts))
|
|
|
+# # driver.save_screenshot('c:/tmp/test.png')
|
|
|
+# n=0
|
|
|
+# for el in elmts:
|
|
|
+# n+=1
|
|
|
+# href=el.get_attribute('href')
|
|
|
+# txt=el.text
|
|
|
+# # print(txt)
|
|
|
+# if len(txt)>10:
|
|
|
+# if domain in href:
|
|
|
+# domain_in_link += 1
|
|
|
+# print('clicked....')
|
|
|
+# print('href:',href)
|
|
|
+# print('txt:',txt)
|
|
|
+# elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]')
|
|
|
+
|
|
|
+# webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
|
|
|
+# webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
|
|
|
+# time.sleep(5)
|
|
|
+# for i in range(20):
|
|
|
+# try:
|
|
|
+# elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]')
|
|
|
+
|
|
|
+# webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
|
|
|
+# webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
|
|
|
+# time.sleep(5)
|
|
|
+# except:
|
|
|
+# time.sleep(200)
|
|
|
+# webdriver.ActionChains(driver).move_to_element(el).click().perform()
|
|
|
+# add_tabs = [7,9,11,13,15,7,9,11,13,15,7,9,11,13,15,7,9,11,13,15]
|
|
|
+
|
|
|
+# db = dataset.connect('mysql://<user>:<password>@<host>:3306/seo?charset=utf8mb4')  # credentials redacted: load them from the environment and rotate the exposed password
|
|
|
+# driver=None
|
|
|
+# headers = {
|
|
|
+# "Authorization": "Bearer " + "<LINE_NOTIFY_TOKEN>",  # token redacted: load it from the environment and revoke the exposed token
|
|
|
+# "Content-Type": "application/x-www-form-urlencoded"
|
|
|
+# }
|
|
|
+
|
|
|
+# sleepoffset = 0
|
|
|
+
|
|
|
+# def send_msg(kw):
|
|
|
+# params = {"message": "處理關鍵字: "+kw}
|
|
|
+# r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
|
|
|
+
|
|
|
+# def empty_query(q):
|
|
|
+# global driver
|
|
|
+# googleurl='https://www.google.com/search?q='+urllib.parse.quote(q)
|
|
|
+# driver.get(googleurl)
|
|
|
+# time.sleep(3)
|
|
|
+
|
|
|
+# def process_query(domain, target_domain, brands, query):
|
|
|
+# print(query)
|
|
|
+# sleepoffset = 0
|
|
|
+# global driver
|
|
|
+# if query == "艾立思" and "index" in target_domain:
|
|
|
+# driver.get('https://www.google.com/search?num=100&q=艾立思&rlz=1C1ONGR_zh-TWTW997TW997&ei=zjdUY_DBG9Lm-Abpgq84&start=0&sa=N&filter=0&ved=2ahUKEwjw4KeEvfT6AhVSM94KHWnBCwcQ8tMDegQIARAQ&cshid=1666463754367857&biw=1368&bih=761&dpr=2')
|
|
|
+# time.sleep(4)
|
|
|
+# else:
|
|
|
+# driver.get('https://www.google.com?num=100')
|
|
|
+# time.sleep(3)
|
|
|
+# print(driver.current_url)
|
|
|
+
|
|
|
+# # elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
|
|
|
+# # ABOVE METHOD IS DEPRECATED STARTING SELENIUM 4.3.0, USE THIS
|
|
|
+# #
|
|
|
+# elmt = driver.find_element(By.XPATH, "//input[@name='q']")
|
|
|
+# time.sleep(1)
|
|
|
+
|
|
|
+# elmt.send_keys(query)
|
|
|
+# elmt.send_keys(Keys.ENTER)
|
|
|
+
|
|
|
+# idx=1
|
|
|
+# ranking=-1
|
|
|
+# domain_in_link = 0
|
|
|
+
|
|
|
+# googleurl = driver.current_url
|
|
|
+# print(driver.current_url)
|
|
|
+
|
|
|
+# if "sorry" in googleurl:
|
|
|
+# return 444
|
|
|
+
|
|
|
+# elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a")
|
|
|
+
|
|
|
+
|
|
|
+# print (len(elmts))
|
|
|
+# # driver.save_screenshot('c:/tmp/test.png')
|
|
|
+# n=0
|
|
|
+# for el in elmts:
|
|
|
+# n+=1
|
|
|
+# href=el.get_attribute('href')
|
|
|
+# txt=el.text
|
|
|
+# if len(txt)>10:
|
|
|
+# if domain in href:
|
|
|
+# domain_in_link += 1
|
|
|
+# print('clicked....')
|
|
|
+# print(href)
|
|
|
+# print(txt)
|
|
|
+
|
|
|
+# if query == "艾立思" and "index" in target_domain and href != "https://hhh.com.tw/brand-index.php?brand_id=211":
|
|
|
+# print("wrong site")
|
|
|
+# continue
|
|
|
+
|
|
|
+# webdriver.ActionChains(driver).move_to_element(el).perform()
|
|
|
+# webdriver.ActionChains(driver).move_to_element(el).click().perform()
|
|
|
+# print("Rank: " + str(n))
|
|
|
+# time.sleep(15)
|
|
|
+
|
|
|
+# ''' unused
|
|
|
+# new_windows_count = add_tabs[random.randint(0,19)]
|
|
|
+# print(str(new_windows_count) + " new tabs")
|
|
|
+# for i in range (0,new_windows_count):
|
|
|
+# print("Tab " + str(i+1))
|
|
|
+# #original_window = driver.current_window_handle
|
|
|
+# #driver.switch_to.new_window('window')
|
|
|
+# #driver.get(href)
|
|
|
+# sleepoffset += 12
|
|
|
+# driver.execute_script('window.open("'+href+'","_blank");')
|
|
|
+# driver.execute_script("window.scrollTo(0, 600)")
|
|
|
+# time.sleep(15)
|
|
|
+# #driver.close()
|
|
|
+# #driver.switch_to.window(original_window)
|
|
|
+
|
|
|
+# if domain in target_domain:
|
|
|
+# print("Target link found")
|
|
|
+# time_stamp = datetime.fromtimestamp(time.time())
|
|
|
+# time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
+# db['click_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "url": href, "content": txt, "extra_windows": '0'})
|
|
|
+# '''
|
|
|
+# break
|
|
|
+
|
|
|
+# '''if domain in target_domain:
|
|
|
+# print("Target domain found")
|
|
|
+# time_stamp = datetime.fromtimestamp(time.time())
|
|
|
+# time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
+# db['query_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "googleurl": googleurl, "element_count": len(elmts), "domain_in_link_count": domain_in_link})
|
|
|
+# '''
|
|
|
+
|
|
|
+# print(domain_in_link)
|
|
|
+# return 200
|
|
|
+
|
|
|
+
|
|
|
+# def run_once(domain, target_domain, brands, query):
|
|
|
+# global driver
|
|
|
+# result=[]
|
|
|
+# options = webdriver.ChromeOptions()
|
|
|
+# options.add_argument('--headless')
|
|
|
+# # options.add_argument("--user-agent=" +user_agent)
|
|
|
+# options.add_argument("--incognito")
|
|
|
+# options.add_argument('--no-sandbox')
|
|
|
+# options.add_argument('--disable-dev-shm-usage')
|
|
|
+
|
|
|
+# driver = webdriver.Chrome(
|
|
|
+# options=options)
|
|
|
+
|
|
|
+# driver.delete_all_cookies()
|
|
|
+# driver.set_window_size(1400,1000)
|
|
|
+
|
|
|
+# statuscode = process_query(domain, target_domain, brands, query)
|
|
|
+# driver.quit()
|
|
|
+
|
|
|
+# return statuscode
|
|
|
+
|
|
|
+# #execution starts here
|
|
|
+
|
|
|
+# def execute(domain, target_domain, brands, query_list):
|
|
|
+# print("Ctrl+C or Ctrl+Z to stop.")
|
|
|
+# statuscode = 0
|
|
|
+# st = timeit.default_timer()
|
|
|
+# try:
|
|
|
+# statuscode = run_once(domain, target_domain, brands, random.choice(query_list))
|
|
|
+# except:
|
|
|
+# traceback.print_exc()
|
|
|
+# timetaken = timeit.default_timer()-st
|
|
|
+# print("Time taken: " + str(timetaken))
|
|
|
+
|
|
|
+# print("Process returned with " + str(statuscode))
|
|
|
+# if statuscode == 444:
|
|
|
+# print("You have been caught!!!")
|
|
|
+
|
|
|
+# #notify("Clickbot " + brands[domain] + " has been caught by Google and will terminate. IP: ")
|
|
|
+
|
|
|
+# extrasleep = 0
|
|
|
+# if(timetaken < 50):
|
|
|
+# extrasleep = 50 - timetaken
|
|
|
+# print("Ctrl+C or Ctrl+Z to stop now.")
|
|
|
+# print("You have " + str(10 + extrasleep) + " seconds.")
|
|
|
+# time.sleep(10 + extrasleep)
|
|
|
+# return statuscode
|