123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404 |
- import time
- from datetime import datetime
- import json
- from selenium import webdriver
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import time
- import os
- import urllib.parse
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
- import codecs
- import random
- import requests
- import dataset
- import traceback
- import sys
- from selenium.webdriver.common.keys import Keys
- import timeit
- import socket
- import random
- import re
- # import requests
- from fake_useragent import UserAgent
- ua = UserAgent()  # shared fake_useragent.UserAgent instance; re_get_webdriver() reads ua.random from it for the --user-agent flag
def re_get_webdriver():
    """Open a new headless Chrome session on the local Selenium server.

    Chrome is started with a User-Agent taken from ``ua.random`` plus the
    ``--no-sandbox``, ``--headless`` and ``--incognito`` flags, and the
    session is requested from ``http://127.0.0.1:<portnum>/wd/hub`` where
    ``portnum`` is the module-level port number.

    Side effects:
        The new session is also stored in the module-level ``driver``
        global. A session previously stored there is overwritten without
        being quit, so callers must ``quit()`` the driver they receive.

    Returns:
        The ``webdriver.Remote`` instance that was created.

    Raises:
        Nothing is caught here; any exception raised while creating the
        remote session propagates to the caller.
    """
    global driver

    options = webdriver.ChromeOptions()
    options.add_argument("--user-agent=" + ua.random)
    for flag in ("--no-sandbox", "--headless", "--incognito"):
        options.add_argument(flag)

    # ``portnum`` is only read in this function, so it needs no ``global``
    # declaration; it is looked up in the module namespace at call time.
    driver = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(portnum) + '/wd/hub',
        options=options,
    )
    return driver
- # headers = {'user-agent': ua.chrome}
- # r = requests.get('https://house.ettoday.net/news/1492047', headers=headers)
- # print(r.text)
- # options.binary_location = ('C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe')
- # driverPath = './chromedriver.exe'
- # driver = webdriver.Firefox()
- # driver.get('https://google.com')
- # ettoday_url_list = ['https://house.ettoday.net/news/1492047',
- # 'https://house.ettoday.net/news/1492167',
- # 'https://house.ettoday.net/news/1492288',
- # 'https://house.ettoday.net/news/1492178',
- # 'https://house.ettoday.net/news/1492229',
- # 'https://house.ettoday.net/news/1492134',
- # 'https://house.ettoday.net/news/1492240',
- # 'https://house.ettoday.net/news/1492161',
- # 'https://house.ettoday.net/news/1492168',
- # 'https://house.ettoday.net/news/1492217']
- # for i in ettoday_url_list:
-
- # driver.get(i)
- # time.sleep(3)
- # elmt_next = driver.find_element(By.XPATH, '//*[@id="house"]/div[3]/div[2]/div[6]/div/div/div[1]/article/div/div[3]/p[1]/a')
- # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
- # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
- # print("cick:",i)
- # # elmt = driver.find_element(By.XPATH, '//*[@id="yschsp"]')
- # time.sleep(random.randint(3,7))
- # driver.quit()
- # query='幸福空間'
- # elmt.send_keys(query)
- # elmt.send_keys(Keys.ENTER)
- # time.sleep(1)
- # time.sleep(1)
def _restart_selenium_container():
    """Recreate the ``p8808`` Selenium container on a new random host port.

    Picks a port in 8555..8777, stores it in the module-level ``portnum``
    and prints it, then stops and removes the container named ``p8808``
    and starts a new one with that host port mapped to container port 4444.
    """
    global portnum
    portnum = random.randint(8555, 8777)
    print(portnum)
    os.system('docker container stop p8808')
    time.sleep(5)
    os.system('docker container rm p8808')
    time.sleep(5)
    os.system('docker run -d -p '+str(portnum)+':4444 --name p8808 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')


def run_once():
    """Visit every URL in the hard-coded list once and click its header link.

    For each URL a fresh browser session is obtained from
    ``re_get_webdriver()``. If that fails, the Selenium container is
    recreated on a new port and the session is requested a second time.
    The page is loaded, the anchor located by a fixed absolute XPath
    (ending in ``article/header/div[1]/a``) is hovered and clicked, and the
    session is quit after a random 3-7 second pause.

    Module-level state:
        ``count`` is incremented for every successful click and reset to 0
        whenever the container is recreated; ``bok`` is incremented on each
        container recreation; ``portnum`` is reassigned by the restart
        helper.

    Failures while handling one URL are reported (with traceback) and the
    loop moves on to the next URL after a 10 second pause. Only
    ``Exception`` subclasses are handled, so ``KeyboardInterrupt`` and
    ``SystemExit`` propagate and the script can be stopped with Ctrl+C.
    """
    global count
    global bok

    yahoo_url_list = [
        'https://house.yahoo.com.tw/%E9%9B%8D%E5%AE%B9%E9%9B%85%E7%B7%BB-%E5%84%AA%E9%9B%85%E5%81%87%E6%9C%9F-%E6%96%B0%E5%8F%A4%E5%85%B8-31%E5%9D%AA-020000499.html',
        'https://house.yahoo.com.tw/%E6%96%B0%E7%94%9F%E9%AD%85%E5%8A%9B-%E8%AD%9C%E5%AF%AB%E5%B9%B8%E7%A6%8F%E5%9C%93%E8%88%9E%E6%9B%B2-%E5%8C%97%E6%AD%90%E9%A2%A8-35%E5%9D%AA-020000759.html',
        'https://house.yahoo.com.tw/%E7%AF%89-%E6%96%B9%E8%B3%AA%E7%B0%A1%E5%85%89%E5%AF%93-%E4%BA%AB%E5%8F%97%E6%81%AC%E9%9D%9C%E6%BA%AB%E9%A6%A8%E6%97%A5%E5%B8%B8-%E4%BA%BA%E6%96%87%E9%A3%AF%E5%BA%97%E9%A2%A8-45%E5%9D%AA-020000682.html',
        'https://house.yahoo.com.tw/sheer-%E7%B4%94%E7%B2%B9-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000325.html',
        'https://house.yahoo.com.tw/%E8%AE%8A%E5%BD%A2%E8%88%87%E7%B5%84%E5%90%88-%E8%A4%87%E5%90%88%E5%BC%8F%E7%9A%84%E7%A9%BA%E9%96%93%E8%A8%AD%E8%A8%88-%E4%B8%AD-020000869.html',
        # The comma after the next entry was missing, which made Python
        # concatenate it with the following URL into one invalid address.
        'https://house.yahoo.com.tw/%E8%A7%A3%E6%94%BE%E6%8B%98%E7%A6%81%E5%BF%83%E9%9D%88-%E8%B6%85%E8%84%AB%E7%8B%82%E6%83%B3%E9%80%8F%E5%A4%A9%E5%8E%9D-020000093.html',
        'https://house.yahoo.com.tw/%E8%A6%AA%E5%AD%90%E6%96%99%E7%90%86%E7%9B%B4%E6%92%AD%E4%B8%BB%E7%9A%84%E5%AE%B6-%E5%BE%AE%E7%BE%8E%E5%BC%8F%E8%A8%AD%E8%A8%88-50%E5%9D%AA-020000607.html',
        'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html',
        'https://house.yahoo.com.tw/%E7%8E%A9%E5%91%B3%E7%B3%BB%E7%B5%B1%E6%9D%BF-%E5%BF%AB%E9%80%9F%E6%88%90%E5%AE%B6%E7%B0%A1%E7%B4%84%E7%8F%BE%E4%BB%A3%E9%A2%A8-35%E5%9D%AA-020000199.html',
        'https://house.yahoo.com.tw/%E4%BB%A5%E5%9C%93%E5%BD%A2%E7%AC%A6%E7%A2%BC-%E5%BD%A2%E5%A1%91%E6%81%A2%E5%BC%98%E5%A5%A2%E7%BE%8E%E8%87%BB%E9%82%B8-%E5%A5%A2%E8%8F%AF%E9%A2%A8-42%E5%9D%AA-020000780.html',
        'https://house.yahoo.com.tw/%E7%B4%99%E9%9B%95-%E7%8F%BE%E4%BB%A3%E9%A2%A8-30%E5%9D%AA-020000034.html',
        'https://house.yahoo.com.tw/%E6%8C%91%E9%AB%98%E6%97%A5%E7%B3%BB%E8%BE%A6%E5%85%AC%E7%A9%BA%E9%96%93-%E7%B5%90%E5%90%88%E4%BE%98%E5%AF%82%E8%88%87%E8%87%AA%E7%84%B6%E7%9A%84%E7%B0%A1%E7%B4%84%E7%BE%8E%E5%AD%B8-230%E5%9D%AA-020000517.html',
        'https://house.yahoo.com.tw/35%E5%9D%AA%E8%80%81%E5%B1%8B%E5%A5%BD%E5%B1%8B%E6%B3%81-%E9%80%B2%E5%8C%96%E8%B3%AA%E6%84%9F%E6%A9%9F%E8%83%BD%E5%AE%85-%E7%8F%BE%E4%BB%A3%E9%A2%A8-020000438.html',
        'https://house.yahoo.com.tw/%E6%BA%AB%E8%98%8A%E9%9F%B6%E5%85%89-%E7%8F%BE%E4%BB%A3%E9%A2%A8-25%E5%9D%AA-020000337.html',
        'https://house.yahoo.com.tw/home-%E6%B7%B7%E6%90%AD%E9%A2%A8-020000440.html',
        'https://house.yahoo.com.tw/%E6%B8%B2%E6%9F%93%E6%9D%B1%E6%96%B9%E8%B3%AA%E9%9F%BB-%E4%BA%A4%E7%B9%94%E7%8F%BE%E4%BB%A3%E6%99%AF%E7%B7%BB-70%E5%9D%AA-020000667.html',
        'https://house.yahoo.com.tw/%E5%A4%A7%E8%86%BD%E8%B7%B3%E8%84%AB%E6%85%A3%E5%B8%B8%E6%80%9D%E7%B6%AD-35%E5%9D%AA%E8%80%81%E6%88%BF%E6%BC%94%E7%B9%B9%E9%A0%82%E7%B4%9A%E9%A3%AF%E5%BA%97%E8%B3%AA%E6%84%9F%E6%9C%83%E6%89%80-020000172.html',
        'https://house.yahoo.com.tw/%E9%9D%88%E5%B7%A7%E5%85%89%E6%BD%A4-%E6%81%AC%E8%AC%90%E5%AE%B6%E5%B1%8B-%E7%B6%93%E5%85%B8%E5%8C%97%E6%AD%90%E9%A2%A8-8%E5%9D%AA-020000645.html',
        'https://house.yahoo.com.tw/%E5%82%B3%E9%81%9E-%E6%BA%AB%E6%BD%A4%E7%94%9F%E6%B4%BB%E6%B0%A3%E6%81%AF-%E7%8F%BE%E4%BB%A3%E9%A2%A8-18%E5%9D%AA-020000206.html',
        # NOTE(review): identical to the ...-020000334.html entry above;
        # kept as in the original -- confirm the duplicate is intentional.
        'https://house.yahoo.com.tw/%E5%82%B3%E9%81%94%E6%B7%B1%E8%89%B2%E6%BA%AB%E5%BA%A6-%E8%8B%B1%E5%80%AB%E7%B4%B3%E5%A3%AB%E8%B2%B4%E6%97%8F%E9%A2%A8-%E7%8F%BE%E4%BB%A3%E5%A5%A2%E8%8F%AF%E9%A2%A8-020000334.html',
    ]

    for url in yahoo_url_list:
        # Reset per iteration so the error path never touches a session
        # that belongs to (and was already quit in) an earlier iteration.
        driver = None
        try:
            try:
                driver = re_get_webdriver()
            except Exception:
                # Any failure to open a session is treated as "server is
                # down": recreate the container and try exactly once more.
                print('driver_bok')
                _restart_selenium_container()
                count = 0
                bok += 1
                time.sleep(5)
                driver = re_get_webdriver()

            driver.get(url)
            time.sleep(5)
            elmt_next = driver.find_element(By.XPATH, '/html/body/div[3]/div/main/div/div[1]/div/div/div/div/article/header/div[1]/a')
            # Hover first, then hover-and-click, as two separate action chains.
            webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
            webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
            print("cick!")
            count += 1
            print('click_all_time:', count, ';broken_time:', bok)
            time.sleep(random.randint(3, 7))
            driver.quit()
        except Exception:
            # ``Exception`` (not a bare except) so Ctrl+C still stops the bot.
            if driver is None:
                print('no have driver')
            else:
                try:
                    driver.quit()
                except Exception:
                    print('no have driver')
            print(url, 'error', ';broken_time:', bok)
            traceback.print_exc()
            time.sleep(10)
# ---------------------------------------------------------------------------
# Script entry: (re)create the Selenium container, then run passes forever.
# ---------------------------------------------------------------------------

# Host port the Selenium hub is published on; read by re_get_webdriver()
# and reassigned by run_once() when it recreates the container.
portnum = random.randint(8555, 8777)
print(portnum)

# Remove any existing container named p8808 before starting a new one.
os.system('docker container stop p8808')
time.sleep(5)
os.system('docker container rm p8808')
time.sleep(5)
os.system('docker run -d -p '+str(portnum)+':4444 --name p8808 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')

count = 0  # successful clicks; run_once() resets it when it recreates the container
bok = 0    # failure counter; bumped by run_once() on container recreation and below
time.sleep(5)  # presumably gives the new container time to start accepting sessions

while True:
    try:
        run_once()
    except Exception:
        # Catch ``Exception`` rather than everything: a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit and make this infinite
        # loop impossible to stop with Ctrl+C.
        bok += 1
        print('broken')
        time.sleep(5)
- # elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
- # domain = 'hhh.com.tw'
- # idx=1
- # ranking=-1
- # domain_in_link = 0
- # print (len(elmts))
- # # driver.save_screenshot('c:/tmp/test.png')
- # n=0
- # for el in elmts:
- # n+=1
- # href=el.get_attribute('href')
- # txt=el.text
- # # print(txt)
- # if len(txt)>10:
- # if domain in href:
- # domain_in_link += 1
- # print('clicked....')
- # print('href:',href)
- # print('txt:',txt)
- # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a')
- # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
- # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
- # time.sleep(2)
- # elmts=driver.find_elements("xpath",'//*[@id="web"]/ol/li/div/div[1]/h3/a')
- # domain = 'hhh.com.tw'
- # idx=1
- # ranking=-1
- # domain_in_link = 0
- # print (len(elmts))
- # # driver.save_screenshot('c:/tmp/test.png')
- # n=0
- # for el in elmts:
- # n+=1
- # href=el.get_attribute('href')
- # txt=el.text
- # # print(txt)
- # if len(txt)>10:
- # if domain in href:
- # domain_in_link += 1
- # print('clicked....')
- # print('href:',href)
- # print('txt:',txt)
- # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]')
- # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
- # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
- # time.sleep(5)
- # for i in range(20):
- # try:
- # elmt_next = driver.find_element(By.XPATH, '//*[@id="left"]/div/ol/li[1]/div/div/a[2]')
- # webdriver.ActionChains(driver).move_to_element(elmt_next).perform()
- # webdriver.ActionChains(driver).move_to_element(elmt_next).click().perform()
- # time.sleep(5)
- # except:
- # time.sleep(200)
- # webdriver.ActionChains(driver).move_to_element(el).click().perform()
- # add_tabs = [7,9,11,13,15,7,9,11,13,15,7,9,11,13,15,7,9,11,13,15]
- # db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
- # driver=None
- # headers = {
- # "Authorization": "Bearer " + "t35vhZtWNgvDNWHc3DJh0OKll3mcB9GvC8K2EAkBug2",
- # "Content-Type": "application/x-www-form-urlencoded"
- # }
- # sleepoffset = 0
- # def send_msg(kw):
- # params = {"message": "處理關鍵字: "+kw}
- # r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
- # def empty_query(q):
- # global driver
- # googleurl='https://www.google.com/search?q='+urllib.parse.quote(q)
- # driver.get(googleurl)
- # time.sleep(3)
- # def process_query(domain, target_domain, brands, query):
- # print(query)
- # sleepoffset = 0
- # global driver
- # if query == "艾立思" and "index" in target_domain:
- # driver.get('https://www.google.com/search?num=100&q=艾立思&rlz=1C1ONGR_zh-TWTW997TW997&ei=zjdUY_DBG9Lm-Abpgq84&start=0&sa=N&filter=0&ved=2ahUKEwjw4KeEvfT6AhVSM94KHWnBCwcQ8tMDegQIARAQ&cshid=1666463754367857&biw=1368&bih=761&dpr=2')
- # time.sleep(4)
- # else:
- # driver.get('https://www.google.com?num=100')
- # time.sleep(3)
- # print(driver.current_url)
- # # elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
- # # ABOVE METHOD IS DEPRECATED STARTING SELENIUM 4.3.0, USE THIS
- # #
- # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
- # time.sleep(1)
- # elmt.send_keys(query)
- # elmt.send_keys(Keys.ENTER)
- # idx=1
- # ranking=-1
- # domain_in_link = 0
- # googleurl = driver.current_url
- # print(driver.current_url)
- # if "sorry" in googleurl:
- # return 444
- # elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a")
- # print (len(elmts))
- # # driver.save_screenshot('c:/tmp/test.png')
- # n=0
- # for el in elmts:
- # n+=1
- # href=el.get_attribute('href')
- # txt=el.text
- # if len(txt)>10:
- # if domain in href:
- # domain_in_link += 1
- # print('clicked....')
- # print(href)
- # print(txt)
- # if query == "艾立思" and "index" in target_domain and href != "https://hhh.com.tw/brand-index.php?brand_id=211":
- # print("wrong site")
- # continue
-
- # webdriver.ActionChains(driver).move_to_element(el).perform()
- # webdriver.ActionChains(driver).move_to_element(el).click().perform()
- # print("Rank: " + str(n))
- # time.sleep(15)
- # ''' unused
- # new_windows_count = add_tabs[random.randint(0,19)]
- # print(str(new_windows_count) + " new tabs")
- # for i in range (0,new_windows_count):
- # print("Tab " + str(i+1))
- # #original_window = driver.current_window_handle
- # #driver.switch_to.new_window('window')
- # #driver.get(href)
- # sleepoffset += 12
- # driver.execute_script('window.open("'+href+'","_blank");')
- # driver.execute_script("window.scrollTo(0, 600)")
- # time.sleep(15)
- # #driver.close()
- # #driver.switch_to.window(original_window)
-
- # if domain in target_domain:
- # print("Target link found")
- # time_stamp = datetime.fromtimestamp(time.time())
- # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
- # db['click_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "url": href, "content": txt, "extra_windows": '0'})
- # '''
- # break
- # '''if domain in target_domain:
- # print("Target domain found")
- # time_stamp = datetime.fromtimestamp(time.time())
- # time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
- # db['query_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": query, "googleurl": googleurl, "element_count": len(elmts), "domain_in_link_count": domain_in_link})
- # '''
- # print(domain_in_link)
- # return 200
-
- # def run_once(domain, target_domain, brands, query):
- # global driver
- # result=[]
- # options = webdriver.ChromeOptions()
- # options.add_argument('--headless')
- # # options.add_argument("--user-agent=" +user_agent)
- # options.add_argument("--incognito")
- # options.add_argument('--no-sandbox')
- # options.add_argument('--disable-dev-shm-usage')
- # driver = webdriver.Chrome(
- # options=options)
- # driver.delete_all_cookies()
- # driver.set_window_size(1400,1000)
- # statuscode = process_query(domain, target_domain, brands, query)
- # driver.quit()
- # return statuscode
- # #execution starts here
- # def execute(domain, target_domain, brands, query_list):
- # print("Ctrl+C or Ctrl+Z to stop.")
- # statuscode = 0
- # st = timeit.default_timer()
- # try:
- # statuscode = run_once(domain, target_domain, brands, random.choice(query_list))
- # except:
- # traceback.print_exc()
- # timetaken = timeit.default_timer()-st
- # print("Time taken: " + str(timetaken))
-
- # print("Process returned with " + str(statuscode))
- # if statuscode == 444:
- # print("You have been caught!!!")
-
- # #notify("Clickbot " + brands[domain] + " has been caught by Google and will terminate. IP: ")
- # extrasleep = 0
- # if(timetaken < 50):
- # extrasleep = 50 - timetaken
- # print("Ctrl+C or Ctrl+Z to stop now.")
- # print("You have " + str(10 + extrasleep) + " seconds.")
- # time.sleep(10 + extrasleep)
- # return statuscode
|