"""Periodically search Google for a brand keyword and click the brand's result.

Each cycle starts a headless Chrome session, submits a fixed query on
google.com, clicks the first organic result whose link contains the target
domain, and records rows in the ``click_results`` and ``query_results``
tables. The script then sleeps for a random interval and repeats forever.
"""
import codecs
import json
import os
import random
import sys
import time
import traceback
import urllib.parse
from datetime import datetime

import dataset
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Domains whose hits are recorded, and the brand name stored with each row.
target_domain = ['bennis.com.tw']
brands = {'bennis.com.tw': '班尼斯'}

# Shared WebDriver instance; (re)created by run_once() on every cycle.
driver = None

# NOTE(review): the bearer token is hard-coded in source; consider moving it
# to configuration or an environment variable.
headers = {
    "Authorization": "Bearer " + "t35vhZtWNgvDNWHc3DJh0OKll3mcB9GvC8K2EAkBug2",
    "Content-Type": "application/x-www-form-urlencoded"
}


def send_msg(kw):
    """Post a notification naming the keyword *kw* to the LINE Notify API."""
    params = {"message": "處理關鍵字: " + kw}
    r = requests.post("https://notify-api.line.me/api/notify", headers=headers, params=params)


def empty_query(q):
    """Load the Google results page for *q* and wait three seconds."""
    global driver
    googleurl = 'https://www.google.com/search?q=' + urllib.parse.quote(q)
    driver.get(googleurl)
    time.sleep(3)


def _now_string():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d %H:%M:%S")


def process_query():
    """Search Google for the brand keyword, click its link and log the outcome.

    Uses the module-level ``driver`` (which must already be started) and the
    module-level ``db`` connection. At most one result is clicked: the loop
    stops after the first qualifying link because the click navigates away
    from the results page.
    """
    global driver
    q = "班尼斯"
    domain = "bennis.com.tw"

    driver.get('https://www.google.com?num=100')
    time.sleep(3)
    print(driver.current_url)

    # find_elements_by_xpath() is deprecated starting with Selenium 4.3.0, so
    # the By-based locator API is used instead.
    # NOTE(review): this assumes the search box is an <input name="q">;
    # confirm against the current Google markup.
    elmt = driver.find_element(By.XPATH, "//input[@name='q']")
    time.sleep(1)
    elmt.send_keys(q)
    elmt.send_keys(Keys.ENTER)

    domain_in_link = 0
    googleurl = driver.current_url
    print(driver.current_url)

    elmts = driver.find_elements("xpath", "//div[@class='yuRUbf']/a")
    print(len(elmts))

    for el in elmts:
        href = el.get_attribute('href')
        txt = el.text
        # Skip results with a very short title, anchors without an href
        # (get_attribute may return None), and links to other domains.
        if len(txt) <= 10 or not href or domain not in href:
            continue

        domain_in_link += 1
        print('clicked....')
        print(href)
        print(txt)
        # Hover first, then click, in two separate action chains.
        webdriver.ActionChains(driver).move_to_element(el).perform()
        webdriver.ActionChains(driver).move_to_element(el).click().perform()
        time.sleep(6)

        if domain in target_domain:
            print("Target link found")
            db['click_results'].insert({
                "time_stamp": _now_string(),
                "brand": brands[domain],
                "domain": domain,
                "query": q,
                "url": href,
                "content": txt,
            })
        # The click leaves the results page, so the remaining elements are no
        # longer usable; stop after the first match.
        break

    if domain in target_domain:
        print("Target domain found")
        db['query_results'].insert({
            "time_stamp": _now_string(),
            "brand": brands[domain],
            "domain": domain,
            "query": q,
            "googleurl": googleurl,
            "element_count": len(elmts),
            "domain_in_link_count": domain_in_link,
        })

    print(domain_in_link)


def run_once():
    """Run one search-and-click cycle in a fresh headless Chrome session.

    The browser is always shut down, even when the query step raises, so a
    failing cycle does not leave a Chrome process behind.
    """
    global driver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument("--incognito")
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=options)
    try:
        driver.delete_all_cookies()
        driver.set_window_size(1400, 1000)
        process_query()
        time.sleep(3)
    finally:
        driver.quit()


# Execution starts here.
# NOTE(review): database credentials are hard-coded in the connection URL;
# consider loading them from configuration.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')

while True:
    try:
        run_once()
    except Exception:
        # Log and keep polling. KeyboardInterrupt/SystemExit are deliberately
        # not caught so the script can still be stopped.
        traceback.print_exc()
    sleepint = random.randint(35, 50)
    print("Completed (" + str(sleepint) + ")")
    time.sleep(sleepint)