"""Search Google for a keyword, click the result for a domain, and log its rank.

Command-line usage (via ``fire``)::

    python <this file> get --kw "<keyword>" --domain "<domain>"
"""
import codecs
import datetime
import json
import os
import random
import sys
import time
import traceback
import urllib.parse

import dataset
import fire
import redis
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Shared Chrome driver: created by run_once() and consumed (then quit) by
# process_query().
driver = None


def _hover_and_click(element):
    """Move the mouse over *element*, pause three seconds, then click it."""
    webdriver.ActionChains(driver).move_to_element(element).perform()
    time.sleep(3)
    webdriver.ActionChains(driver).move_to_element(element).click().perform()


def process_query(qs):
    """Run one Google search and record/click the matching result.

    Args:
        qs: Sequence whose first item is the search keyword and whose second
            item is the domain to look for among the result links.

    When the keyword contains ``'site'``, the first result is logged and
    clicked. Otherwise the first result whose title is longer than 10
    characters and whose URL contains the domain is clicked and logged with
    its 1-based position. The database connection and the global ``driver``
    are always released, even when a step raises.
    """
    global driver
    q = qs[0]
    domain = qs[1]

    try:
        # NOTE(review): credentials are hard-coded in the connection URL;
        # consider moving them to configuration.
        db = dataset.connect(
            'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
        try:
            table = db['general_log']

            googleurl = 'https://www.google.com/?num=100'
            driver.get(googleurl)
            time.sleep(6)

            # NOTE(review): absolute XPath to the search box; it breaks
            # whenever the page layout changes.
            send_kw_elmt = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                     '/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')))
            send_kw_elmt.send_keys(q)
            time.sleep(3)
            send_kw_elmt.send_keys(Keys.ENTER)
            time.sleep(6)
            print(driver.current_url)

            elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
            print('網頁數量', len(elmts))
            # Debugging aid: driver.save_screenshot('c:/tmp/test.png')

            if 'site' in q:
                if not elmts:
                    # Previously this raised IndexError on an empty page.
                    print('no search results found')
                else:
                    first = elmts[0]
                    href = first.get_attribute('href')
                    txt = first.text
                    print('clicked....')
                    print(href)
                    print(txt)
                    print("ranking", 1)
                    table.insert(
                        {'kw': q, 'domain': domain, 'ranking': 1,
                         'title': txt, 'url': href,
                         'dt': datetime.datetime.now()})
                    _hover_and_click(first)
                    time.sleep(5)
            else:
                # idx is the 1-based position among all result links.
                for idx, elmt in enumerate(elmts, start=1):
                    href = elmt.get_attribute('href')
                    txt = elmt.text
                    # href may be None for anchors without the attribute.
                    if len(txt) > 10 and href and domain in href:
                        print('clicked....')
                        print('點擊網址', href)
                        print('標題', txt)
                        print("ranking", idx)
                        _hover_and_click(elmt)
                        table.insert(
                            {'kw': q, 'domain': domain, 'ranking': idx,
                             'title': txt, 'url': href,
                             'dt': datetime.datetime.now()})
                        time.sleep(5)
                        break
        finally:
            db.close()
            print('資料庫關閉')
    finally:
        # Always terminate Chrome so failed runs do not leak processes.
        driver.quit()


def run_once(q):
    """Start a headless, proxied Chrome session and process one query.

    Args:
        q: ``(keyword, domain)`` pair forwarded to :func:`process_query`.

    The created driver is stored in the module-level ``driver``;
    :func:`process_query` quits it when it finishes.
    """
    global driver
    s = Service('/root/driver/chromedriver')
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument('--headless')
    options.add_argument("--incognito")

    # NOTE(review): a proxy list is fetched from Redis and one entry is picked
    # at random, but only the fixed `i5` proxy is passed to Chrome. The lookup
    # is kept unchanged; confirm whether `change_ip` was meant to be used.
    r = redis.Redis(host='db.ptt.cx', port=6379, db=2, password='choozmo9')
    data = r.get('google_proxy')
    jstext = data.decode('utf-8')
    jsobj = json.loads(jstext)
    proxy = random.choice(jsobj)
    i5 = "--proxy-server=socks5://172.104.93.163:41800"
    change_ip_list = ['--proxy-server=%s' % proxy,
                      "--proxy-server=socks5://127.0.0.1:9050",
                      "--proxy-server=socks5://192.53.174.202:8180"]
    change_ip = random.choice(change_ip_list)
    options.add_argument(i5)

    driver = webdriver.Chrome(options=options, service=s)
    try:
        driver.delete_all_cookies()
        driver.set_window_size(1400, 1000)
    except Exception:
        # process_query() never runs in this case, so clean up here.
        driver.quit()
        raise
    process_query(q)


class JParams(object):
    """Command-line entry points exposed through ``fire``."""

    def get(self, kw, domain):
        """Search for *kw* and click/log the result belonging to *domain*."""
        run_once((kw, domain))


if __name__ == '__main__':
    fire.Fire(JParams)