"""Headless-Chrome Google search clicker driven by jobs stored in PostgreSQL.

The script loops forever.  On every cycle it reads one job row from
``public.seo_jobs_temp``, builds a search keyword from the job's ``prefix``,
``rnd`` and ``positive`` lists, opens the Google result page for that keyword
and clicks the result whose URL contains the job's domain.  The observed
ranking is written to the ``nda_log`` table.

NOTE(review): the original file had lost its line breaks and indentation; the
block structure below was reconstructed from the statement order.  Places
where the nesting was ambiguous are marked with ``NOTE(review)``.
"""

import codecs
import datetime
import json
import os
import random
import sys
import time
import traceback
import urllib.parse

import dataset
import pymysql
import requests
# import redis  (already disabled in the original file)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from userAgentRandomizer import userAgents

pymysql.install_as_MySQLdb()

# SECURITY NOTE(review): both secrets below were hard-coded in the original.
# They are kept as defaults so existing deployments keep working, but they
# can now be overridden from the environment and should be rotated and
# removed from source control.
LINE_NOTIFY_TOKEN = os.environ.get(
    "LINE_NOTIFY_TOKEN", "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi"
)
DB_URL = os.environ.get(
    "SEO_DB_URL",
    'postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres',
)

# OLD TABLE NAME: seo.seo_jobs
JOB_QUERY = "select json from public.seo_jobs_temp where cust='啟翔' and plan='形象SEO' and json like '%陳百欽%' order by random() limit 1"

# Module-level handles shared between the polling loop and run_once().
driver = None
db = None

headers = {
    "Authorization": "Bearer " + LINE_NOTIFY_TOKEN,
    "Content-Type": "application/x-www-form-urlencoded",
}

# Result URLs that must never be clicked.
blacklist = ['https://www.chinatimes.com/realtimenews/20220613003142-260402','https://ipo168.pixnet.net/blog/post/207626239-%E5%95%9F%E7%BF%94%E8%BC%95%E9%87%91%E5%B1%AC%E7%A7%91%E6%8A%80%E8%82%A1%E7%A5%A8%E6%98%AF%E9%80%99%E6%A8%A3%E7%9A%84%E5%85%AC%E5%8F%B8%21%21%E6%8A%95%E8%B3%87%E5%89%8D%E8%A6%81','https://latest.mediatagtw.com/article/%e5%95%9f%e7%bf%94%e8%bc%95%e9%87%91%e5%b1%ac%e7%a7%91%e6%8a%80%e8%82%a1%e4%bb%bd%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8#gsc.tab=0']


def send_msg(kw):
    """Post *kw* as the ``message`` of a LINE Notify request.

    A timeout is set so that an unresponsive endpoint cannot block the
    caller forever; ``requests`` exceptions propagate to the caller.
    """
    params = {"message": kw}
    requests.post(
        "https://notify-api.line.me/api/notify",
        headers=headers,
        params=params,
        timeout=10,
    )


def re_get_webdriver():
    """Replace the global ``driver`` with a fresh headless incognito Chrome.

    Any existing driver is quit and leftover ``chrome`` processes are killed
    first.  On failure the traceback is printed and ``driver`` is left as
    ``None``.  Always returns ``None``.
    """
    global driver
    if driver is not None:
        print('closing....')
        try:
            driver.quit()
        except Exception:
            # A dead session must not stop us from starting a new one.
            traceback.print_exc()
        os.system('killall chrome')
        print('quit....')
        driver = None
    try:
        options = webdriver.ChromeOptions()
        # options.add_argument("user-agent=%s" % user_agent)
        options.add_argument('--headless')
        options.add_argument("--incognito")
        driver = webdriver.Chrome(options=options)
        driver.delete_all_cookies()
        driver.set_window_size(1400, 1000)
    except Exception:
        traceback.print_exc()
        driver = None
    return None


def getDriver():
    """Create and return a new headless, incognito, no-sandbox Chrome driver."""
    options = webdriver.ChromeOptions()
    # A random user agent (userAgents().random()) used to be generated here,
    # but passing it to Chrome was already disabled in the original:
    # options.add_argument("user-agent="+ua)
    options.add_argument('--headless')
    options.add_argument('--incognito')
    options.add_argument('--no-sandbox')
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    return browser


def _hover(browser, elmt):
    """Move the mouse pointer over *elmt*."""
    webdriver.ActionChains(browser).move_to_element(elmt).perform()


def _hover_and_click(browser, elmt):
    """Hover over *elmt*, then move to it again and click it."""
    _hover(browser, elmt)
    webdriver.ActionChains(browser).move_to_element(elmt).click().perform()


def _scroll_through_page(browser, steps=4, pause=3):
    """Scroll the current page from top to bottom in roughly *steps* jumps."""
    page_height = browser.execute_script("return document.body.scrollHeight") or 0
    # A page shorter than `steps` pixels would give a zero step and the loop
    # below would never finish, so the step is clamped to at least one pixel.
    scroll_step = max(page_height // steps, 1)
    current_height = 0
    while current_height < page_height:
        browser.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
        time.sleep(pause)
        current_height += scroll_step


def _pick_allowed_result(elmts):
    """Return ``(element, href)`` for a random non-blacklisted result.

    Returns ``(None, None)`` when every result is blacklisted.  Walking a
    random permutation gives the same uniform choice as retrying
    ``random.choice`` but is guaranteed to terminate.
    """
    for candidate in random.sample(elmts, len(elmts)):
        href = candidate.get_attribute('href')
        if href not in blacklist:
            return candidate, href
    return None, None


def run_once(jsobj):
    """Run one Google search for ``jsobj['kw']`` and click a result.

    *jsobj* must contain ``kw`` and ``cust``.  When ``domain`` is present the
    first result whose URL contains it (and is not blacklisted) is clicked,
    its ranking is inserted into the ``nda_log`` table and the landing page
    is scrolled; if no such result exists a random non-blacklisted result is
    clicked instead.  When ``domain`` is absent, ``exclude`` must be an
    iterable of substrings and a random result past the first five whose URL
    contains none of them is clicked.

    Errors while browsing are printed, not raised.  The browser created for
    this run is always quit before returning.
    """
    global driver
    table = db['nda_log']
    print(jsobj)
    browser = getDriver()
    driver = browser
    try:
        kw = jsobj['kw']
        domain = jsobj.get('domain')
        exclude = jsobj['exclude'] if domain is None else None

        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
        browser.get(googleurl)
        time.sleep(6)
        print(browser.current_url)
        if 'sorry' in browser.current_url:
            # Google redirected to its "sorry" page instead of results.
            print("URL Error: Caught")
            return

        elmts = browser.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
        numresults = len(elmts)
        print('搜尋結果數量', numresults)
        if numresults == 0:
            return

        idx = 1
        found = False
        candidates = []  # (element, title) pairs usable in exclude mode
        for elmt in elmts:
            # A missing href would make the substring tests below raise.
            href = elmt.get_attribute('href') or ''
            txt = elmt.text
            if len(txt) <= 10:
                continue
            if domain is not None:
                if domain in href and href not in blacklist:
                    print('found....')
                    print('clicked....')
                    print(href)
                    print("ranking", idx)
                    found = True
                    _hover_and_click(browser, elmt)
                    table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})
                    time.sleep(5)
                    _scroll_through_page(browser)
                    time.sleep(10)
                    break
            elif not any(ee in href for ee in exclude):
                candidates.append((elmt, txt))
            # NOTE(review): assumed to count only results with a real title
            # (i.e. nested under the length check) -- confirm against the
            # original, properly indented file.
            idx += 1

        if exclude is not None:
            print('exclude')
            pool = candidates[5:]
            if not pool:
                # The original raised IndexError here; skip the click instead.
                print('no eligible result after the first five')
                return
            elmt, txt = random.choice(pool)
            print(elmt)
            print(txt)
            _hover(browser, elmt)
            elmt.click()
            time.sleep(5)
            # The click navigated away, so the collected elements are stale;
            # the random fallback below must not touch them.
            return

        # NOTE(review): assumed to be at function level, not nested in the
        # exclude branch.
        if not found:
            # don't waste resources, pick a random link as long as it is ok
            pick, href = _pick_allowed_result(elmts)
            if pick is None:
                print('every result is blacklisted')
                return
            print(href)
            _hover_and_click(browser, pick)
            time.sleep(10)
            #table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄','client':jsobj['cust']})
    except Exception:
        print('exception')
        traceback.print_exc()
    finally:
        browser.quit()


def _build_keyword(prefix, rnd, positive, required):
    """Return a random ``"<prefix> <rnd item> <positive item>"`` keyword.

    Only combinations containing *required* are eligible.  Returns ``None``
    when no combination qualifies, where the original retried forever.
    """
    eligible = [
        prefix + " " + middle + " " + tail
        for tail in positive
        for middle in rnd
        if required in prefix + " " + middle + " " + tail
    ]
    return random.choice(eligible) if eligible else None


def _release_resources():
    """Best-effort shutdown of the global browser and database connection."""
    global driver, db
    if driver is not None:
        try:
            driver.quit()
        except Exception:
            # run_once() normally quit this browser already; ignoring a
            # second failure is deliberate.
            pass
    driver = None
    if db is not None:
        try:
            db.close()
        except Exception:
            traceback.print_exc()
    db = None


# Polling loop: one job per cycle, 61 s pause after success, 20 s after an
# error.  `except Exception` (not a bare except) lets Ctrl-C stop the script.
while True:
    pause = 20
    try:
        db = dataset.connect(DB_URL)
        for row in db.query(JOB_QUERY):
            js = json.loads(row['json'])
            domain = js['domain'][0]
            kw = _build_keyword(js['prefix'], js['rnd'], js['positive'], '陳百欽')
            if kw is None:
                print('no keyword combination contains the required name')
                continue
            print(kw)
            run_once({'domain':domain,'kw':kw, 'cust':'啟翔'})
        print("Completed")
        pause = 61
    except Exception:
        traceback.print_exc()
        print("Execution Error")
    finally:
        # Also runs when dataset.connect() itself failed; db is then None.
        _release_resources()
    time.sleep(pause)