"""Headless-Chrome Google search automation for SEO click-through jobs.

Each run picks one job row from ``public.seo_jobs_temp``, builds a search
keyword from it, searches Google in headless Chrome, clicks the first result
whose URL contains the job's target domain, and records the result's rank in
the ``nda_log`` table.  The ``__main__`` loop repeats this forever, running
every attempt in a separate process with a hard time limit.
"""
import codecs
import datetime
import json
import multiprocessing
import os
import random
import sys
import time
import traceback
import urllib.parse

import dataset
import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

pymysql.install_as_MySQLdb()

# Imported after install_as_MySQLdb() so the import order relative to the
# MySQLdb shim is unchanged.
from userAgentRandomizer import userAgents  # noqa: E402

# Browser shared by run_once(), exe() and cleanup(); None when no browser is open.
driver = None
# 0 while a browser opened by run_once() may still be running, 1 once it was quit.
driverclosed = 0

# NOTE(review): the LINE Notify token below and the database URL further down
# are credentials committed in source -- move them to configuration and rotate.
headers = {
    "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
    "Content-Type": "application/x-www-form-urlencoded"
}

# Result URLs that must never be clicked, even when they match the target domain.
blacklist = [
    'https://www.chinatimes.com/realtimenews/20220613003142-260402',
    'https://ipo168.pixnet.net/blog/post/207626239-%E5%95%9F%E7%BF%94%E8%BC%95%E9%87%91%E5%B1%AC%E7%A7%91%E6%8A%80%E8%82%A1%E7%A5%A8%E6%98%AF%E9%80%99%E6%A8%A3%E7%9A%84%E5%85%AC%E5%8F%B8%21%21%E6%8A%95%E8%B3%87%E5%89%8D%E8%A6%81',
    'https://latest.mediatagtw.com/article/%e5%95%9f%e7%bf%94%e8%bc%95%e9%87%91%e5%b1%ac%e7%a7%91%e6%8a%80%e8%82%a1%e4%bb%bd%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8#gsc.tab=0',
]

_DB_URL = 'postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres'
# Every generated keyword must contain this name.
_REQUIRED_NAME = '陳百欽'
_CUSTOMER = '啟翔'
_RESULT_LINK_XPATH = "//div[@class='yuRUbf']//a"
_RUN_TIMEOUT_SECONDS = 120


def send_msg(kw):
    """Send *kw* as a message through the LINE Notify HTTP API."""
    params = {"message": kw}
    # A timeout keeps a stalled connection from blocking the caller forever.
    requests.post("https://notify-api.line.me/api/notify",
                  headers=headers, params=params, timeout=10)


def _quit_driver():
    """Quit the shared browser if one is open and reset the module state."""
    global driver, driverclosed
    if driver is not None:
        try:
            driver.quit()
        except Exception:
            # Best effort: the browser may already be gone.
            print('driver.quit() failed')
            traceback.print_exc()
    driver = None
    driverclosed = 1


def re_get_webdriver():
    """Replace the shared browser with a fresh headless, incognito Chrome.

    Any browser that is currently open is quit and every ``chrome`` process on
    the host is killed first.  On failure the traceback is printed and the
    shared ``driver`` is left as ``None``.  Always returns ``None``.
    """
    global driver
    if driver is not None:
        print('closing....')
        driver.quit()
        os.system('killall chrome')
        # NOTE(review): placement of this message inside the branch was
        # reconstructed from flattened source -- confirm.
        print('quit....')
    driver = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument("--incognito")
        driver = webdriver.Chrome(options=options)
        driver.delete_all_cookies()
        driver.set_window_size(1400, 1000)
    except Exception:
        traceback.print_exc()
        # Do not leak a browser that started but could not be configured.
        _quit_driver()
    return None


def getDriver():
    """Create and return a new headless, incognito Chrome sized 1400x1000."""
    # NOTE: no randomised user agent is applied; Chrome's default is used.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--incognito')
    options.add_argument('--no-sandbox')
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    return browser


def _scroll_through_page(browser):
    """Scroll the current page from top to bottom in about four steps."""
    page_height = browser.execute_script("return document.body.scrollHeight") or 0
    # A step of at least 1 guarantees termination for pages shorter than 4px.
    scroll_step = max(1, page_height // 4)
    current_height = 0
    while current_height < page_height:
        browser.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
        time.sleep(3)
        current_height += scroll_step


def _search_and_click(browser, jsobj, table):
    """Search Google for ``jsobj['kw']`` and click one result.

    With ``jsobj['domain']`` set, the first result whose URL contains the
    domain (and is not blacklisted) is clicked, logged to *table* and scrolled
    through.  Without it, a random result that matches none of the
    ``jsobj['exclude']`` substrings is clicked.
    """
    kw = jsobj['kw']
    domain = jsobj.get('domain')
    exclude = jsobj['exclude'] if domain is None else None

    googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
        urllib.parse.quote(kw), 100, 'zh-TW')
    browser.get(googleurl)
    time.sleep(6)
    print(browser.current_url)
    if 'sorry' in browser.current_url:
        # Google redirected to its "sorry" page instead of showing results.
        print("URL Error: Caught")
        return

    elmts = browser.find_elements(By.XPATH, _RESULT_LINK_XPATH)
    numresults = len(elmts)
    print('搜尋結果數量', numresults)
    if numresults == 0:
        return

    idx = 1
    candidates = []
    for elmt in elmts:
        href = elmt.get_attribute('href') or ''
        txt = elmt.text
        # Anchors with little or no text are not counted as results.
        if len(txt) <= 10:
            continue
        if domain is not None:
            if domain in href and href not in blacklist:
                print('found....')
                print('clicked....')
                print(href)
                print("ranking", idx)
                webdriver.ActionChains(browser).move_to_element(elmt).perform()
                webdriver.ActionChains(browser).move_to_element(elmt).click().perform()
                table.insert({'ranking': idx, 'kw': kw, 'results': numresults,
                              'url': href, 'title': txt,
                              'dt': datetime.datetime.now(),
                              'client': jsobj['cust']})
                time.sleep(5)
                _scroll_through_page(browser)
                time.sleep(10)
                # The click navigated away, so the remaining elements are stale.
                return
        elif not any(ee in href for ee in exclude):
            candidates.append((elmt, txt))
        # NOTE(review): the rank counter advances only for anchors that pass
        # the title-length filter; this placement was reconstructed from
        # flattened source -- confirm it matches the intended ranking.
        idx += 1

    if exclude is not None:
        print('exclude')
        # The first five acceptable results are never picked.
        pool = candidates[5:]
        if not pool:
            print('exclude: no candidate results to click')
            return
        elmt, txt = random.choice(pool)
        print(elmt)
        print(txt)
        webdriver.ActionChains(browser).move_to_element(elmt).perform()
        elmt.click()
        time.sleep(5)
    # When the target domain is not among the results nothing is clicked and
    # nothing is logged.


def run_once(jsobj, db):
    """Run one search-and-click job in a fresh browser.

    Args:
        jsobj: job description with ``kw`` and ``cust`` plus either ``domain``
            (click the first matching result) or ``exclude`` (click a random
            result matching none of the given substrings).
        db: ``dataset`` database handle; hits are inserted into ``nda_log``.

    Errors raised while driving the browser are printed, not propagated.  The
    browser is always quit before returning.
    """
    global driver, driverclosed
    table = db['nda_log']
    print(jsobj)
    driverclosed = 0
    driver = getDriver()
    try:
        _search_and_click(driver, jsobj, table)
    except Exception:
        print('exception')
        traceback.print_exc()
    finally:
        _quit_driver()


def exe():
    """Fetch one random job from the database and execute it.

    Sleeps 61 seconds after a successful pass and 20 seconds after a failure;
    the browser and the database connection are released in both cases.
    """
    db = None
    succeeded = False
    try:
        # OLD TABLE NAME: seo.seo_jobs
        db = dataset.connect(_DB_URL)
        cursor = db.query("select json from public.seo_jobs_temp where cust='啟翔' and plan='形象SEO' and json like '%陳百欽%' and (json like '%chinatimes.com%') order by random() limit 1")
        for c in cursor:
            js = json.loads(c['json'])
            prefix = js['prefix']
            domain = js['domain'][0]
            # Choosing uniformly among the qualifying combinations is
            # equivalent to retrying random picks until one qualifies, but it
            # cannot loop forever when no combination contains the name.
            keywords = [
                prefix + " " + kw2 + " " + kw1
                for kw1 in js['positive']
                for kw2 in js['rnd']
            ]
            keywords = [kw for kw in keywords if _REQUIRED_NAME in kw]
            if not keywords:
                print('no keyword combination contains the required name; job skipped')
                continue
            run_once({'domain': domain, 'kw': random.choice(keywords),
                      'cust': _CUSTOMER}, db)
        succeeded = True
    except Exception:
        traceback.print_exc()
        print("Execution Error")
    finally:
        _quit_driver()
        # db stays None when the connection itself failed.
        if db is not None:
            try:
                db.close()
            except Exception:
                traceback.print_exc()
    if succeeded:
        print("Completed")
    time.sleep(61 if succeeded else 20)


def cleanup():
    """Quit the shared browser if this process still has one open.

    NOTE(review): the browser is created inside the worker process started by
    the main loop, so in the parent process there is normally nothing to quit;
    Chrome processes left behind by a killed worker are not reaped here.
    """
    if driverclosed == 0:
        _quit_driver()


def main():
    """Run exe() forever, each pass in its own time-limited process."""
    runcount = 1
    while True:
        print("Run " + str(runcount))
        start_time = time.time()
        p = multiprocessing.Process(target=exe)
        p.start()
        p.join(_RUN_TIMEOUT_SECONDS)
        if p.is_alive():
            print("Overtime")
            p.kill()
            cleanup()
            p.join()
        duration = time.time() - start_time
        print("Runs: " + str(runcount) + " | Duration: " + str(duration))
        runcount += 1


if __name__ == '__main__':
    main()