"""Google-search click bot.

Each run starts a child process that opens a headless Chrome session,
searches Google for a random keyword, looks for the first result whose URL
contains the target domain (and is not blacklisted), clicks it, records its
ranking in the ``nda_log`` table and scrolls through the landing page.
The parent process repeats this forever and kills any run that exceeds
120 seconds.
"""
import codecs
import datetime
import json
import multiprocessing
import os
import random
import sys
import time
import traceback
import urllib.parse

import dataset
import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

pymysql.install_as_MySQLdb()
from userAgentRandomizer import userAgents  # noqa: E402  (kept after install_as_MySQLdb, as before)

# Browser shared by the functions below; ``None`` when no session is open.
driver = None
# 0 while a browser session may still be open, 1 once it has been quit.
driverclosed = 0

# NOTE(review): security - this bearer token is hard-coded in source control.
# Consider loading it from the environment and rotating it.
headers = {
    "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
    "Content-Type": "application/x-www-form-urlencoded"
}


def send_msg(kw):
    """POST ``kw`` as the ``message`` parameter to the LINE Notify endpoint.

    The response is not inspected. A timeout is set so that a stalled
    connection cannot block the caller forever; ``requests`` raises
    ``requests.exceptions.Timeout`` in that case.
    """
    params = {"message": kw}
    requests.post(
        "https://notify-api.line.me/api/notify",
        headers=headers,
        params=params,
        timeout=10,
    )


# Result URLs that must never be clicked, even when they match the domain.
blacklist = [
    'https://www.chinatimes.com/realtimenews/20220613003142-260402',
    'https://ipo168.pixnet.net/blog/post/207626239-%E5%95%9F%E7%BF%94%E8%BC%95%E9%87%91%E5%B1%AC%E7%A7%91%E6%8A%80%E8%82%A1%E7%A5%A8%E6%98%AF%E9%80%99%E6%A8%A3%E7%9A%84%E5%85%AC%E5%8F%B8%21%21%E6%8A%95%E8%B3%87%E5%89%8D%E8%A6%81',
    'https://latest.mediatagtw.com/article/%e5%95%9f%e7%bf%94%e8%bc%95%e9%87%91%e5%b1%ac%e7%a7%91%e6%8a%80%e8%82%a1%e4%bb%bd%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8#gsc.tab=0',
]


def _quit_driver():
    """Quit the shared browser if one is open and mark it closed.

    Best-effort: a failure while quitting is printed, never raised, so this
    is safe to call from ``except``/``finally`` paths.
    """
    global driver, driverclosed
    if driver is not None:
        try:
            driver.quit()
        except Exception:
            traceback.print_exc()
    driver = None
    driverclosed = 1


def _scroll_through_page(browser):
    """Scroll ``browser`` down the current page in four steps, pausing 3 s each."""
    page_height = browser.execute_script("return document.body.scrollHeight")
    # A page shorter than 4px would give a step of 0 and loop forever.
    scroll_step = max(page_height // 4, 1)
    current_height = 0
    while current_height < page_height:
        browser.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
        time.sleep(3)
        current_height += scroll_step


def re_get_webdriver():
    """Replace the shared ``driver`` with a fresh headless incognito Chrome.

    Any existing session is quit first and all ``chrome`` processes are
    killed via ``killall``. On failure the traceback is printed and
    ``driver`` is left as ``None``. Always returns ``None``; the new session
    is published through the module-level ``driver``.
    """
    global driver
    if driver is not None:
        print('closing....')
        driver.quit()
        os.system('killall chrome')
        print('quit....')
    driver = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument("--incognito")
        driver = webdriver.Chrome(options=options)
        driver.delete_all_cookies()
        driver.set_window_size(1400, 1000)
    except Exception:
        traceback.print_exc()
        driver = None
    return None


def getDriver():
    """Create and return a new headless, incognito, no-sandbox Chrome driver."""
    # The random user agent is generated but currently not applied
    # (the ``user-agent`` option is disabled).
    ua = userAgents().random()
    options = webdriver.ChromeOptions()
    #options.add_argument("user-agent="+ua)
    for flag in ('--headless', '--incognito', '--no-sandbox'):
        options.add_argument(flag)
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    return browser


def run_once(jsobj, db):
    """Run one Google search and click a matching result.

    Args:
        jsobj: job description. Reads ``kw`` (search query), ``cust``
            (stored as ``client``) and either ``domain`` (click the first
            result whose URL contains it) or, when ``domain`` is absent or
            ``None``, ``exclude`` (iterable of substrings; a random result
            not containing any of them is clicked).
        db: ``dataset`` database; hits are inserted into ``db['nda_log']``.

    The browser is stored in the module-level ``driver``. It is quit here
    when Google redirects to a "sorry" page, when no results are found, or
    on any exception; otherwise it is left open for the caller to quit.
    """
    global driver, driverclosed
    table = db['nda_log']
    print(jsobj)
    driver = getDriver()
    # Publish the flag globally; it used to be a local that cleanup() never saw.
    driverclosed = 0
    try:
        kw = jsobj['kw']
        domain = jsobj.get('domain')
        exclude = jsobj['exclude'] if domain is None else None

        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
            urllib.parse.quote(kw), 100, 'zh-TW')
        driver.get(googleurl)
        time.sleep(6)
        print(driver.current_url)
        if 'sorry' in driver.current_url:
            # Google redirected to its rate-limit / captcha page.
            print("URL Error: Caught")
            _quit_driver()
            return

        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
        numresults = len(elmts)
        print('搜尋結果數量', numresults)
        if numresults == 0:
            _quit_driver()
            return

        idx = 1
        found = False
        candidates = []  # (element, title) pairs that passed the exclude filter
        for elmt in elmts:
            # get_attribute may return None; treat that as "no URL".
            href = elmt.get_attribute('href') or ''
            txt = elmt.text
            if len(txt) <= 10:
                # Skip anchors without a real title.
                continue
            if domain is not None:
                if domain in href and href not in blacklist:
                    print('found....')
                    print('clicked....')
                    print(href)
                    print("ranking", idx)
                    found = True
                    webdriver.ActionChains(driver).move_to_element(elmt).perform()
                    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
                    table.insert({'ranking': idx, 'kw': kw, 'results': numresults,
                                  'url': href, 'title': txt,
                                  'dt': datetime.datetime.now(),
                                  'client': jsobj['cust']})
                    time.sleep(5)
                    _scroll_through_page(driver)
                    time.sleep(10)
                    break
            elif not any(ee in href for ee in exclude):
                candidates.append((elmt, txt))
            # NOTE(review): the original nesting of this increment was lost in
            # the flattened source; it is assumed to count titled results only.
            idx += 1

        if exclude is not None:
            print('exclude')
            # Only results after the first five are eligible.
            pool = candidates[5:]
            if pool:
                elmt, title = random.choice(pool)
                print(elmt)
                print(title)
                webdriver.ActionChains(driver).move_to_element(elmt).perform()
                elmt.click()
                time.sleep(5)
            else:
                # Previously random.choice() raised IndexError on an empty pool.
                print('no eligible result to click')

        # When nothing matched (``found`` is False) no fallback action is
        # taken: the former "click a random non-blacklisted result" logic was
        # already disabled (wrapped in string literals) in the original.
    except Exception:
        print('exception')
        traceback.print_exc()
        _quit_driver()


def exe():
    """Run a single search job against the production database.

    Picks a random keyword, appends ' 陳百欽', and calls ``run_once`` for the
    ``chinatimes.com`` domain. The browser and the database connection are
    always released afterwards. Sleeps 61 s after a successful run and 20 s
    after a failed one before returning.
    """
    # Without ``global`` the old ``driver=None`` assignment made ``driver`` a
    # local, so the cleanup raised UnboundLocalError (silently swallowed) and
    # the browser opened by run_once() was never quit.
    global driver
    db = None
    succeeded = False
    try:
        # OLD TABLE NAME: seo.seo_jobs
        # NOTE(review): security - database credentials are hard-coded here.
        db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
        kwlist = ['創新園區','產業聚落','桃園新屋','研發中心','航太工業','節能減碳','中小企業','高端市場','經營哲學','防疫門','Bellavita','IKEA','馬達','家具','歐美','家具家飾','歐美日','台北101','鋁材帷幕牆','金屬合金','鋁產品製程','台灣鋁業市占率','外銷市場','緬甸設廠','產能需求','EMBA','汽車應用','高端價值','W Hotel','北歐家具','日本家具品牌','醫療產業','循環經濟','歐美日訂單','藍海策略','重圍突破','航太產業','LED','綠色','台灣人才','國際市場競爭力','東協','產業升級','光電屋頂','優勢國際綠能公司','台北小巨蛋','大安森林公園','太陽能發電站','綠能環保科技園區','發電設備']
        domain = 'chinatimes.com'
        kw = random.choice(kwlist) + ' 陳百欽'
        run_once({'domain': domain, 'kw': kw, 'cust': '啟翔'}, db)
        succeeded = True
    except Exception:
        traceback.print_exc()
        print("Execution Error")
    finally:
        _quit_driver()
        # ``db`` stays None when connect() itself failed.
        if db is not None:
            try:
                db.close()
            except Exception:
                traceback.print_exc()

    if succeeded:
        print("Completed")
        time.sleep(61)
    else:
        time.sleep(20)


def cleanup():
    """Quit the shared browser of *this* process if it is still open.

    NOTE(review): when called from the parent after killing a child process
    it cannot reach the browser that the child started, because ``driver``
    is per-process state.
    """
    if driverclosed == 0:
        _quit_driver()


def main():
    """Run ``exe`` in a fresh child process forever, killing runs over 120 s."""
    runcount = 1
    while True:
        print("Run " + str(runcount))
        start_time = time.time()
        p = multiprocessing.Process(target=exe)
        p.start()
        p.join(120)
        if p.is_alive():
            print("Overtime")
            p.kill()
            cleanup()
            p.join()
        duration = time.time() - start_time
        print("Runs: " + str(runcount) + " | Duration: " + str(duration))
        runcount += 1


if __name__ == '__main__':
    main()