123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301 |
- #import redis
- import time
- import traceback
- #import json
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import time
- import os
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- import dataset
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.chrome.service import Service
- import json
- import random
- import time
- import datetime
- import sys
- import codecs
- import random
- import os
- import time
- import requests
- import pymysql
- import urllib.parse
- import multiprocessing
- pymysql.install_as_MySQLdb()
- from userAgentRandomizer import userAgents
# Shared Selenium driver handle. re_get_webdriver() and run_once() assign it;
# exe() and cleanup() read it when tearing the browser down.
driver = None
# Flag read by cleanup(): 0 means the driver has not been marked as closed.
driverclosed = 0

# SECURITY NOTE(review): a LINE Notify bearer token is committed in source.
# It can be overridden with the LINE_NOTIFY_TOKEN environment variable; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and removed from the repository.
_LINE_NOTIFY_TOKEN = os.environ.get(
    "LINE_NOTIFY_TOKEN", "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi"
)

# HTTP headers sent with every LINE Notify request (see send_msg).
headers = {
    "Authorization": "Bearer " + _LINE_NOTIFY_TOKEN,
    "Content-Type": "application/x-www-form-urlencoded",
}
def send_msg(kw, timeout=10):
    """Send ``kw`` as a LINE Notify message.

    Args:
        kw: Text of the notification.
        timeout: Seconds to wait for the LINE Notify endpoint. Without a
            timeout ``requests`` waits indefinitely, which would stall the
            whole run on a network problem.

    Returns:
        The ``requests.Response`` returned by the API call (callers that
        ignored the previous ``None`` return are unaffected).

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    params = {"message": kw}
    return requests.post(
        "https://notify-api.line.me/api/notify",
        headers=headers,
        params=params,
        timeout=timeout,
    )
# Result URLs that must never be clicked, even when they match the target
# domain (run_once compares each result's href against this list verbatim).
blacklist = [
    'https://www.chinatimes.com/realtimenews/20220613003142-260402',
    'https://ipo168.pixnet.net/blog/post/207626239-%E5%95%9F%E7%BF%94%E8%BC%95%E9%87%91%E5%B1%AC%E7%A7%91%E6%8A%80%E8%82%A1%E7%A5%A8%E6%98%AF%E9%80%99%E6%A8%A3%E7%9A%84%E5%85%AC%E5%8F%B8%21%21%E6%8A%95%E8%B3%87%E5%89%8D%E8%A6%81',
    'https://latest.mediatagtw.com/article/%e5%95%9f%e7%bf%94%e8%bc%95%e9%87%91%e5%b1%ac%e7%a7%91%e6%8a%80%e8%82%a1%e4%bb%bd%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8#gsc.tab=0',
]
def re_get_webdriver():
    """Replace the module-level Chrome driver with a freshly started one.

    Any existing driver is quit and every ``chrome`` process on the host is
    killed via ``killall`` before a new headless, incognito Chrome session is
    started, its cookies cleared and its window sized to 1400x1000.

    Returns:
        None. The new session is published through the global ``driver``,
        which is left as ``None`` when Chrome could not be started or set up.
    """
    global driver

    if driver is not None:
        print('closing....')
        try:
            driver.quit()
        except Exception:
            # The old session may already be dead; still fall through to the
            # process-level kill below instead of aborting the restart.
            traceback.print_exc()
        os.system('killall chrome')
        print('quit....')
        driver = None

    fresh = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument("--incognito")
        fresh = webdriver.Chrome(options=options)
        fresh.delete_all_cookies()
        fresh.set_window_size(1400, 1000)
    except Exception:
        traceback.print_exc()
        # Do not leak a browser that started but failed during setup.
        if fresh is not None:
            try:
                fresh.quit()
            except Exception:
                traceback.print_exc()
        return None

    driver = fresh
    return None
def getDriver():
    """Start and return a new headless Chrome WebDriver.

    The browser runs with ``--headless``, ``--incognito`` and
    ``--no-sandbox`` and a 1400x1000 window. No custom user agent is applied.

    Returns:
        The started ``webdriver.Chrome`` instance; the caller owns it and is
        responsible for calling ``quit()``.

    Raises:
        Exception: Whatever Selenium raises when Chrome cannot be started or
            resized. A browser that started but could not be resized is quit
            before the error is re-raised, so no process is leaked.
    """
    options = webdriver.ChromeOptions()
    for flag in ('--headless', '--incognito', '--no-sandbox'):
        options.add_argument(flag)

    browser = webdriver.Chrome(options=options)
    try:
        browser.set_window_size(1400, 1000)
    except Exception:
        browser.quit()
        raise
    return browser
def _scroll_page_in_steps(browser, steps=4, pause=3):
    """Scroll the current page from top to bottom in about ``steps`` jumps."""
    page_height = browser.execute_script("return document.body.scrollHeight") or 0
    # A page shorter than ``steps`` pixels would give a step of 0 and the loop
    # below would never terminate, so always advance by at least one pixel.
    scroll_step = max(page_height // steps, 1)
    current_height = 0
    while current_height < page_height:
        browser.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
        time.sleep(pause)
        current_height += scroll_step


def run_once(jsobj, db):
    """Run one Google search for a keyword and click one result.

    Two modes are selected by ``jsobj``:

    * ``jsobj['domain']`` set: the first result whose title is longer than
      10 characters, whose href contains the domain and is not in
      ``blacklist`` is clicked, its rank is written to the ``nda_log`` table,
      and the landing page is scrolled through.
    * ``jsobj['domain']`` missing/None: results whose href contains any
      string in ``jsobj['exclude']`` are dropped, and a random result from
      the remaining ones after the first five is clicked.

    Args:
        jsobj: Job description dict. Reads ``'kw'``, optionally ``'domain'``,
            ``'exclude'`` (required when no domain is given) and ``'cust'``
            (required when a domain is given; stored as ``client``).
        db: Database handle; ``db['nda_log']`` must provide ``insert(dict)``.

    Returns:
        None. Errors during the search are printed and swallowed; the browser
        is always quit and ``driverclosed`` set to 1 before returning.

    NOTE(review): the source this was reconstructed from had lost its
    indentation. The rank counter is assumed to advance for every result
    anchor, and the 10 second dwell is assumed to follow the scroll loop —
    confirm against the original file.
    """
    global driver
    global driverclosed  # previously assigned as a local, so the flag never changed

    table = db['nda_log']
    print(jsobj)
    driverclosed = 0
    driver = getDriver()
    try:
        kw = jsobj['kw']
        domain = jsobj.get('domain')
        exclude = jsobj['exclude'] if domain is None else None

        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
            urllib.parse.quote(kw), 100, 'zh-TW')
        driver.get(googleurl)
        time.sleep(6)
        print(driver.current_url)
        # Google redirects to a "/sorry/" page when it rate-limits the client.
        if 'sorry' in driver.current_url:
            print("URL Error: Caught")
            return

        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
        numresults = len(elmts)
        print('搜尋結果數量', numresults)
        if numresults == 0:
            return

        candidates = []  # (element, title) pairs that survived the exclude filter
        for idx, elmt in enumerate(elmts, start=1):
            # get_attribute may return None; normalise so substring tests work.
            href = elmt.get_attribute('href') or ''
            txt = elmt.text
            if len(txt) <= 10:
                continue

            if domain is None:
                if not any(ee in href for ee in exclude):
                    candidates.append((elmt, txt))
                continue

            if domain in href and href not in blacklist:
                print('found....')
                print('clicked....')
                print(href)
                print("ranking", idx)
                webdriver.ActionChains(driver).move_to_element(elmt).perform()
                webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
                table.insert({'ranking': idx, 'kw': kw, 'results': numresults,
                              'url': href, 'title': txt,
                              'dt': datetime.datetime.now(),
                              'client': jsobj['cust']})
                time.sleep(5)
                _scroll_page_in_steps(driver)
                time.sleep(10)
                break

        if exclude is not None:
            print('exclude')
            # Skip the first five acceptable results before picking at random.
            pool = candidates[5:]
            if pool:
                elmt, txt = random.choice(pool)
                print(elmt)
                print(txt)
                webdriver.ActionChains(driver).move_to_element(elmt).perform()
                elmt.click()
                time.sleep(5)
            else:
                print('no clickable result after exclusions')
    except Exception:
        print('exception')
        traceback.print_exc()
    finally:
        # Single teardown point for every exit path, including early returns.
        try:
            driver.quit()
        except Exception:
            traceback.print_exc()
        driverclosed = 1
def exe():
    """Run one search-and-click job and pause before returning.

    Connects to the job database, builds a keyword from a random topic plus
    the fixed name suffix, and delegates to ``run_once`` for the
    ``chinatimes.com`` domain. Afterwards the global driver is quit (best
    effort) and cleared, and the database connection is closed.

    The function sleeps 61 seconds after a successful run and 20 seconds
    after a failed one. All errors are printed, never raised.

    Returns:
        None.
    """
    # Without this declaration the ``driver = None`` below made ``driver`` a
    # local name, so the "is not None" check always failed with a swallowed
    # UnboundLocalError and the browser was never cleaned up here.
    global driver

    # SECURITY NOTE(review): database credentials are committed in source.
    # SEO_DB_URL overrides them; the literal is only a backward-compatible
    # fallback and should be rotated and removed.
    db_url = os.environ.get(
        'SEO_DB_URL',
        'postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')

    kwlist = [
        '創新園區', '產業聚落', '桃園新屋', '研發中心', '航太工業',
        '節能減碳', '中小企業', '高端市場', '經營哲學', '防疫門',
        'Bellavita', 'IKEA', '馬達', '家具', '歐美',
        '家具家飾', '歐美日', '台北101', '鋁材帷幕牆', '金屬合金',
        '鋁產品製程', '台灣鋁業市占率', '外銷市場', '緬甸設廠', '產能需求',
        'EMBA', '汽車應用', '高端價值', 'W Hotel', '北歐家具',
        '日本家具品牌', '醫療產業', '循環經濟', '歐美日訂單', '藍海策略',
        '重圍突破', '航太產業', 'LED', '綠色', '台灣人才',
        '國際市場競爭力', '東協', '產業升級', '光電屋頂', '優勢國際綠能公司',
        '台北小巨蛋', '大安森林公園', '太陽能發電站', '綠能環保科技園區', '發電設備',
    ]

    db = None  # stays None when connect() itself fails, so teardown can tell
    succeeded = False
    try:
        db = dataset.connect(db_url)
        domain = 'chinatimes.com'
        kw = random.choice(kwlist) + ' 陳百欽'
        run_once({'domain': domain, 'kw': kw, 'cust': '啟翔'}, db)
        succeeded = True
    except Exception:
        traceback.print_exc()
        print("Execution Error")
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # run_once normally quits the browser already; a second quit
                # on a dead session is expected to fail and is ignored.
                pass
        driver = None
        if db is not None:
            try:
                db.close()
            except Exception:
                traceback.print_exc()

    if succeeded:
        print("Completed")
        time.sleep(61)
    else:
        time.sleep(20)
def cleanup():
    """Quit the module-level driver if it has not been marked as closed.

    Does nothing when ``driverclosed`` is non-zero or when no driver exists.
    Failures while quitting are ignored because this is a last-resort
    teardown.

    NOTE(review): the main loop calls this in the parent process, while the
    browser is started inside the child process running ``exe`` — confirm
    that this process ever holds a live driver.
    """
    if driverclosed != 0 or driver is None:
        return
    try:
        driver.quit()
    except Exception:
        # Best effort: the session may already be gone.
        pass
if __name__ == '__main__':
    # Watchdog loop: run each job in its own process so that a hung browser
    # can be killed after 120 seconds without taking the scheduler down.
    runcount = 1
    while True:
        print(f"Run {runcount}")
        started_at = time.time()

        worker = multiprocessing.Process(target=exe)
        worker.start()
        worker.join(120)

        timed_out = worker.is_alive()
        if timed_out:
            print("Overtime")
            worker.kill()
            cleanup()
            # Reap the killed child before starting the next run.
            worker.join()

        duration = time.time() - started_at
        print(f"Runs: {runcount} | Duration: {duration}")
        runcount += 1
|