|
@@ -0,0 +1,301 @@
|
|
|
+#import redis
|
|
|
+import time
|
|
|
+import traceback
|
|
|
+#import json
|
|
|
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
+import time
|
|
|
+import os
|
|
|
+from selenium.webdriver.support.ui import WebDriverWait
|
|
|
+from selenium.webdriver.support import expected_conditions as EC
|
|
|
+import dataset
|
|
|
+from selenium import webdriver
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+from selenium.webdriver.common.keys import Keys
|
|
|
+from selenium.webdriver.chrome.service import Service
|
|
|
+import json
|
|
|
+import random
|
|
|
+import time
|
|
|
+import datetime
|
|
|
+import sys
|
|
|
+import codecs
|
|
|
+import random
|
|
|
+import os
|
|
|
+import time
|
|
|
+import requests
|
|
|
+import pymysql
|
|
|
+import urllib.parse
|
|
|
+import multiprocessing
|
|
|
+pymysql.install_as_MySQLdb()
|
|
|
+from userAgentRandomizer import userAgents
|
|
|
# Module-level Selenium driver handle; assigned by run_once() and
# re_get_webdriver(), and read by cleanup().
driver = None

# Flag read by cleanup(): 0 means the driver has not been marked as closed.
driverclosed = 0

# NOTE(review): this LINE Notify access token is committed in plain text;
# consider rotating it and loading it from configuration instead.
_LINE_NOTIFY_TOKEN = "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi"

# HTTP headers attached to the LINE Notify request made by send_msg().
headers = {
    "Authorization": "Bearer " + _LINE_NOTIFY_TOKEN,
    "Content-Type": "application/x-www-form-urlencoded",
}
|
|
|
+
|
|
|
def send_msg(kw):
    """Post ``kw`` as a message to the LINE Notify API.

    Args:
        kw: Message text; sent as the ``message`` request parameter.

    Returns:
        None. A non-success HTTP status is reported on stdout.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the timeout below expires.
    """
    params = {"message": kw}
    # Without a timeout a stalled connection would block the caller forever.
    r = requests.post(
        "https://notify-api.line.me/api/notify",
        headers=headers,
        params=params,
        timeout=10,
    )
    if not r.ok:
        print("LINE Notify request failed with status", r.status_code)
|
|
|
+
|
|
|
# Result URLs that run_once() must never click, even when they match the
# target domain.
blacklist = [
    'https://www.chinatimes.com/realtimenews/20220613003142-260402',
    'https://ipo168.pixnet.net/blog/post/207626239-%E5%95%9F%E7%BF%94%E8%BC%95%E9%87%91%E5%B1%AC%E7%A7%91%E6%8A%80%E8%82%A1%E7%A5%A8%E6%98%AF%E9%80%99%E6%A8%A3%E7%9A%84%E5%85%AC%E5%8F%B8%21%21%E6%8A%95%E8%B3%87%E5%89%8D%E8%A6%81',
    'https://latest.mediatagtw.com/article/%e5%95%9f%e7%bf%94%e8%bc%95%e9%87%91%e5%b1%ac%e7%a7%91%e6%8a%80%e8%82%a1%e4%bb%bd%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8#gsc.tab=0',
]
|
|
|
+
|
|
|
+
|
|
|
def re_get_webdriver():
    """Discard the current global driver and start a fresh headless Chrome.

    Any existing driver is quit and ``killall chrome`` is run before the new
    browser is launched. The new instance is stored in the module-level
    ``driver``; if start-up fails the traceback is printed and ``driver`` is
    left as ``None``.

    Returns:
        None, always. The result is exposed through the global ``driver``.
    """
    global driver

    if driver is not None:
        print('closing....')
        try:
            driver.quit()
        except Exception:
            # A dead or already-closed session must not block the restart.
            traceback.print_exc()
        os.system('killall chrome')
        print('quit....')
    driver = None

    new_driver = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument("--incognito")
        new_driver = webdriver.Chrome(options=options)
        new_driver.delete_all_cookies()
        new_driver.set_window_size(1400, 1000)
        # Publish the driver only once it is fully configured.
        driver = new_driver
    except Exception:
        traceback.print_exc()
        driver = None
        if new_driver is not None:
            # Do not leak a browser that started but could not be set up.
            try:
                new_driver.quit()
            except Exception:
                traceback.print_exc()
    return None
|
|
|
+
|
|
|
def getDriver():
    """Create and return a new headless Chrome driver.

    The browser is started with the ``--headless``, ``--incognito`` and
    ``--no-sandbox`` flags and its window is resized to 1400x1000.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    # A random user agent is still generated, but it is not applied to the
    # browser: the corresponding add_argument call is disabled.
    unused_ua = userAgents().random()

    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--incognito', '--no-sandbox'):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_window_size(1400, 1000)
    return browser
|
|
|
+
|
|
|
+
|
|
|
def _scroll_page(drv, steps=4, pause=3):
    """Scroll the current page from top to bottom in roughly ``steps`` jumps."""
    page_height = drv.execute_script("return document.body.scrollHeight")
    # A step of 0 (page shorter than ``steps`` pixels) would never terminate.
    scroll_step = max(1, page_height // steps)
    current_height = 0
    while current_height < page_height:
        drv.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
        time.sleep(pause)
        current_height += scroll_step


def _pick_random_result(candidates, skip=5):
    """Return a random candidate, preferring those after the first ``skip``."""
    pool = candidates[skip:] or candidates
    return random.choice(pool) if pool else None


def run_once(jsobj, db):
    """Run one Google search and click a result chosen by the job settings.

    When ``jsobj`` carries a ``domain``, the first result whose URL contains
    that domain (and is not in ``blacklist``) is clicked, its ranking is
    written to the ``nda_log`` table, and the landing page is scrolled.
    Otherwise a random result whose URL contains none of the ``exclude``
    substrings is clicked.

    Args:
        jsobj: Job dict. ``kw`` is always read; ``cust`` is read when a
            ranking row is logged; ``exclude`` is read when ``domain`` is
            missing or ``None``.
        db: Object indexable as ``db['nda_log']`` whose result offers
            ``insert(dict)``.

    Returns:
        None. Errors raised during the search are printed, not propagated.
    """
    global driver, driverclosed  # driverclosed is the flag cleanup() reads

    table = db['nda_log']
    print(jsobj)

    driverclosed = 0
    driver = getDriver()
    try:
        kw = jsobj['kw']
        if jsobj.get('domain') is None:
            exclude = jsobj['exclude']
            domain = None
        else:
            domain = jsobj['domain']
            exclude = None

        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
        driver.get(googleurl)

        time.sleep(6)
        print(driver.current_url)
        if 'sorry' in driver.current_url:
            # Bail out when the browser ended up on a URL containing 'sorry'.
            print("URL Error: Caught")
            return

        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")

        numresults = len(elmts)
        print('搜尋結果數量', numresults)
        if numresults == 0:
            return

        idx = 1
        test_lst = []
        txt_dict = {}

        for elmt in elmts:
            # An anchor without an href is treated as matching nothing.
            href = elmt.get_attribute('href') or ''
            txt = elmt.text
            if len(txt) > 10:
                if domain is not None:
                    if domain in href and href not in blacklist:
                        print('found....')
                        print('clicked....')
                        print(href)
                        print("ranking", idx)

                        webdriver.ActionChains(driver).move_to_element(elmt).perform()
                        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
                        table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})

                        time.sleep(5)
                        _scroll_page(driver)

                        time.sleep(10)
                        break
                elif not any(ee in href for ee in exclude):
                    test_lst.append(elmt)
                    txt_dict[elmt] = txt

            idx += 1

        if exclude is not None:
            print('exclude')
            elmt = _pick_random_result(test_lst)
            if elmt is None:
                print('no clickable result left after exclusions')
            else:
                print(elmt)
                print(txt_dict[elmt])

                webdriver.ActionChains(driver).move_to_element(elmt).perform()
                elmt.click()
                time.sleep(5)

        # A disabled fallback that clicked a random result when the target
        # was not found used to live here; it was dead code and was removed.
    except Exception:
        print('exception')
        traceback.print_exc()
    finally:
        # Always release the browser, including on the early returns above.
        try:
            driver.quit()
        except Exception:
            traceback.print_exc()
        driverclosed = 1
|
|
|
+ # sys.exit()
|
|
|
+
|
|
|
def exe():
    """Run a single search job end to end and then pause.

    Connects to the database, builds a keyword from a random ``kwlist``
    entry plus a fixed suffix, and hands the job to ``run_once`` with the
    target domain ``chinatimes.com``. Afterwards the global driver is quit
    (best effort) and reset, and the database connection is closed.

    Sleeps 61 seconds after a successful run and 20 seconds after a failed
    one. Exceptions are printed rather than propagated.

    Returns:
        None.
    """
    # Without this declaration the assignment below makes ``driver`` local,
    # and the quit() cleanup would always fail with UnboundLocalError.
    global driver

    db = None
    succeeded = False
    try:  # OLD TABLE NAME: seo.seo_jobs
        # NOTE(review): database credentials are committed in plain text;
        # consider loading the connection URL from configuration.
        db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')

        # The keyword and domain are hard-coded here; an earlier, disabled
        # variant read them from the public.seo_jobs_temp table instead.
        kwlist = [
            '創新園區', '產業聚落', '桃園新屋', '研發中心', '航太工業',
            '節能減碳', '中小企業', '高端市場', '經營哲學', '防疫門',
            'Bellavita', 'IKEA', '馬達', '家具', '歐美',
            '家具家飾', '歐美日', '台北101', '鋁材帷幕牆', '金屬合金',
            '鋁產品製程', '台灣鋁業市占率', '外銷市場', '緬甸設廠', '產能需求',
            'EMBA', '汽車應用', '高端價值', 'W Hotel', '北歐家具',
            '日本家具品牌', '醫療產業', '循環經濟', '歐美日訂單', '藍海策略',
            '重圍突破', '航太產業', 'LED', '綠色', '台灣人才',
            '國際市場競爭力', '東協', '產業升級', '光電屋頂', '優勢國際綠能公司',
            '台北小巨蛋', '大安森林公園', '太陽能發電站', '綠能環保科技園區', '發電設備',
        ]

        domain = 'chinatimes.com'
        kw = random.choice(kwlist) + ' 陳百欽'

        run_once({'domain':domain,'kw':kw, 'cust':'啟翔'},db)
        succeeded = True
    except Exception:
        traceback.print_exc()
        print("Execution Error")
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best effort: run_once normally quits the session already.
                pass
        driver = None
        # ``db`` stays None when the connection attempt itself failed.
        if db is not None:
            try:
                db.close()
            except Exception:
                traceback.print_exc()

    if succeeded:
        print("Completed")
        time.sleep(61)
    else:
        time.sleep(20)
|
|
|
+
|
|
|
def cleanup():
    """Quit the module-level driver unless it is already marked as closed.

    Does nothing when ``driverclosed`` is non-zero or when no driver has
    been created. A failure while quitting is printed, not raised.

    Returns:
        None.
    """
    # NOTE(review): the __main__ loop calls this in the parent process, while
    # the browser is started inside the child process running exe(); there
    # ``driver`` is presumably still None. Confirm whether orphaned Chrome
    # processes need separate handling after p.kill().
    if driverclosed != 0 or driver is None:
        return
    try:
        driver.quit()
    except Exception:
        traceback.print_exc()
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Run exe() forever, one attempt at a time, each in its own process so a
    # hung attempt can be killed once it exceeds the 120-second budget.
    runcount = 1
    while True:
        print(f"Run {runcount}")
        start_time = time.time()

        worker = multiprocessing.Process(target=exe)
        worker.start()
        worker.join(120)

        if worker.is_alive():
            print("Overtime")
            worker.kill()
            cleanup()
        # Returns immediately when the worker has already finished.
        worker.join()

        elapsed = time.time() - start_time
        print(f"Runs: {runcount} | Duration: {elapsed}")
        runcount += 1
|