|
@@ -0,0 +1,236 @@
|
|
|
+#import redis
|
|
|
+import time
|
|
|
+import traceback
|
|
|
+#import json
|
|
|
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
+import time
|
|
|
+import os
|
|
|
+from selenium.webdriver.support.ui import WebDriverWait
|
|
|
+from selenium.webdriver.support import expected_conditions as EC
|
|
|
+import dataset
|
|
|
+from selenium import webdriver
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+from selenium.webdriver.common.keys import Keys
|
|
|
+from selenium.webdriver.chrome.service import Service
|
|
|
+import json
|
|
|
+import random
|
|
|
+import time
|
|
|
+import datetime
|
|
|
+import sys
|
|
|
+import codecs
|
|
|
+import random
|
|
|
+import os
|
|
|
+import time
|
|
|
+import requests
|
|
|
+import pymysql
|
|
|
+import urllib.parse
|
|
|
# Register PyMySQL under the MySQLdb name so the 'mysql://' URL below can
# resolve a driver.
pymysql.install_as_MySQLdb()

# Shared browser handle; run_once() and re_get_webdriver() rebind it.
driver = None

# SECURITY: the fallback values below are credentials committed to source.
# They are kept only so existing deployments keep working; set SEO_DB_URL and
# LINE_NOTIFY_TOKEN in the environment instead, and rotate these secrets.
db = dataset.connect(
    os.environ.get(
        'SEO_DB_URL',
        'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4',
    )
)

# HTTP headers used by send_msg() for the LINE Notify API.
headers = {
    "Authorization": "Bearer " + os.environ.get(
        "LINE_NOTIFY_TOKEN",
        "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
    ),
    "Content-Type": "application/x-www-form-urlencoded",
}
|
|
|
+
|
|
|
def send_msg(kw):
    """Post ``kw`` as a message to the LINE Notify endpoint.

    The call is best-effort: a network failure or timeout is logged and
    swallowed so that a broken notification channel cannot derail the caller.

    Args:
        kw: Text of the notification.

    Returns:
        The ``requests.Response`` on success, or ``None`` if the request
        could not be completed.
    """
    params = {"message": kw}
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        return requests.post(
            "https://notify-api.line.me/api/notify",
            headers=headers,
            params=params,
            timeout=10,
        )
    except requests.RequestException:
        traceback.print_exc()
        return None
|
|
|
+
|
|
|
# Result URLs that run_once() tries to avoid when it picks a random link.
blacklist = [
    'https://www.chinatimes.com/realtimenews/20220613003142-260402',
]
|
|
|
+
|
|
|
+
|
|
|
def re_get_webdriver():
    """Discard the shared browser (if any) and start a fresh headless Chrome.

    The new instance is stored in the module-level ``driver``. If Chrome
    cannot be started or configured, ``driver`` is left as ``None`` and the
    error is printed.

    Returns:
        Always ``None``; callers read the module-level ``driver``.
    """
    global driver

    if driver is not None:
        print('closing....')
        try:
            driver.quit()
        except Exception:
            # A dead or disconnected session must not block the restart.
            traceback.print_exc()
        # Kills every process named "chrome" on the host, not only ours.
        os.system('killall chrome')
        print('quit....')
        driver = None

    new_driver = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument("--incognito")
        new_driver = webdriver.Chrome(options=options)
        new_driver.delete_all_cookies()
        new_driver.set_window_size(1400, 1000)
    except Exception:
        traceback.print_exc()
        # Chrome may have started before the failure; do not leak it.
        if new_driver is not None:
            try:
                new_driver.quit()
            except Exception:
                traceback.print_exc()
        return None

    driver = new_driver
    return None
|
|
|
+
|
|
|
def getDriver():
    """Create and return a new headless, incognito Chrome WebDriver.

    The browser is started with ``--no-sandbox`` and its window is resized
    to 1400x1000 before it is returned.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--incognito', '--no-sandbox'):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_window_size(1400, 1000)
    return browser
|
|
|
+
|
|
|
+
|
|
|
def _scroll_page_in_steps(browser, steps=4, pause=3):
    """Scroll the current page from top to bottom in ``steps`` increments."""
    page_height = browser.execute_script("return document.body.scrollHeight")
    # Clamp to 1px: a page shorter than `steps` pixels would yield a zero
    # step and the loop below would never terminate.
    scroll_step = max(page_height // steps, 1)
    current_height = 0
    while current_height < page_height:
        current_height += scroll_step
        browser.execute_script(f"window.scrollTo(0, {current_height});")
        time.sleep(pause)


def run_once(jsobj):
    """Run one Google search and click a result according to ``jsobj``.

    A new browser is created through ``getDriver()``, stored in the
    module-level ``driver`` and always quit before this function returns.

    Args:
        jsobj: Job description dict.
            ``'kw'``      -- search keyword (required).
            ``'domain'``  -- if present and not ``None``, the first result
                             whose href contains it is clicked, logged to the
                             ``nda_log`` table and scrolled through.
            ``'exclude'`` -- read only when ``'domain'`` is absent/``None``;
                             iterable of substrings, results whose href
                             contains any of them are not click candidates.
            ``'cust'``    -- stored as ``client`` when a hit is logged.

    Raises:
        SystemExit: when the result page contains no matching anchors (after
            sending a LINE notification). Other exceptions raised while
            driving the browser are printed and swallowed.
    """
    global driver

    table = db['nda_log']
    print(jsobj)

    driver = getDriver()
    try:
        kw = jsobj['kw']
        domain = jsobj.get('domain')
        exclude = jsobj['exclude'] if domain is None else None

        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
            urllib.parse.quote(kw), 100, 'zh-TW')
        driver.get(googleurl)

        time.sleep(6)
        print(driver.current_url)
        if 'sorry' in driver.current_url:
            # Presumably Google's rate-limit/CAPTCHA interstitial; give up on
            # this job. The finally clause below still closes the browser.
            print("URL Error: Caught")
            return

        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")

        numresults = len(elmts)
        print('搜尋結果數量', numresults)
        if numresults == 0:
            send_msg('stop working...')
            sys.exit()

        found = False
        test_lst = []
        txt_dict = {}

        # idx is the 1-based position among all matched anchors.
        for idx, elmt in enumerate(elmts, start=1):
            href = elmt.get_attribute('href')
            txt = elmt.text
            if len(txt) <= 10:
                # Anchors with short or empty text are not considered.
                continue

            if domain is not None:
                if domain in href:
                    print('found....')
                    print('clicked....')
                    print(href)
                    print("ranking", idx)
                    found = True

                    webdriver.ActionChains(driver).move_to_element(elmt).perform()
                    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
                    table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})
                    time.sleep(5)
                    _scroll_page_in_steps(driver)
                    time.sleep(10)
                    break
            elif not any(ee in href for ee in exclude):
                test_lst.append(elmt)
                txt_dict[elmt] = txt

        if exclude is not None:
            print('exclude')
            # The first five candidates are skipped; with five or fewer
            # candidates random.choice raises IndexError, handled below.
            elmt = random.choice(test_lst[5:])
            print(elmt)
            print(txt_dict[elmt])

            webdriver.ActionChains(driver).move_to_element(elmt).perform()
            elmt.click()
            time.sleep(5)

        # NOTE(review): in exclude mode `found` is never set, so this branch
        # also runs after the click above -- confirm that is intended.
        if not found:  # don't waste resources, pick a random link as long as it is ok
            pick = None
            for _ in range(100):
                candidate = random.choice(elmts)
                if candidate.get_attribute('href') not in blacklist:
                    pick = candidate
                    break

            if pick is None:
                # Every attempt hit a blacklisted URL: click nothing.
                print("Action Terminated")
            else:
                webdriver.ActionChains(driver).move_to_element(pick).perform()
                webdriver.ActionChains(driver).move_to_element(pick).click().perform()

    except Exception:
        # SystemExit and KeyboardInterrupt are deliberately not caught here
        # so that sys.exit() above and Ctrl-C can leave this function.
        print('exception')
        traceback.print_exc()
    finally:
        # Runs on every exit path, including the early return above.
        driver.quit()
|
|
|
+
|
|
|
# Picks one random matching job per iteration (same SQL text as before, only
# split across lines).
JOB_QUERY = (
    'select json from seo.seo_jobs'
    ' where cust="啟翔" and plan="形象SEO" and json like "%陳百欽%"'
    ' and (json like "%chinabiz.org.tw%" or json like "%vocus.cc%"'
    ' or json like "%tw.news.yahoo.com%" or json like "%facebook.com%"'
    ' or json like "%gvm.com.tw%" or json like "%fingermedia.tw%"'
    ' or json like "%bg3.co%" or json like "%morningtaiwan.org%"'
    ' or json like "%pchome.com.tw%" or json like "%twfile.com%"'
    ' or json like "%twincn.com%" or json like "%theicons.net%"'
    ' or json like "%nhu.edu.tw%") order by rand() limit 1'
)
# Every generated keyword must contain this name.
TARGET_NAME = '陳百欽'
# Upper bound on random keyword draws, so a job whose word lists can never
# produce TARGET_NAME cannot spin forever.
MAX_KEYWORD_ATTEMPTS = 100

# Endless worker loop: fetch a job, build a keyword, run one search, pause.
while True:
    try:
        ran_job = False
        for row in db.query(JOB_QUERY):
            js = json.loads(row['json'])
            prefix = js['prefix']
            domain = js['domain'][0]
            positive = js['positive']
            rnd = js['rnd']

            kw = ''
            for _ in range(MAX_KEYWORD_ATTEMPTS):
                kw = random.choice(positive) + " " + prefix + " " + random.choice(rnd)
                if TARGET_NAME in kw:
                    break
            else:
                print("No keyword containing the target name; skipping job")
                continue

            run_once({'domain': domain, 'kw': kw, 'cust': '啟翔'})
            ran_job = True
            time.sleep(61)

        driver = None
        if not ran_job:
            # Nothing was processed: back off instead of re-querying the
            # database in a tight loop.
            time.sleep(20)
    except Exception:
        # KeyboardInterrupt and SystemExit are not caught, so Ctrl-C and
        # sys.exit() can stop the script.
        traceback.print_exc()
        print("Execution Error")
        time.sleep(20)
|