123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211 |
- import time
- import traceback
- #import json
- from selenium import webdriver
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import time
- import os
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
- import dataset
- from selenium.webdriver.common.keys import Keys
- import json
- import random
- import time
- import sys
- import codecs
- import random
- import os
- import time
- #from userAgentRandomizer import userAgents
# Shared Selenium driver; re_get_webdriver() assigns it and run_once() reads it.
driver = None
# The connection URL can be supplied through the GNEWS_DB_URL environment
# variable so credentials can be rotated without editing this file. The
# previous hard-coded URL remains the default for backward compatibility.
# NOTE(review): the embedded password should be removed from source control.
db = dataset.connect(
    os.environ.get(
        'GNEWS_DB_URL',
        'postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres',
    )
)
#db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
# Table that run_once() inserts one row into per search.
table = db['gnews_clicks']
def scrolling(driver, pgnum):
    """Send PAGE_DOWN to the page's <body> element ``pgnum`` times.

    Args:
        driver: Selenium WebDriver whose current page is scrolled.
        pgnum: Number of PAGE_DOWN key presses to send.

    When more than one press is requested, a 0.3 second pause follows each
    key press.
    """
    page_body = driver.find_element(By.CSS_SELECTOR, 'body')
    # The pause condition depends only on the argument, so evaluate it once.
    pause_between_presses = pgnum > 1
    for _ in range(pgnum):
        page_body.send_keys(Keys.PAGE_DOWN)
        if pause_between_presses:
            time.sleep(0.3)
def re_get_webdriver():
    """(Re)create the module-level headless Chrome ``driver``.

    An existing driver is quit and ``killall chrome`` is run before a new
    browser is started. On success the global ``driver`` refers to the new
    browser with its window sized to 1920x19200. If Chrome cannot be
    launched, the process signals itself with ``kill`` and exits. If any
    other step fails, the traceback is printed and ``driver`` is left as
    ``None``.

    Returns:
        None. The outcome is communicated through the ``driver`` global.
    """
    import traceback

    global driver

    if driver is not None:
        print('closing....')
        try:
            driver.quit()
        except Exception:
            # A dead session must not prevent starting a fresh browser.
            traceback.print_exc()
        os.system('killall chrome')
        print('quit....')
        driver = None

    try:
        options = webdriver.ChromeOptions()
        for flag in (
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--headless",
            "--force-dark-mode",
            '--start-maximized',
            "--incognito",
        ):
            options.add_argument(flag)

        try:
            driver = webdriver.Chrome(options=options)
        except Exception:
            # Launch failure is treated as fatal for the whole process.
            os.system('kill %d' % os.getpid())
            sys.exit()

        driver.set_window_size(1920, 19200)
    except Exception:
        # ``except Exception`` (not a bare ``except``) lets the SystemExit
        # raised by sys.exit() above and KeyboardInterrupt propagate.
        traceback.print_exc()
        if driver is not None:
            # Do not leak a browser that started but could not be configured.
            try:
                driver.quit()
            except Exception:
                traceback.print_exc()
        driver = None
    return None
def run_once(jsobj):
    """Search Google News for a keyword and click the first matching headline.

    Args:
        jsobj: Mapping with a required ``'kw'`` entry (the search query) and
            an optional ``'domain'`` entry, an iterable of substrings that
            are matched against each result's headline text.

    The first headline longer than 10 characters that contains one of the
    ``'domain'`` substrings is clicked, and a row with its rank, URL and
    title is inserted into the module-level ``table``. If no headline
    matches, a row with ranking ``-1`` is inserted instead. Any exception is
    printed with its traceback and swallowed, so the function returns
    ``None`` in every case.
    """
    from urllib.parse import quote

    print(jsobj)
    if driver is None:
        # Pause before (re)launching the browser.
        time.sleep(8)
        re_get_webdriver()
    if driver is None:
        return

    try:
        kw = jsobj['kw']
        domain = jsobj.get('domain')

        # Percent-encode the query so characters such as '&', '#' or '+'
        # inside the keyword cannot alter the URL's parameters.
        search_url = (
            'https://news.google.com/search?q='
            + quote(kw, safe='')
            + '&hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant'
        )
        driver.get(search_url)
        time.sleep(2)
        print(driver.current_url)
        time.sleep(2)

        elmts = driver.find_elements(
            By.XPATH, "//a[@class='JtKRv' and @jsaction='click:kkIcoc;']"
        )
        numresults = len(elmts)
        print('搜尋結果數量', numresults)

        # NOTE(review): the rank counter advances only for headlines longer
        # than 10 characters; confirm this matches the intended ranking.
        idx = 1
        for elmt in elmts:
            href = elmt.get_attribute('href')
            txt = elmt.text
            if len(txt) <= 10:
                continue
            if domain is not None and any(d in txt for d in domain):
                print('found....')
                print('clicked....')
                print(href)
                print(txt)
                print("ranking", idx)
                # Hover first, then hover-and-click, as two separate chains.
                webdriver.ActionChains(driver).move_to_element(elmt).perform()
                webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
                time.sleep(6)
                table.insert({'ranking': idx, 'kw': kw, 'results': numresults,
                              'url': href, 'title': txt})
                return
            idx += 1

        # No headline matched: record the miss.
        table.insert({'ranking': -1, 'kw': kw, 'results': numresults,
                      'url': '', 'title': '未收錄'})
    except Exception:
        print('exception')
        traceback.print_exc()
# Each pair is (search keyword, headline fragment to look for). Every entry
# shares the same placeholder 'ranking' and 'fname' values.
entries = [
    {'kw': keyword, 'domain': [fragment], 'ranking': '-1', 'fname': 'fname'}
    for keyword, fragment in (
        ('啟翔輕金屬 樂鋁屋', '永續實踐再創新'),
        ('啟翔輕金屬 防疫門', '政治大學'),
        ('啟翔輕金屬 鋁鋁創新', '把痛點變新商機,'),
        ('啟翔輕金屬 台灣國際室內設計', '台灣國際室內設計'),
        ('啟翔輕金屬 人工智慧', '人工智慧'),
        ('啟翔輕金屬 緬甸', '緬甸'),
        ('啟翔輕金屬 鋁業', '臺灣鋁業'),
    )
]
# Run a single search, chosen at random, per invocation of the script.
entry = random.choice(entries)
run_once(entry)
- #run_once({'kw':'啟翔 防疫門','domain':'政治大學','ranking':'-1','fname':'fname'})
- #run_once({'kw':'啟翔輕金屬 防疫門','domain':['政治大學'],'ranking':'-1','fname':'fname'})
- #for c in cursor:
- # run_once({'kw':c['kw']})
|