jared пре 1 година
родитељ
комит
5d37420840
1 измењених фајлова са 211 додато и 0 уклоњено
  1. 211 0
      deployment/gnews_click.py

+ 211 - 0
deployment/gnews_click.py

@@ -0,0 +1,211 @@
+import time
+import traceback
+#import json
+from selenium import webdriver
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import time
+import os
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+import dataset
+from selenium.webdriver.common.keys import Keys
+import json
+import random
+import time
+import sys
+
+import codecs
+import random
+import os
+import time
+#from userAgentRandomizer import userAgents
+
+driver=None
+db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
+
+#db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
+table=db['gnews_clicks']
+
+def scrolling(driver,pgnum):
+    ub = driver.find_element(By.CSS_SELECTOR,'body')
+    for i in range(pgnum):
+        ub.send_keys(Keys.PAGE_DOWN)
+        if pgnum>1:
+            time.sleep(0.3) 
+
+
+def re_get_webdriver():
+    global driver
+    result=[]
+    if driver is not None:
+        print('closing....')
+        driver.quit()
+        os.system('killall chrome')
+        print('quit....')
+        driver=None
+    try:
+#        ua = userAgents()
+
+#        user_agent = ua.random()        
+
+        options = webdriver.ChromeOptions()
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_argument("--headless")
+        options.add_argument("--force-dark-mode")
+        options.add_argument('--start-maximized')
+#        print(user_agent)
+#        options.add_argument("--user-agent=" +user_agent)
+        options.add_argument("--incognito")
+        driver=None
+        try:
+            driver = webdriver.Chrome(options=options)
+        except:
+#            driver.quit()
+#            os.system('pkill -f ')
+            os.system('kill %d' % os.getpid())
+            sys.exit()
+            return
+        driver.set_window_size(1920, 19200)
+        return
+    except:
+        import traceback
+        traceback.print_exc()
+        driver=None
+        return None
+
+
+
+def run_once(jsobj):
+
+    print(jsobj)
+    global driver
+    global table
+
+#    i=random.randint(0,9)
+    i=10
+    if driver is None:
+        time.sleep(8)
+        re_get_webdriver()
+    if driver is None:
+        return
+    try:
+        kw=jsobj['kw']
+        fname=jsobj['fname']
+
+        if jsobj.get('domain') is None:
+#            exclude=jsobj['exclude']
+            exclude=None
+            domain=None
+        else:
+            domain=jsobj['domain']
+            exclude=None
+#        q='裝潢'
+        q=jsobj['kw']
+        driver.get('https://news.google.com/search?q='+q+'&hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant')
+
+        time.sleep(2)
+        print(driver.current_url)
+        time.sleep(2)
+#        scrolling(driver,20)
+
+#        elmts = driver.find_elements(By.XPATH, "//h3[@class='ipQwMb ekueJc RD0gLb']/a")
+        elmts = driver.find_elements(By.XPATH, "//a[@class='JtKRv' and @jsaction='click:kkIcoc;']")
+
+        numresults=len(elmts)
+        print('搜尋結果數量',numresults)
+        datadict={'搜尋詞':[],'結果標題':[],'結果網址':[],'結果名次':[],'結果說明':[]}
+
+
+        idx=1
+        found=False
+        test_lst=[]
+        txt_dict={}
+        for elmt in elmts:
+            href=elmt.get_attribute('href')
+            txt=elmt.text
+            desc=None
+            try:
+                desc=txt
+#                elmt2=elmt.find_element(By.XPATH, "./../../..//div[@data-content-feature=1]")
+#                desc=elmt2.text
+            except:
+                desc=None
+
+            if len(txt)>10:
+                if domain is not None:
+                    for d in domain:
+                        if d in txt:
+                            print('found....')
+                            print('clicked....')
+                            print(href)
+                            print(txt)
+                            print("ranking", idx)
+                            found=True
+                            webdriver.ActionChains(driver).move_to_element(elmt).perform()
+                            webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+
+
+
+                            time.sleep(6)
+                            table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt})
+
+                            return
+                else:
+                    ex=False
+                    if not ex:
+                        test_lst.append(elmt)
+                        txt_dict[elmt]=txt
+                    
+            idx+=1
+#        time.sleep(9999)
+
+        if exclude is not None:
+            print('exclude')
+            elmt=random.choice(test_lst[5:])
+            print(elmt)
+            print(txt_dict[elmt])
+
+            webdriver.ActionChains(driver).move_to_element(elmt).perform()
+            webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+            time.sleep(6)
+
+        if not found:
+            table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄'})
+
+
+
+
+    except:
+        print('exception')
+        traceback.print_exc()
+
+entries=[]
+entry={'kw':'啟翔輕金屬 樂鋁屋','domain':['永續實踐再創新'],'ranking':'-1','fname':'fname'}
+entries.append(entry)
+entry={'kw':'啟翔輕金屬 防疫門','domain':['政治大學'],'ranking':'-1','fname':'fname'}
+entries.append(entry)
+entry={'kw':'啟翔輕金屬 鋁鋁創新','domain':['把痛點變新商機,'],'ranking':'-1','fname':'fname'}
+entries.append(entry)
+entry={'kw':'啟翔輕金屬 台灣國際室內設計','domain':['台灣國際室內設計'],'ranking':'-1','fname':'fname'}
+entries.append(entry)
+entry={'kw':'啟翔輕金屬 人工智慧','domain':['人工智慧'],'ranking':'-1','fname':'fname'}
+entries.append(entry)
+entry={'kw':'啟翔輕金屬 緬甸','domain':['緬甸'],'ranking':'-1','fname':'fname'}
+entries.append(entry)
+entry={'kw':'啟翔輕金屬 鋁業','domain':['臺灣鋁業'],'ranking':'-1','fname':'fname'}
+entries.append(entry)
+
+#entries.append(entry)
+entry=random.choice(entries)
+run_once(entry)
+#run_once({'kw':'啟翔 防疫門','domain':'政治大學','ranking':'-1','fname':'fname'})
+#run_once({'kw':'啟翔輕金屬 防疫門','domain':['政治大學'],'ranking':'-1','fname':'fname'})
+
+
+
+
+#for c in cursor:
+#    run_once({'kw':c['kw']})
+