ソースを参照

Merge branch 'master' of http://git.choozmo.com:3000/jared/news_seo1

Jared 2 年 前
コミット
e6b7fbbc34
3 ファイル変更296 行追加28 行削除
  1. 20 28
      gen_seo.py
  2. 198 0
      gen_seo2.py
  3. 78 0
      test_ip.py

+ 20 - 28
gen_seo.py

@@ -139,22 +139,21 @@ def run_once(jsobj):
             txt=elmt.text
             if len(txt)>10:
                 if domain is not None:
-                    for d in domain:
-                        if d in href:
-                            print('found....')
-                            print('clicked....')
-                            print(href)
-                            print(txt)
-                            print("ranking", idx)
-                            found=True
-
-                            webdriver.ActionChains(driver).move_to_element(elmt).perform()
-                            elmt.click()
-    #                        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
-                            table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt})
-
-                            time.sleep(6)
-                            return
+                    if domain in href:
+                        print('found....')
+                        print('clicked....')
+                        print(href)
+                        print(txt)
+                        print("ranking", idx)
+                        found=True
+
+                        webdriver.ActionChains(driver).move_to_element(elmt).perform()
+                        elmt.click()
+#                        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+                        table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt})
+
+                        time.sleep(6)
+                        break
                 else:
                     ex=False
                     for ee in exclude:
@@ -190,24 +189,17 @@ par1=sys.argv[1]
 port=sys.argv[2]
 #kws=['職籃','PLG','高雄','鋼鐵人','內幕','中資','股東','姊夫','中國','老賴','香港','無極','原始股東','外資','董事長','股權結構','高雄人','黑人','陳建州','職籃聯盟','球團','球團高層','香港無極','張憲銘','吳同喬','監察人']
 kws=['金融', '人才', '國際接軌','國際','投資金童','投資','金童','對沖基金','香港','外資','原始股東','職籃','PLG','職籃聯盟','球團','台灣女婿','抹紅','保守','港元','美國','升息','戰爭','通膨','亞洲','亞洲投資金童']
-positive=['引新聞','亞洲最強對沖基金','亞洲投資金童','年底前投資須保守','對沖基金創始人錢濤','升息','職籃夢','innews','66474','生活消費 網友熱搜','懷孕','亞洲對沖基金','攤證據','證據','台灣女婿','通膨','喊冤','亞洲投資金童','創始人','年底前','自由財經','美國升息','兼執行官','個人因素','經濟通','LTN','奇摩','金融巨鱷','投資績效','掌門人','亮眼成績','在台生根','孕妻待產','長住台灣','兼首席執行官','無極資本提供','新冠疫情反覆','戰事膠著','華爾街日報','國際接軌','本地券商','台灣金融人才','彭博社','路透社','量化投資團隊','量化投資業務','人工智慧','演算法','大數據分析','解決方案','全球化']
-
-#positive=['錢濤','亞洲最強對沖基金','亞洲投資金童','年底前投資須保守','對沖基金創始人錢濤','錢濤 升息','錢濤 職籃夢','引新聞 錢濤']
+positive=['錢濤','錢濤 升息','錢濤 職籃夢']
 os.system('docker container restart '+par1)
 kw=random.choice(kws)
 #time.sleep(9)
 #run_once({'domain':'ettoday.net','kw':'錢濤'})
 #run_once({'exclude':['moreptt.com','ptt.cc','tnews.cc','mirrormedia.mg','newtalk.tw','pourquoi.tw','match.net.tw','freshweekly.tw','z-upload.facebook.com','udn.com'],'kw':kw+' 錢濤'})
-domain=['yahoo.com','ettoday.net','tvbs.com.tw','sina.com.tw','ltn.com.tw','owlting.com','ctee.com.tw']
-#domain=random.choice(domains)
-#p=random.choice(positive)
-#run_once({'domain':domain,'kw':p})
-#pairs=[{'domain':'innews.com.tw','kw':'錢濤 引新聞'},{'domain':'innews.com.tw','kw':'innews 錢濤'},{'domain':'innews.com.tw','kw':'錢濤 66474'},{'domain':'innews.com.tw','kw':'錢濤 生活消費 網友熱搜'},{'domain':'yahoo.com','kw':'錢濤 懷孕'},{'domain':'yahoo.com','kw':'亞洲對沖基金 錢濤'},
-#{'domain':'ltn.com.tw','kw':'科技人才 錢濤'}]
-#p=random.choice(pairs)
-
+domains=['yahoo.com','ettoday.net','tvbs.com.tw','sina.com.tw','ltn.com.tw','owlting.com','ctee.com.tw']
+domain=random.choice(domains)
 p=random.choice(positive)
-run_once({'domain':domain,'kw':p+" 錢濤"})
+#run_once({'domain':domain,'kw':p})
+run_once({'domain':'ettoday.net','kw':p})
 
 #run_once({'domain':domain,'kw':kw+' 錢濤'})
 

+ 198 - 0
gen_seo2.py

@@ -0,0 +1,198 @@
+# import redis
+import time
+import traceback
+# import json
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import time
+import os
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import dataset
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.chrome.service import Service
+import json
+import random
+import time
+import redis
+import sys
+import codecs
+import random
+import os
+import time
+from userAgentRandomizer import userAgents
+import requests
+import pymysql
+
+# Register PyMySQL under the MySQLdb name so the 'mysql://' URL below can use it.
+pymysql.install_as_MySQLdb()
+# Shared Selenium driver; (re)created by re_get_webdriver().
+driver = None
+
+# NOTE(security): these credentials are committed in plain text. They can be
+# overridden through the environment; the literals remain only as
+# backward-compatible fallbacks and should be rotated and removed.
+db = dataset.connect(os.environ.get(
+    'SEO_DB_URL',
+    'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4'))
+
+# HTTP headers for the LINE Notify request issued by send_msg().
+headers = {
+    "Authorization": "Bearer " + os.environ.get(
+        'LINE_NOTIFY_TOKEN', "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi"),
+    "Content-Type": "application/x-www-form-urlencoded"
+}
+
+
+def send_msg(kw):
+    """Send ``kw`` as a LINE Notify message.
+
+    Uses the module-level ``headers`` for authorization. Delivery is
+    best-effort: a network failure is printed and swallowed so that a broken
+    notification channel cannot mask the condition being reported.
+    """
+    params = {"message": kw}
+    try:
+        # A timeout keeps a stalled endpoint from hanging the whole run.
+        requests.post("https://notify-api.line.me/api/notify", headers=headers,
+                      params=params, timeout=10)
+    except requests.RequestException:
+        traceback.print_exc()
+
+
+def re_get_webdriver():
+    """(Re)create the module-level Selenium Chrome ``driver``.
+
+    Any existing driver is quit (and Chrome processes are killed) first.
+    A proxy argument is then picked at random from a list read from Redis
+    plus two fixed SOCKS5 endpoints, and a new Chrome session is started.
+
+    Returns None. On any failure the traceback is printed and the global
+    ``driver`` is left as None, which callers use as the failure signal.
+    """
+    global driver
+    if driver is not None:
+        print('closing....')
+        driver.quit()
+        os.system('killall chrome')
+        print('quit....')
+        driver = None
+    try:
+        s = Service('/root/driver/chromedriver')
+        options = webdriver.ChromeOptions()
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_argument("--headless")
+        # NOTE(review): debuggerAddress attaches to an already-running Chrome;
+        # confirm that the launch arguments set here (including the proxy)
+        # still take effect in that mode.
+        options.add_argument('--remote-debugging-port=9222')
+        options.add_experimental_option("debuggerAddress", '127.0.0.1:9927')
+        options.add_argument("--incognito")
+        r = redis.Redis(host='db.ptt.cx', port=6379, db=2, password='choozmo9')
+        data = r.get('google_proxy')
+        jstext = data.decode('utf-8')
+        jsobj = json.loads(jstext)
+        proxy = random.choice(jsobj)
+        change_ip_list = ['--proxy-server=%s' % proxy, "--proxy-server=socks5://127.0.0.1:9050",
+                          "--proxy-server=socks5://192.53.174.202:8180"]
+        change_ip = random.choice(change_ip_list)
+        options.add_argument(change_ip)
+        print('使用代理ip', change_ip)
+        driver = webdriver.Chrome(service=s, options=options)
+        # Cookies can only be cleared once a session exists. Calling this
+        # before creation (on a None driver) made every invocation fail.
+        driver.delete_all_cookies()
+        driver.set_window_size(1400, 1000)
+    except Exception:
+        traceback.print_exc()
+        if driver is not None:
+            # Chrome started but a later step failed: do not leak the session.
+            try:
+                driver.quit()
+            except Exception:
+                traceback.print_exc()
+        driver = None
+        return None
+
+
+def run_once(jsobj):
+    """Search Google for ``jsobj['kw']``, click one result and log the rank.
+
+    ``jsobj`` must contain ``kw`` and either:
+      * ``domain``  - click the first result whose URL contains this string
+        and insert its position into the ``rank_detection`` table, or
+      * ``exclude`` - click a random result (skipping the first five
+        candidates) whose URL contains none of the listed strings.
+
+    If no domain match was recorded, a row with ranking -1 is inserted.
+    When finished, the driver is quit and the process is terminated with
+    ``sys.exit()``; the function only returns normally when no driver could
+    be created.
+    """
+    global driver
+    table = db['rank_detection']
+    print(jsobj)
+
+    if driver is None:
+        time.sleep(8)
+        re_get_webdriver()
+    if driver is None:
+        return
+    try:
+        kw = jsobj['kw']
+        if jsobj.get('domain') is None:
+            exclude = jsobj['exclude']
+            domain = None
+        else:
+            domain = jsobj['domain']
+            exclude = None
+
+        #        driver.get('https://www.google.com?num=100')
+        driver.get('https://www.google.com?num=20')
+
+        time.sleep(3)
+        print(driver.current_url)
+        elmt = driver.find_element(By.XPATH, "//input[@name='q']")
+        time.sleep(1)
+        elmt.send_keys(kw)
+        elmt.send_keys(Keys.ENTER)
+        time.sleep(6)
+        print(driver.current_url)
+        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
+
+        numresults = len(elmts)
+        print('搜尋結果數量', numresults)
+        if numresults == 0:
+            # No result links at all: notify and stop. SystemExit is not an
+            # Exception subclass, so it passes the handler below and the
+            # driver is still released by the finally clause.
+            send_msg('stop working...')
+            sys.exit()
+
+        found = False
+        test_lst = []
+        txt_dict = {}
+        # idx is the 1-based position among all result links, including the
+        # ones skipped for having a short title.
+        for idx, elmt in enumerate(elmts, start=1):
+            # A link without an href would make the substring tests raise.
+            href = elmt.get_attribute('href') or ''
+            txt = elmt.text
+            if len(txt) <= 10:
+                continue
+            if domain is not None:
+                if domain in href:
+                    print('found....')
+                    print('clicked....')
+                    print(href)
+                    print(txt)
+                    print("ranking", idx)
+                    found = True
+
+                    webdriver.ActionChains(driver).move_to_element(elmt).perform()
+                    elmt.click()
+                    table.insert({'ranking': idx, 'kw': kw, 'results': numresults, 'url': href, 'title': txt})
+
+                    time.sleep(6)
+                    break
+            elif not any(ee in href for ee in exclude):
+                test_lst.append(elmt)
+                txt_dict[elmt] = txt
+
+        if exclude is not None:
+            print('exclude')
+            # Only candidates after the first five are eligible. With five or
+            # fewer candidates there is nothing to click, which previously
+            # surfaced as an IndexError from random.choice on an empty list.
+            candidates = test_lst[5:]
+            if candidates:
+                elmt = random.choice(candidates)
+                print(elmt)
+                print(txt_dict[elmt])
+
+                webdriver.ActionChains(driver).move_to_element(elmt).perform()
+                elmt.click()
+                time.sleep(5)
+
+        if not found:
+            table.insert({'ranking': -1, 'kw': kw, 'results': numresults, 'url': '', 'title': '未收錄'})
+
+    except Exception:
+        print('exception')
+        traceback.print_exc()
+    finally:
+        # Always release the browser, whether the run succeeded, failed or
+        # is exiting early.
+        driver.quit()
+
+    sys.exit()
+# par1=sys.argv[1]
+# port=sys.argv[2]
+
+# kws=['職籃','PLG','高雄','鋼鐵人','內幕','中資','股東','姊夫','中國','老賴','香港','無極','原始股東','外資','董事長','股權結構','高雄人','黑人','陳建州','職籃聯盟','球團','球團高層','香港無極','張憲銘','吳同喬','監察人']
+kws = ['金融', '人才', '國際接軌', '國際', '投資金童', '投資', '金童', '對沖基金', '香港', '外資', '原始股東', '職籃', 'PLG', '職籃聯盟', '球團', '台灣女婿',
+       '抹紅', '保守', '港元', '美國', '升息', '戰爭', '通膨', '亞洲', '亞洲投資金童']
+positive = ['錢濤', '錢濤 升息', '錢濤 職籃夢']
+# Restart the 'tiny6' docker container before each run.
+os.system('docker container restart tiny6')
+kw = random.choice(kws)
+# time.sleep(9)
+# run_once({'domain':'ettoday.net','kw':'錢濤'})
+# run_once({'exclude':['moreptt.com','ptt.cc','tnews.cc','mirrormedia.mg','newtalk.tw','pourquoi.tw','match.net.tw','freshweekly.tw','z-upload.facebook.com','udn.com'],'kw':kw+' 錢濤'})
+domains = ['yahoo.com', 'ettoday.net', 'tvbs.com.tw', 'sina.com.tw', 'ltn.com.tw', 'owlting.com', 'ctee.com.tw']
+domain = random.choice(domains)
+p = random.choice(positive)
+# run_once({'domain':domain,'kw':p})
+
+try:
+    run_once({'domain': 'ettoday.net', 'kw': p})
+finally:
+    # The database used to be closed right after run_once was defined, i.e.
+    # before the call above could use it. run_once ends with sys.exit(), so
+    # the close has to live in a finally clause to be reached at all.
+    db.close()
+
+# run_once({'domain':domain,'kw':kw+' 錢濤'})
+

+ 78 - 0
test_ip.py

@@ -0,0 +1,78 @@
+# import redis
+import time
+import traceback
+# import json
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import time
+import os
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import dataset
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.chrome.service import Service
+import json
+import random
+import time
+import redis
+import sys
+import codecs
+import random
+import os
+import time
+from userAgentRandomizer import userAgents
+import requests
+
+# Shared Selenium driver; (re)created by re_get_webdriver().
+driver = None
+
+
+def re_get_webdriver():
+    """(Re)create the module-level Selenium Chrome ``driver``.
+
+    Any existing driver is quit (and Chrome processes are killed) first.
+    A proxy argument is then picked at random from a list read from Redis
+    plus two fixed SOCKS5 endpoints, and a new Chrome session is started.
+
+    Returns None. On any failure the traceback is printed and the global
+    ``driver`` is left as None, which callers use as the failure signal.
+    """
+    global driver
+    if driver is not None:
+        print('closing....')
+        driver.quit()
+        os.system('killall chrome')
+        print('quit....')
+        driver = None
+    try:
+        s = Service('/root/driver/chromedriver')
+        options = webdriver.ChromeOptions()
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_argument("--headless")
+        # NOTE(review): debuggerAddress attaches to an already-running Chrome;
+        # confirm that the launch arguments set here (including the proxy)
+        # still take effect in that mode.
+        options.add_argument('--remote-debugging-port=9222')
+        options.add_experimental_option("debuggerAddress", '127.0.0.1:9927')
+        options.add_argument("--incognito")
+        r = redis.Redis(host='db.ptt.cx', port=6379, db=2, password='choozmo9')
+        data = r.get('google_proxy')
+        jstext = data.decode('utf-8')
+        jsobj = json.loads(jstext)
+        proxy = random.choice(jsobj)
+        change_ip_list = ['--proxy-server=%s' % proxy, "--proxy-server=socks5://127.0.0.1:9050",
+                          "--proxy-server=socks5://192.53.174.202:8180"]
+        change_ip = random.choice(change_ip_list)
+        options.add_argument(change_ip)
+        print('使用代理ip', change_ip)
+        driver = webdriver.Chrome(service=s, options=options)
+        # Cookies can only be cleared once a session exists. Calling this
+        # before creation (on a None driver) made every invocation fail.
+        driver.delete_all_cookies()
+        driver.set_window_size(1400, 1000)
+    except Exception:
+        traceback.print_exc()
+        if driver is not None:
+            # Chrome started but a later step failed: do not leak the session.
+            try:
+                driver.quit()
+            except Exception:
+                traceback.print_exc()
+        driver = None
+        return None
+
+
+def run_once():
+    """Print the public IP address reported by https://api.ipify.org/.
+
+    Creates a fresh driver through re_get_webdriver(), loads the page and
+    prints the text of its body. If no driver could be created, a message
+    is printed and the function returns without doing anything else.
+    """
+    global driver
+    re_get_webdriver()
+    if driver is None:
+        # re_get_webdriver() already printed the traceback; dereferencing
+        # None here would only add an unrelated AttributeError.
+        print('webdriver unavailable')
+        return
+    try:
+        driver.get('https://api.ipify.org/')
+        time.sleep(3)
+        ip_address = driver.find_element(By.TAG_NAME, "body").text
+        print(ip_address)
+    finally:
+        # Release the browser even when the page load or lookup fails.
+        driver.quit()
+
+
+# Script entry point: perform a single IP check when the file is executed.
+run_once()