Bläddra i källkod

Merge branch 'master' of http://git.choozmo.com:3000/choozmo/kw_tools

Jared 2 år sedan
förälder
incheckning
e1e84e8f71
5 ändrade filer med 322 tillägg och 3 borttagningar
  1. 2 3
      INNNews/run_sns.py
  2. 1 0
      INNNews/sns_clickbot.py
  3. 112 0
      SEO/news_clickbot.py
  4. 108 0
      SEO/single_page_clickbot.py
  5. 99 0
      choozmo/ads_selenium.py

+ 2 - 3
INNNews/run_sns.py

@@ -49,13 +49,12 @@ class JParams(object):
             try:
                 os.system('docker container restart tiny6')
                 time.sleep(1)
-                
+                run_once(9927, 'tiny6')
+                time.sleep(20)
                 break
             except:
                 os.system('docker container restart tiny6')
                 time.sleep(15)
-            run_once(9927, 'tiny6')
-            # time.sleep(20)
 
 if __name__ == '__main__':
     fire.Fire(JParams)

+ 1 - 0
INNNews/sns_clickbot.py

@@ -63,6 +63,7 @@ def process_query(qs):
     # df=pd.DataFrame()
 
     elmts = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
+    print('網頁數量',len(elmts))
     idx = 1
     for elmt in elmts:
         href=elmt.get_attribute('href')

+ 112 - 0
SEO/news_clickbot.py

@@ -0,0 +1,112 @@
+from random import randint
+import sys
+import os
+import dataset
+from selenium import webdriver
+import traceback
+import datetime
+import codecs
+import time
+import urllib
+import argparse
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+import logging
+import sys
+from logging.handlers import SysLogHandler
+import socket
+import pandas as pd
+
+_LOG_SERVER = ('hhh.ptt.cx', 514)
+logger = logging.getLogger('clickbot_100')
+handler1 = SysLogHandler(address=_LOG_SERVER, socktype=socket.SOCK_DGRAM)
+logger.addHandler(handler1)
+logger.debug('[clickbot_100][火柴星人]begin')
+
+
+path = 'C:\portable\chromedriver'
+path_z = '/Users/zooeytsai/Downloads/chromedriver 4'
+
+
+def restart_browser(pport):
+    while True:
+        try:
+            os.system('docker container restart tiny4')
+            time.sleep(1)
+            break
+        except:
+            os.system('docker container restart tiny4')
+            time.sleep(15)
+    s = Service('/root/driver/chromedriver')
+    options = webdriver.ChromeOptions()
+    options.add_argument('--headless')
+    options.add_argument('--remote-debugging-port=9222')
+    options.add_experimental_option("debuggerAddress", f"127.0.0.1:{pport}")
+    options.add_argument("--user-agent=" + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0")
+    options.add_argument("--incognito")
+    driver = webdriver.Chrome(options=options, service=s)
+    driver.delete_all_cookies()
+    driver.set_window_size(950, 20000)
+    return driver
+
+
+def process_one(pport):
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
+    lst = ['好睡王 引新聞','好睡王 Yahoo','好睡王 HiNet','好睡王 PCHOME','好睡王 蕃新聞','好睡王 新浪','好睡王 台北郵報','好睡王 LIFE.tw','好睡王 match生活網','好睡王炎炎夏日 POPDAILY','好睡王 LINE TODAY']
+    table = db['news_log']
+
+    for term in lst[7::]:
+        print(term)
+        logger.debug('[clickbot_100][' + term + ']')
+        driver = restart_browser(pport)
+        googleurl = 'https://www.google.com/?num=30'
+        driver.get(googleurl)
+        time.sleep(6)
+        send_kw_elmt = driver.find_element(By.XPATH,'/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
+        send_kw_elmt.send_keys(term)
+        time.sleep(3)
+        send_kw_elmt.send_keys(Keys.ENTER)
+        time.sleep(6)
+        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
+        idx = 1
+        print(len(elmts))
+        for elmt in elmts:
+            href = elmt.get_attribute('href')
+            txt = elmt.text
+            print(txt)
+            if len(txt) > 10:
+                if '炎炎夏日易輾轉難眠' in txt:
+                    print("ranking", idx)
+                    table.insert({'kw': term, 'ranking': idx, 'title': txt, 'url': href,
+                                  'dt': datetime.datetime.now()})
+                    print('clicked....')
+                    webdriver.ActionChains(driver).move_to_element(elmt).perform()
+                    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+                    time.sleep(5)
+                    break
+            idx += 1
+        
+        
+        driver.quit()
+        print('中場休息')
+        time.sleep(randint(90, 120))
+    db.close()
+
+
+process_one('9925')  # runs at module load; '9925' is used as the Chrome debugger port
+
+# parser = argparse.ArgumentParser()
+# parser.add_argument('--loop', action="store_true")
+# args = parser.parse_args()
+
+# if args.loop:
+#     schedule.every(0.4).minutes.do(process_one)
+#     # print('今天開始')
+#     # schedule.every().day.at('9:30').do(process_one)
+#
+#     while True:
+#         schedule.run_pending()
+#         time.sleep(1)
+
+# >> C:\tmp\seo_line.txt 2>&1

+ 108 - 0
SEO/single_page_clickbot.py

@@ -0,0 +1,108 @@
+from random import randint
+import sys
+import os
+import dataset
+from selenium import webdriver
+import traceback
+import datetime
+import codecs
+import time
+import urllib
+import argparse
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+import logging
+import sys
+from logging.handlers import SysLogHandler
+import socket
+import pandas as pd
+
+
+_LOG_SERVER = ('hhh.ptt.cx', 514)  # (host, port) of the remote syslog receiver
+logger = logging.getLogger('clickbot_100')
+handler1 = SysLogHandler(address=_LOG_SERVER, socktype=socket.SOCK_DGRAM)  # datagram transport
+logger.addHandler(handler1)
+logger.debug('[clickbot_100][火柴星人]begin')  # NOTE(review): no level is set on this logger here -- confirm DEBUG records are emitted
+
+
+def restart_browser(pport):
+    while True:
+        try:
+            os.system('docker container restart tiny4')
+            time.sleep(1)
+            break
+        except:
+            os.system('docker container restart tiny4')
+            time.sleep(15)
+    s = Service('/root/driver/chromedriver')
+    options = webdriver.ChromeOptions()
+    options.add_argument('--headless')
+    options.add_argument('--remote-debugging-port=9222')
+    options.add_experimental_option("debuggerAddress", f"127.0.0.1:{pport}")
+    options.add_argument(
+        "--user-agent=" + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0")
+    options.add_argument("--incognito")
+    driver = webdriver.Chrome(options=options, service=s)
+    driver.delete_all_cookies()
+    driver.set_window_size(950, 20000)
+    return driver
+
+
+def process_one(pport):
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
+    lst = ['裝潢預算','工程費','丈量費','裝修預算']
+    table = db['general_log']
+    main_url = 'https://hhh.com.tw/columns/detail/2094/'
+    for term in lst:
+        print(term)
+        logger.debug('[clickbot_100][' + term + ']')
+        driver = restart_browser(pport)
+        googleurl = 'https://www.google.com/?num=40'
+        driver.get(googleurl)
+        time.sleep(6)
+        send_kw_elmt = driver.find_element(By.XPATH,'/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
+        send_kw_elmt.send_keys(term)
+        time.sleep(3)
+        send_kw_elmt.send_keys(Keys.ENTER)
+        time.sleep(6)
+        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
+        idx = 1
+        print(len(elmts))
+        for elmt in elmts:
+            href = elmt.get_attribute('href')
+            txt = elmt.text
+            # print(txt)
+            if len(txt) > 10:
+                if href == main_url:
+                    print("ranking", idx)
+                    table.insert({'kw': term, 'domain':'hhh.com.tw','ranking': idx, 'title': txt, 'url': href,'dt': datetime.datetime.now()})
+                    print('clicked....')
+                    webdriver.ActionChains(driver).move_to_element(elmt).perform()
+                    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+                    time.sleep(5)
+                    break
+            idx += 1
+        
+        driver.quit()
+        print('中場休息')
+        time.sleep(randint(20,30))
+    db.close()
+
+
+process_one('9927')  # runs at module load; '9927' is used as the Chrome debugger port
+
+# parser = argparse.ArgumentParser()
+# parser.add_argument('--loop', action="store_true")
+# args = parser.parse_args()
+
+# if args.loop:
+#     schedule.every(0.4).minutes.do(process_one)
+#     # print('今天開始')
+#     # schedule.every().day.at('9:30').do(process_one)
+#
+#     while True:
+#         schedule.run_pending()
+#         time.sleep(1)
+
+# >> C:\tmp\seo_line.txt 2>&1

+ 99 - 0
choozmo/ads_selenium.py

@@ -0,0 +1,99 @@
+import time
+import json
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import os
+import urllib.parse
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+import codecs
+import random
+import requests
+import datetime
+import dataset
+import time
+import traceback
+import sys
+import fire
+import pymysql
+pymysql.install_as_MySQLdb()  # register PyMySQL under the MySQLdb name
+
+driver = None  # shared browser handle: assigned in run_once(), used by process_query()
+
+
+def process_query(qs):
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
+    table = db['ads']
+    q = qs[0]
+    client = qs[1]
+    global driver
+    googleurl = 'https://www.google.com/webhp?hl=zh-TW&sa=X&ved=0ahUKEwj84vXliMX4AhUYqFYBHcUMAlgQPAgI'
+    driver.get(googleurl)
+    time.sleep(6)
+    send_kw_elmt = driver.find_element(By.XPATH,'/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
+    send_kw_elmt.send_keys(q)
+    time.sleep(3)
+    send_kw_elmt.send_keys(Keys.ENTER)
+    time.sleep(6)
+    elmts_title = driver.find_elements(By.XPATH, "//div[@class='CCgQ5 vCa9Yd QfkTvb MUxGbd v0nnCb']/span")
+    elmts_content = driver.find_elements(By.XPATH, "//div[@class='MUxGbd yDYNvb lyLwlc']")
+    print(len(elmts_title))
+    print(len(elmts_content))
+    
+    for i, j in zip(elmts_title,elmts_content):
+        title = i.text
+        content = j.text
+        
+        table.insert({'kw': q, 'client': client, 'title': title, 'content': content,
+                              'dt': datetime.datetime.now()})
+
+    db.close()
+
+
+def run_once(q):
+    global driver
+    result = []
+    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.125'
+    s = Service('/Users/zooeytsai/Downloads/chromedriver 4')
+    op = webdriver.ChromeOptions()
+    # options.add_argument('--headless')
+    # options.add_argument('--remote-debugging-port=9222')
+    # options.add_experimental_option("debuggerAddress", "192.168.192.45:9922")
+    # options.add_argument("--user-agent=" + user_agent)
+    # options.add_argument("--incognito")
+    op.add_argument('--disable-dev-shm-usage')
+    op.add_argument('--no-sandbox')
+    op.add_argument(r'user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data')
+    op.add_argument(f"profile-directory=Profile 25")
+    op.add_experimental_option("excludeSwitches", ["enable-automation"])
+    op.add_experimental_option('useAutomationExtension', False)
+    driver = webdriver.Chrome(options=op,
+                              executable_path=r'C:\Users\Administrator\Downloads\chromedriver_win32 (4)\chromedriver')
+    
+    driver = webdriver.Chrome(
+        options=op, service=s)
+    str1 = driver.capabilities['browserVersion']
+    print('版本', str1)
+    driver.delete_all_cookies()
+    driver.set_window_size(1400, 1000)
+    
+    print(q)
+    process_query(q)
+    time.sleep(3)
+    driver.quit()
+
+
+class JParams(object):
+    
+    def get(self, kw, domain):
+        print(kw)
+        print(domain)
+        run_once((kw, domain))
+
+
+if __name__ == '__main__':
+    # fire.Fire(JParams)
+    run_once(('新北 家具 推薦','班尼斯'))  # fixed (keyword, client) pair is run instead of the fire CLI