jared 1 년 전
부모
커밋
0d5192b5d2
5개의 변경된 파일, 486개의 추가 그리고 17개의 삭제
  1. 5 2
      SEO/extract_content.py
  2. 104 15
      SEO/gtrend_newwire.py
  3. 280 0
      deployment/click_choozmo.py
  4. 26 0
      deployment/cronjobs.py
  5. 71 0
      docker-compose.yml

+ 5 - 2
SEO/extract_content.py

@@ -1,7 +1,10 @@
 from newspaper import Article
 from chinese_keybert import Chinese_Extractor
 kw_extractor = Chinese_Extractor()
-url='https://www.decorations.com.tw/'
+#url='https://www.momoshop.com.tw/category/MgrpCategory.jsp?m_code=1803900396&cateLevel=2'
+#url='https://www.100.com.tw/article/3471'
+url='http://www.fingermedia.tw/?tag=%E8%91%A3%E4%BA%8B%E9%95%B7%E9%99%B3%E7%99%BE%E6%AC%BD'
+#url='https://www.decorations.com.tw/'
 #url = 'https://www.decorations.com.tw/'
 #url='https://dctdesign.tw/taipei-house-design-top10/'
 #url='https://tw.stock.yahoo.com/news/ccs-insight%E9%A0%90%E6%B8%ACaigc%E8%A2%AB%E9%81%8E%E5%BA%A6%E7%82%92%E4%BD%9C-%E6%98%8E%E5%B9%B4%E5%B0%87-%E9%99%8D%E6%BA%AB-003743296.html'
@@ -12,5 +15,5 @@ article.parse()
 txt=article.text
 print(txt)
 text=[txt]
-result = kw_extractor.generate_keywords(text,top_k=40,rank_methods="mmr",diversity=0.6)
+result = kw_extractor.generate_keywords(text,top_k=50,rank_methods="mmr",diversity=0.6)
 print(result)

+ 104 - 15
SEO/gtrend_newwire.py

@@ -8,23 +8,41 @@ from seleniumwire.utils import decode
 import sys
 import json
 import dataset
+import os
 db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrend2?charset=utf8mb4')
+table_logs=db['gtrend_logs']
 table=db['topics']
 singles={}
-cursor=db.query('select distinct sessionid,query from topics ')
-for c in cursor:
-    singles[(c['sessionid'],c['query'])]=1
 def init_webdriver():
+#    os.system('taskkill /f /im chrome.exe')
+
     options = webdriver.ChromeOptions()
+    options.add_argument("--disable-blink-features=AutomationControlled") 
     options.add_argument('--ignore-certificate-errors')
-    options.add_argument("--no-sandbox")
+    options.add_experimental_option("excludeSwitches", ["enable-automation"]) 
+    options.add_experimental_option("useAutomationExtension", False)
+#    options.debugger_address = "127.0.0.1:" + '8888'
+    
+#    options.add_argument("--no-sandbox")
 #    options.add_argument("--headless")
+#    options.add_argument("--incognito")
+
     options.add_argument("--disable-gpu")
     options.add_argument("--disable-dev-shm-usage")
+    options.add_argument("user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data\\")
+#    options.add_argument('--profile-directory=Profile 7')
+#    options.add_argument('--profile-directory=Profile 47')
+    options.add_argument('--profile-directory=Default')
+#    options.add_argument('--profile-directory=Profile 64')
+
+#    options.add_argument('--profile-directory=Profile 101')
+
+
     driver = webdriver.Chrome(
         options=options
     )
     driver.set_window_size(1400,1000)
+    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") 
 
     return driver
 
@@ -36,10 +54,12 @@ def interceptor(request):
         if 'relatedsearches' in request.url:
             if request.response is not None:
                 rows=[]
+                if request.response.body is None:
+                    continue
     #            print(request.response.body)
                 body = decode(request.response.body,'gzip')
                 bd=body.decode()
-    #            print(body)
+                print(body)
                 bd=bd.replace(r")]}\',\n",'')
                 bd=bd.replace(r")]}',",'')
                 bd=bd.encode().decode('unicode-escape')
@@ -69,34 +89,103 @@ def interceptor(request):
 #            print(bd)
 
 #sessionid='20231014-關鍵字'
-sessionid='20231014-HHH'
+#sessionid='20231018-ChoozMo'
+#sessionid='20231024-AI'
+#sessionid='20231124-HHH'
+#sessionid='20231201-HHH'
+sessionid='20240119-HHH'
+
+cursor=db.query('select distinct sessionid,query from topics ')
+for c in cursor:
+    singles[(c['sessionid'],c['query'])]=1
+
 
 driver=init_webdriver()
 driver.request_interceptor = interceptor
 
-driver.get('https://trends.google.com.tw/')
-time.sleep(0.5)
+#driver.get('https://google.com.tw/')
+
+#driver.get('https://trends.google.com.tw/')
+#time.sleep(9999)
+
 #driver.get('https://trends.google.com.tw/trends/explore?geo=TW&hl=zh-TW')
+#driver.get('https://trends.google.com.tw/trends/')
+
+#time.sleep(3)
+
+#elmt = driver.find_element(By.XPATH, "//textarea[@type='search']")
+
+#time.sleep(1)
+#elmt.send_keys('家具')
+#elmt.send_keys(Keys.ENTER)
+#time.sleep(5)
+
 driver.get('https://trends.google.com.tw/trends/explore?date=now%207-d&geo=TW&hl=zh-TW')
-time.sleep(2)
-#elmt = driver.find_element(By.XPATH, "//div[@jsname='E470yf']//input[@aria-label='搜尋']")
-elmt = driver.find_element(By.XPATH, "//input[@aria-label='新增搜尋字詞']")
+time.sleep(5)
+#kw_list=['風水','小坪數','老宅','購屋','買房',]
+
+#kw_list=['鍋','洗衣機','冷氣','除濕機','烘碗機','床墊']
+kw_list=['/m/01c979','/g/122rvzch','/g/1q6jh4d9s','/m/0c_jw','/m/0d4wf','/m/0bl2jb','/g/11sr9_h44g','/m/06ht1','/m/03gfsp','/m/06wqb','/g/121kx11r','/m/02cwm','/m/02rfdq','/m/01j2bj','/g/11sr9_mdk7']
+#%2Fm%2F01748f
+#%2Fm%2F02vkqh8
+#'室內裝修'
+#%2Fm%2F02z51p
+#%2Fm%2F0m8q5
+#%2Fm%2F04vct9
+#kw_list=['建材']
+#kw_list=['/m/0mkz']
+#kw_list=['nvidia']
+
+#kw_list=['沙發']
+#房價
+#kw_list=['系統櫃']
+
+
+for kw in kw_list:
+    try:
+        table_logs.insert({'kw':kw,'sessionid':sessionid})
+    except:
+        print('dup')
+    print(kw)
+    #elmt = driver.find_element(By.XPATH, "//div[@jsname='E470yf']//input[@aria-label='搜尋']")
+    elmt = driver.find_element(By.XPATH, "//input[@aria-label='新增搜尋字詞']")
+    elmt.clear()
+    for i in range(20):
+        elmt.send_keys(Keys.BACK_SPACE)
+    elmt.send_keys(kw)
+
+    elmt.send_keys(Keys.ENTER)
+    time.sleep(11)
+time.sleep(9999)
+
+#kw_list=['/g/11j7ys83vr','/g/1yqccwk9n']
+#,'/m/019dx1']
+#kw_list=['/m/01c979','/g/122rvzch','/g/1q6jh4d9s']
+#kw_list=['/m/0c_jw','/m/0d4wf','/m/0bl2jb']
+
+#kw_list=['/g/11sr9_h44g','/m/06ht1','/m/03gfsp']
+#kw_list=['/m/06wqb','/g/121kx11r','/m/02cwm']
+
+#kw_list=['/m/02rfdq','/m/01j2bj','/g/11sr9_mdk7']
 
 print(elmt)
 time.sleep(1)
 
 #elmt.send_keys(Keys.ENTER)
-elmt.clear()
+#elmt.clear()
 #ais=['/m/0mkz','/g/11rsc2xsp1']
 # 電商'/m/02m96'
 
-elmt.send_keys('/m/0fy6m3')
-elmt.send_keys(Keys.ENTER)
+#elmt.send_keys('/m/0fy6m3')
+#elmt.send_keys('/m/077mq')
+
+#elmt.send_keys(Keys.ENTER)
 
 
 
-time.sleep(5)
 
+time.sleep(5)
+#time.sleep(9999)
 
 #https://trends.google.com.tw/trends/api/widgetdata/relatedsearches?hl=zh-TW&tz=-480&req=%7B%22restriction%22:%7B%22geo%22:%7B%22country%22:%22TW%22%7D,%22time%22:%222023-10-13T06%5C%5C:10%5C%5C:54+2023-10-14T06%5C%5C:10%5C%5C:54%22,%22originalTimeRangeForExploreUrl%22:%22now+1-d%22,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22%E5%8B%95%E7%89%A9%22%7D%5D%7D%7D,%22keywordType%22:%22QUERY%22,%22metric%22:%5B%22TOP%22,%22RISING%22%5D,%22trendinessSettings%22:%7B%22compareTime%22:%222023-10-12T06%5C%5C:10%5C%5C:54+2023-10-13T06%5C%5C:10%5C%5C:54%22%7D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22CM%22,%22category%22:0%7D,%22language%22:%22zh%22,%22userCountryCode%22:%22TW%22,%22userConfig%22:%7B%22userType%22:%22USER_TYPE_LEGIT_USER%22%7D%7D&token=APP6_UEAAAAAZSuCbrHsaUiytOcIA80ZR-ChhKV3nwvA
 #driver.get('https://trends.google.com.tw/trends/explore?q=%E5%8F%B0%E7%A9%8D%E9%9B%BB%E9%81%8B%E5%8B%95%E6%9C%83&date=now%201-d&geo=TW&hl=zh-TW')

+ 280 - 0
deployment/click_choozmo.py

@@ -0,0 +1,280 @@
+#import redis
+import time
+import traceback
+#import json
+from selenium import webdriver
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import time
+#import urllib
+import os
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+import dataset
+from selenium.webdriver.common.keys import Keys
+import json
+import random
+import time
+#import redis
+import sys
+import codecs
+import random
+import os
+import time
+import requests
+driver=None
+dockername='p4444'
+
+is_docker=True
+#is_docker=False
+db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
+
+#db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
+
+
+if is_docker:
+    portnum=random.randint(4444,4555)
+    print(portnum)
+    os.system('docker container stop '+dockername)
+    time.sleep(0.5)
+    os.system('docker container rm '+dockername)
+    time.sleep(0.5)
+    os.system('docker run -d -p '+str(portnum)+':4444 --shm-size=2g --name '+dockername+' --dns 168.95.1.1 selenium/standalone-chrome:103.0')
+    time.sleep(7)
+
+
+def re_get_webdriver():
+    global port
+    global driver
+    global portnum
+    global is_docker
+    result=[]
+    if driver is not None:
+        print('closing....')
+        driver.quit()
+        print('quit....')
+        driver=None
+    try:
+        options = webdriver.ChromeOptions()
+        options.add_argument("--no-sandbox")
+        options.add_argument("--headless")
+        options.add_argument("--incognito")
+#        options.add_argument('--proxy-server=socks5://172.104.92.245:14900')
+
+        mobile_emulation = {
+            "deviceMetrics": { "width": 360, "height": 640, "pixelRatio": 3.0 },
+            "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19" }
+#        options.add_experimental_option("mobileEmulation", mobile_emulation)
+
+        if is_docker:
+            try:
+                driver = webdriver.Remote(
+                    command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
+                options=options)
+            except:
+                traceback.print_exc()
+                time.sleep(9999)
+                return None
+            return driver
+
+
+
+        try:
+            driver = webdriver.Chrome(options=options)
+
+        except:
+            traceback.print_exc()
+            return None
+        return driver
+    except:
+        traceback.print_exc()
+        driver=None
+        return None
+    return driver
+
+
+
+def run_once(jsobj):
+
+    table=db['seo_jobs_ranking']
+    history=db['seo_search_history']
+
+    print(jsobj)
+    kw=jsobj['kw']
+
+    i=100
+    while True:
+        driver=re_get_webdriver()
+        print('re_get_webdriver')
+        if driver is not None:
+            break
+        time.sleep(3)
+    try:
+        kw=jsobj['kw']
+        if jsobj.get('domain') is None:
+            exclude=jsobj['exclude']
+            domain=None
+        else:
+            domain=jsobj['domain']
+            exclude=None
+        driver.get('https://www.google.com?num=100')
+        time.sleep(1)
+        while True:
+            try:
+                print(driver.current_url)
+                break
+            except:
+                traceback.print_exc()
+                driver=re_get_webdriver()
+                time.sleep(3)
+                driver.get('https://www.google.com?num=100')
+#                time.sleep(3)
+
+            time.sleep(3)
+
+#        elmt = driver.find_element(By.XPATH, "//input[@name='q']")
+        elmt = driver.find_element(By.XPATH, "//textarea[@type='search']")
+
+        time.sleep(1)
+        elmt.send_keys(kw)
+        elmt.send_keys(Keys.ENTER)
+        time.sleep(3)
+
+#        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
+        elmts = driver.find_elements(By.XPATH, "//a[@jsname='UWckNb']")
+
+
+        numresults=len(elmts)
+        
+        print('搜尋結果數量',numresults)
+        if numresults==0:
+            print(driver.current_url)
+            print(driver.title)
+            sys.exit()
+#        time.sleep(9999)
+
+        idx=1
+        found=False
+        test_lst=[]
+        clickelmt=None
+        neg_count=0
+        neg_total=0
+        clickidx=0
+        clickhref=''
+        clicktitle=''
+        for elmt in elmts:
+            href=elmt.get_attribute('href')
+            txt=elmt.text
+            history.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt})
+            if '坑殺' in txt or '侵占' in txt or '判決書' in txt or '強佔' in txt or '掏空' in txt or '送達公告' in txt or '違反勞動'in txt:
+                neg_count+=1
+                neg_total+=idx
+            if len(txt)>10:
+                if domain is not None:
+                    random.shuffle(domain)
+                    for d in domain:
+                        if d in href:
+                            print('found....')
+                            print('clicked....')
+                            print(href)
+                            print(txt)
+                            print("ranking", idx)
+                            found=True
+                            clickelmt=elmt
+                            clickidx=idx
+                            clickhref=href
+                            clicktitle=txt
+
+                else:
+                    if exclude not in href:
+                        test_lst.append(elmt)
+
+
+
+                    
+            idx+=1
+        if exclude is not None:
+            print('exclude')
+            elmt=random.choice(test_lst)
+            print(elmt)
+
+            webdriver.ActionChains(driver).move_to_element(elmt).perform()
+            webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+            time.sleep(5)
+        if neg_count ==0:
+            negstr='0'
+        else:
+            negstr=str(neg_total/neg_count)
+        print(' negative: ' +negstr)
+        if not found:
+            True
+            table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄'})
+        else:
+            webdriver.ActionChains(driver).move_to_element(clickelmt).perform()
+            webdriver.ActionChains(driver).move_to_element(clickelmt).click().perform()
+            print('clicked...')
+            table.insert({'ranking':clickidx,'kw':kw,'results':numresults,'url':clickhref,'title':clicktitle,'avg_neg':negstr})
+            time.sleep(6)
+            print('sleep 6')
+            return
+
+
+    except:
+        traceback.print_exc()
+
+        print('exception')
+        traceback.print_exc()
+
+    driver.quit()
+time.sleep(5)
+
+#r=random.randint(0,7)
+r=987
+
+#JNOTE: 關鍵字點擊
+related=''
+
+cursor=db.query('SELECT cust,plan,prefix,domain,kw,positive FROM public.seo_jobs order by random() limit 1')
+for c in cursor:
+    cust=c['cust']
+    kw=c['kw']
+    plan=c['plan']
+    prefix=c['prefix']
+    domain=eval(c['domain'])
+    positive=eval(c['positive'])
+    break
+if r==6:
+    cust='CHOOZMO'
+#    plan='補lost'
+    plan='文章'
+    prefix=''
+    postfix=''
+    domain=['choozmo.com']
+#    positive=['集仕多']
+#    positive=['集仕多 AIGV']
+    positive=['集仕多 三立']
+
+#    positive=['台北室內設計公司排名']
+#    positive=[related]
+#    positive=['半 日照 植物 推薦']
+#    positive=['3 坪 多大']  
+#    positive=['鞋櫃']
+#    positive=['裝修屋子']
+#    positive=['']
+#    kw='幸福空間'
+#    kw='輕裝修'
+#    kw='輕裝修'
+
+
+#朱英凱
+#琢隱設計
+
+#os.system('curl --socks5 choozmo:choozmo9@172.104.92.245:14900 http://www.google.com')
+
+newkw=prefix+" "+kw+' '+random.choice(positive)
+print(newkw)
+#newkw=kw
+run_once({'domain':domain,'kw':newkw})
+
+
+

+ 26 - 0
deployment/cronjobs.py

@@ -0,0 +1,26 @@
+import time
+from apscheduler.schedulers.blocking import BlockingScheduler
+from datetime import datetime
+import os
+
+def job1():
+#    os.chdir(r'C:/gitlab/news_seo1/content_farm/AI/')
+    os.system('python3 click_choozmo.py')
+    print(f'工作1啟動: 目前時間{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
+
+scheduler = BlockingScheduler(timezone="Asia/Taipei")
+
+job1()
+# 每1分鐘執行job1函式
+scheduler.add_job(job1, 'interval', minutes=1)
+
+#scheduler.add_job(job2, 'interval', seconds=5)
+# 每1秒執行job3函式
+#scheduler.add_job(job3, 'interval', seconds=1)
+# 每週二到日的下午6點30分執行job4函式
+#scheduler.add_job(job4, 'cron', day_of_week='1-6', hour=18, minute=30)
+
+scheduler.start()
+
+print('Schedule started ...')  # 這行不會被執行
+

+ 71 - 0
docker-compose.yml

@@ -0,0 +1,71 @@
+version: "3"
+
+services:
+  # Text-generation-inference server; published on host port 3000.
+  huggingface_inference:
+    image: ghcr.io/huggingface/text-generation-inference:1.1.0
+    volumes:
+      - ./models/:/data
+    ports:
+      - "3000:80"
+    shm_size: '1gb'
+    command: --model-id meta-llama/Llama-2-7b-chat-hf --sharded true --num-shards 2
+#    --json-output
+    environment:
+      # - num_shard=1
+      # Each value is interpolated by Compose from the shell environment or .env file.
+      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
+      - DISABLE_CUSTOM_KERNELS=${DISABLE_CUSTOM_KERNELS}
+      # The "$" was missing here, so the container received the literal
+      # text "{HF_HUB_ENABLE_HF_TRANSFER}" instead of the configured value.
+      - HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    working_dir: /app
+
+  mongo_chatui:
+    image: mongo:latest
+    ports:
+      - "27017:27017"
+
+  chat_ui:
+    build:
+      context: ./chat-ui
+      dockerfile: Dockerfile
+    command: >
+      sh -c "npm run dev -- --host"
+    volumes:
+      - ./chat-ui/.env.local:/app/chat-ui/.env.local
+    ports:
+      - "5173:5173"
+    depends_on:
+      - mongo_chatui
+      - huggingface_inference
+
+#  llamacpp:
+#    image: ghcr.io/ggerganov/llama.cpp:full-cuda
+  # debug:
+  #   image: ubuntu:latest
+  #   entrypoint: /bin/sh
+  #   stdin_open: true # docker run -i
+  #   tty: true        # docker run -t
+  #   volumes:
+  #     - ./models/:/data
+
+# 70b
+# meta-llama/Llama-2-70b-chat-hf
+
+# 70b q
+# TheBloke/Llama-2-70B-chat-AWQ
+
+# 7b
+# meta-llama/Llama-2-7b-chat-hf
+# mistralai/Mistral-7B-Instruct-v0.1
+
+# 7b q
+# TheBloke/Mistral-7B-Instruct-v0.1-AWQ
+# TheBloke/Llama-2-7b-Chat-AWQ
+
+# daryl149/llama-2-7b-chat-hf
+# georgesung/llama2_7b_chat_uncensored