Jared 1 year ago
commit
810df493f0
2 changed files with 119 additions and 0 deletions
  1. 16 0
      SEO/extract_content.py
  2. 103 0
      SEO/gtrend_newwire.py

+ 16 - 0
SEO/extract_content.py

@@ -0,0 +1,16 @@
+"""Print the text of a web page and the keywords extracted from it."""
+from newspaper import Article
+from chinese_keybert import Chinese_Extractor
+
+# Page to analyse.
+url = 'https://www.decorations.com.tw/'
+kw_extractor = Chinese_Extractor()
+article = Article(url)
+article.download()
+article.parse()
+txt = article.text
+print(txt)
+# The extractor is handed a list of texts, here a list of one.
+text = [txt]
+result = kw_extractor.generate_keywords(text, top_k=40, rank_methods="mmr", diversity=0.6)
+print(result)

+ 103 - 0
SEO/gtrend_newwire.py

@@ -0,0 +1,103 @@
+from seleniumwire import webdriver
+import time
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from seleniumwire.utils import decode
+import sys
+import json
+import dataset
+import os  # imported here because only this connection setup needs it
+# GTREND_DB_URL overrides the built-in URL so the password need not live in the source.
+db = dataset.connect(os.environ.get('GTREND_DB_URL', 'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrend2?charset=utf8mb4'))
+table=db['topics']
+# (sessionid, query) pairs already in the table; used to skip duplicate inserts.
+singles = {(c['sessionid'], c['query']): 1 for c in db.query('select distinct sessionid,query from topics ')}
+def init_webdriver():
+    """Return a selenium-wire Chrome driver sized to 1400x1000.
+
+    Headless mode is not enabled, so a browser window is opened.
+    """
+    chrome_options = webdriver.ChromeOptions()
+    for flag in ('--ignore-certificate-errors', "--no-sandbox",
+                 "--disable-gpu", "--disable-dev-shm-usage"):
+        chrome_options.add_argument(flag)
+    browser = webdriver.Chrome(options=chrome_options)
+    browser.set_window_size(1400,1000)
+
+    return browser
+
+
+def interceptor(request):
+    """Store new Google Trends related topics/queries from captured responses.
+
+    selenium-wire calls this hook for every outgoing request; the ``request``
+    argument has no response yet and is not used.  Each call re-scans
+    ``driver.requests`` for answered ``relatedsearches`` calls and inserts
+    the entries not yet recorded for the current ``sessionid``.
+    """
+    for captured in driver.requests:
+        response = captured.response
+        if 'relatedsearches' not in captured.url or response is None:
+            continue
+        # Honour the real Content-Encoding instead of assuming gzip.
+        encoding = response.headers.get('Content-Encoding', 'identity')
+        bd = decode(response.body, encoding).decode()
+        # Skip the ")]}'," guard prefix that precedes the JSON object.  No
+        # 'unicode-escape' pass: json.loads already decodes \uXXXX, and that
+        # pass mangled non-ASCII text and unescaped quotes inside strings.
+        js = json.loads(bd[bd.index('{'):])
+        print(js)
+        rows = []
+        for r in js['default']['rankedList']:
+            for k in r['rankedKeyword']:
+                topic = k.get('topic')
+                if topic is not None:
+                    if (sessionid, topic['title']) not in singles:
+                        singles[(sessionid, topic['title'])] = 1
+                        rows.append({'sessionid': sessionid, 'mid': topic['mid'], 'query': topic['title'], 'type': topic['type'], 'value': k['value']})
+                    print(topic)
+                    print(k['value'])
+                if k.get('query') is not None:
+                    if (sessionid, k['query']) not in singles:
+                        singles[(sessionid, k['query'])] = 1
+                        rows.append({'sessionid': sessionid, 'query': k['query'], 'value': k['value']})
+                    print(k['query'])
+                    print(k['value'])
+        if rows:  # nothing new in this response: skip the database call
+            table.insert_many(rows)
+
+# Label written to the 'sessionid' column of every stored row.
+sessionid='20231014-HHH'
+
+driver=init_webdriver()
+driver.request_interceptor = interceptor
+try:
+    driver.get('https://trends.google.com.tw/')
+    time.sleep(0.5)
+    # Explore view: last 7 days, geo TW, zh-TW interface.
+    driver.get('https://trends.google.com.tw/trends/explore?date=now%207-d&geo=TW&hl=zh-TW')
+    # Wait for the search box rather than assuming it exists after a fixed
+    # sleep; raises TimeoutException if it does not appear within 10 s.
+    elmt = WebDriverWait(driver, 10).until(
+        EC.presence_of_element_located((By.XPATH, "//input[@aria-label='新增搜尋字詞']"))
+    )
+    print(elmt)
+    time.sleep(1)
+
+    elmt.clear()
+    # The search term looks like a topic id (mid).  Other ids noted here
+    # earlier: '/m/0mkz', '/g/11rsc2xsp1', '/m/02m96' (annotated "e-commerce").
+    elmt.send_keys('/m/0fy6m3')
+    elmt.send_keys(Keys.ENTER)
+
+    # Leave time for the page to issue its relatedsearches requests.
+    time.sleep(5)
+finally:
+    # Always close the browser, even if a step above raised.
+    driver.quit()
+
+# Sample URL of the intercepted relatedsearches endpoint, kept for
+# reference:
+#https://trends.google.com.tw/trends/api/widgetdata/relatedsearches?hl=zh-TW&tz=-480&req=%7B%22restriction%22:%7B%22geo%22:%7B%22country%22:%22TW%22%7D,%22time%22:%222023-10-13T06%5C%5C:10%5C%5C:54+2023-10-14T06%5C%5C:10%5C%5C:54%22,%22originalTimeRangeForExploreUrl%22:%22now+1-d%22,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22%E5%8B%95%E7%89%A9%22%7D%5D%7D%7D,%22keywordType%22:%22QUERY%22,%22metric%22:%5B%22TOP%22,%22RISING%22%5D,%22trendinessSettings%22:%7B%22compareTime%22:%222023-10-12T06%5C%5C:10%5C%5C:54+2023-10-13T06%5C%5C:10%5C%5C:54%22%7D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22CM%22,%22category%22:0%7D,%22language%22:%22zh%22,%22userCountryCode%22:%22TW%22,%22userConfig%22:%7B%22userType%22:%22USER_TYPE_LEGIT_USER%22%7D%7D&token=APP6_UEAAAAAZSuCbrHsaUiytOcIA80ZR-ChhKV3nwvA