Jared 2 年前
コミット
adbd0ec348
2 ファイル変更、178 行追加、40 行削除
  1. + 34 - 6
      choozmo/fetch_content.py
  2. + 144 - 34
      hhh/Browser_ads_kw.py

+ 34 - 6
choozmo/fetch_content.py

@@ -2,13 +2,41 @@
 from bs4 import BeautifulSoup
 import requests
 import html2text
+import jieba
+import dataset
+
+jieba.load_userdict("c:/tmp/userdict.txt")
+stopwords=', 的/-。*.|)(][_!、」「::jpgmenu有了也gif%stylespnghttpsimagesicogovRSSscript'
+db = dataset.connect('sqlite:///c:/tmp/jieba.db')
+db.query('delete from tmp')
+#db.query('drop table tmp')
+
+urls=['https://oghome.com.tw/%E4%B9%B3%E8%86%A0%E5%BA%8A%E5%A2%8A/']
+
+
+#db = dataset.connect('sqlite:///:memory:')
+table=db['tmp']
 
 # request web page
-resp = requests.get("https://casino543.com/2021%E5%B9%B4%E5%8D%81%E5%A4%A7%E7%B7%9A%E4%B8%8A%E5%A8%9B%E6%A8%82%E5%9F%8E%E6%8E%92%E5%90%8D%E6%8E%A8%E8%96%A6-%E5%A8%9B%E6%A8%82%E5%9F%8E%E5%89%8D100%E5%90%8D%E5%A8%9B%E6%A8%82%E5%9F%8E%E9%82%84/")
-html = resp.content
-html=html.decode('utf-8')
-h = html2text.HTML2Text()
+#resp = requests.get("https://casino543.com/2021%E5%B9%B4%E5%8D%81%E5%A4%A7%E7%B7%9A%E4%B8%8A%E5%A8%9B%E6%A8%82%E5%9F%8E%E6%8E%92%E5%90%8D%E6%8E%A8%E8%96%A6-%E5%A8%9B%E6%A8%82%E5%9F%8E%E5%89%8D100%E5%90%8D%E5%A8%9B%E6%A8%82%E5%9F%8E%E9%82%84/")
+#resp = requests.get("https://mort.moi.gov.tw/frontsite/cms/newsAction.do?method=viewContentDetail&iscancel=true&contentId=MjU3NA==")
+#resp = requests.get("https://www.memory.com.tw/funeral_ceremony-in.php?i=5&c=3")
+for url in urls:
+    resp = requests.get(url)
+    html = resp.content
+    html=html.decode('utf-8')
+    h = html2text.HTML2Text()
+
+    h.ignore_links = True
+
+    docs=h.handle(html )
+    words = jieba.cut(docs, cut_all=False)
+    for word in words:
+        if word not in stopwords:
+            table.insert({'word':word})
 
-h.ignore_links = True
-print (h.handle(html ))
 
+cursor=db.query('select word,count(word) as cnt from tmp group by word having count(word) >2 order by count(word) desc')
+for c in cursor:
+    print(c['word'])
+    print(c['cnt'])

+ 144 - 34
hhh/Browser_ads_kw.py

@@ -6,6 +6,9 @@ import dataset
 import pickle
 import traceback
 import codecs
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common import keys
 from selenium.webdriver.common.keys import Keys
 import sys
@@ -16,29 +19,35 @@ import pandas as pd
 from browser_common import JBrowser
 import datetime
 import dataset
+import glob
 
 #db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')
 
 
-def get_designer_statistics():
+def get_designer_statistics(kw):
     global db
-    jb=JBrowser()
     data=""
-    jb.set_profile_path("Profile 7")
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("start-maximized")
+    chrome_options.add_argument("user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data")
+    chrome_options.add_argument("--user-profile=Profile 7")
+    driver = webdriver.Chrome(chrome_options=chrome_options)
+
+
 #    googleurl='https://ads.google.com/aw/keywordplanner/'
 #    googleurl='https://ads.google.com/aw/overview?ocid=600024232&euid=459838964&__u=6055300436&uscid=600024232&__c=2195332968&authuser=0&subid=ALL-zh-TW-et-g-aw-c-home-awhp_xin1_signin!o2'
 #    googleurl='https://ads.google.com/aw/keywordplanner/home?ocid=600024232&euid=459838964&__u=6055300436&uscid=600024232&__c=2195332968&authuser=0&subid=ALL-zh-TW-et-g-aw-c-home-awhp_xin1_signin%21o2'
     googleurl='https://ads.google.com/aw/keywordplanner/home?ocid=600024232&euid=459838964&__u=6055300436&uscid=600024232&__c=2195332968&authuser=0&subid=ALL-zh-TW-et-g-aw-c-home-awhp_xin1_signin%21o2'
 #    googleurl='https://ads.google.com/aw/overview?ocid=732105824&euid=459838964&__u=6055300436&uscid=732105824&__c=5922164576&authuser=0'
-    jb.get(googleurl)
-    driver=jb.get_driver()
+    driver.get(googleurl)
+
     driver.refresh()
 
     time.sleep(5)
     driver.refresh()
-
-    time.sleep(15)
-#    elmts=driver.find_elements_by_xpath("//div[@role='button' and contains(text(),'尋找新的關鍵字')]")
+    element = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'尋找新的關鍵字')]/../..")))
     elmts=driver.find_elements_by_xpath("//span[contains(text(),'尋找新的關鍵字')]/../..")
     print(elmts)
     for elmt in elmts:
@@ -48,38 +57,50 @@ def get_designer_statistics():
         webdriver.ActionChains(driver).move_to_element(elmt).perform()
         webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
 
-    time.sleep(15)
+    elmt=WebDriverWait(driver, 25).until(EC.presence_of_element_located((By.XPATH, "//input[contains(@class,'search-input')]")))
 
-    elmt=driver.find_element_by_xpath("//input[contains(@class,'search-input')]")
+#    elmt=driver.find_element_by_xpath("//input[contains(@class,'search-input')]")
 #    elmt.send_keys("紓困")
-    elmt.send_keys("疫苗")
+#    elmt.send_keys("疫苗")
+#    elmt.send_keys("直播")
+#    elmt.send_keys("影片製作")
+    elmt.send_keys(kw)
 
-    time.sleep(3)
+    time.sleep(2)
 
-    elmt=driver.find_element_by_xpath("//div[contains(text(),'取得結果')]/..")
-    print('clicking....')
-    webdriver.ActionChains(driver).move_to_element(elmt).perform()
-    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+    elmt.send_keys(Keys.ENTER)
+
+    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//div[contains(text(),'取得結果')]/..")))
 
-    time.sleep(5)
-    elmt=driver.find_element_by_xpath("//span[contains(text(),'關鍵字檢視畫面')]/../../..")
     print('clicking....')
     webdriver.ActionChains(driver).move_to_element(elmt).perform()
     webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
 
-#    elmt=driver.find_element_by_xpath("//span[contains(text(),'分組檢視畫面')]/../..")
-#    elmt=driver.find_element_by_xpath("//span[contains(text(),'分組檢視畫面')]")
+    time.sleep(8)
+    try:
+        elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'關鍵字檢視畫面')]/../../..")))
+
+#        elmt=driver.find_element_by_xpath("//span[contains(text(),'關鍵字檢視畫面')]/../../..")
+        print('clicking....')
+        webdriver.ActionChains(driver).move_to_element(elmt).perform()
+        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+    except:
+        traceback.print_exc()
     try:
-        time.sleep(4)
+        time.sleep(7)
+
+#        elmt=driver.find_element_by_xpath("//material-select-item[contains(@aria-label,'分組檢視畫面')]")
+#        elmt=driver.find_element_by_xpath("//span[contains(@aria-label,'分組檢視畫面')]")
+        elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'分組檢視畫面')]")))
+
+        webdriver.ActionChains(driver).move_to_element(elmt).perform()
+        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
 
-        elmt=driver.find_element_by_xpath("//material-select-item[contains(@aria-label,'分組檢視畫面')]")
     except:
         print('except')
         traceback.print_exc()
     print('clicking....')
 
-    webdriver.ActionChains(driver).move_to_element(elmt).perform()
-    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
 
 #    try:
 #        time.sleep(8)
@@ -91,26 +112,115 @@ def get_designer_statistics():
 
     print('clicking....')
 
+    time.sleep(10)
+#    elmt=driver.find_element_by_xpath("//material-menu[contains(@class,'download download-menu')]")
+#    elmt=driver.find_element_by_xpath("//material-menu[contains(@class,'download download-menu')]//material-button[contains(@class,'trigger-button')]//i[@role='img']")
+    elmt=driver.find_element_by_xpath("//material-menu[contains(@class,'download download-menu-compact')]//material-ripple")
+    print(elmt)
+    print(elmt.text)
+
+    webdriver.ActionChains(driver).move_to_element(elmt).perform()
+    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+#    body=driver.find_element_by_xpath('//body')
+    time.sleep(5)
+
+#    elmt=driver.find_element_by_xpath("//span[contains(@class,'menu-item-label') and contains(text(),'.csv')]")
+#    elmt=driver.find_element_by_xpath("/html/body/div[4]/div[6]/div/div/div[2]/div[2]/div/menu-item-groups/div/material-select-item[1]/span/span")
+#    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[4]/div[6]/div/div/div[2]/div[2]/div/menu-item-groups/div/material-select-item[1]/span/span")))
+#    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(@class,'menu-item-label') and contains(text(),'.csv')]")))
+    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//material-select-item[@aria-label='.csv']" )))
+#    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'.csv')]")))
+
 
 
 
-    time.sleep(40)
+#    elmt=driver.find_element_by_xpath("//div/material-select-item[1]/span/span[contains(text(),'.csv')]")
+    print(elmt)
+    print(elmt.text)
+
+    webdriver.ActionChains(driver).move_to_element(elmt).perform()
+    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+    
+#    body.send_keys(Keys.ARROW_DOWN)
+##    time.sleep(1)
+
+#    body.send_keys(Keys.ARROW_DOWN)
+#    time.sleep(1)
+#    body.send_keys(Keys.ENTER)
+
+#    elmt=driver.find_element_by_xpath("//span[contains(@class,'menu-item-label') and contains(text(),'.csv')]")
+#    elmt=driver.find_element_by_xpath("//span[contains(@class,'menu-item-label') and contains(text(),'.csv')]")
+
+
+#    print(elmt)
+#    print(elmt.text)
+
+
+
+    time.sleep(10)
     print('after sleep')
 #    elmts=driver.find_elements_by_xpath("//div[@class='keyword-text _ngcontent-owh-97']")
-    elmts=driver.find_elements_by_xpath("//zippy-icon/..//keyword-text")
-    for elmt in elmts:
-        print(elmt.text)
-        data+=elmt.text+"\n"
+#    elmts=driver.find_elements_by_xpath("//zippy-icon/..//keyword-text")
+#    for elmt in elmts:
+#        print(elmt.text)
+#        data+=elmt.text+"\n"
 
-    fw=codecs.open('c:/tmp/out.txt','w','utf-8')
-    fw.write(data)
-    fw.close()
+#    fw=codecs.open('c:/tmp/out.txt','w','utf-8')
+#    fw.write(data)
+#    fw.close()
 
 #        print(elmt)
 
-    time.sleep(9999)
+#    time.sleep(9999)
 
     return 'ok'
 
 
-get_designer_statistics()
+def proc_latest_file():
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
+    table=db['wordprice']
+    list_of_files = glob.glob('C:\\Users\\jared\\Downloads\\*.csv')
+#    print(list_of_files)
+    latest_file = max(list_of_files, key=os.path.getmtime)
+    print(latest_file)
+    fr=codecs.open(latest_file,'r','utf-16')
+    lines=fr.readlines()
+    for l in lines[3:]:
+        elmts=l.split('\t')
+        month=elmts[2]
+        if '--' in month:
+            month=0
+
+        if len(month)<=0:
+            month=0
+
+        change3m=elmts[3]
+        change3m=change3m.replace('%','')
+        if '--' in change3m:
+            change3m=0
+        if change3m=='∞':
+            change3m=99999
+        change1y=elmts[4]
+        change1y=change1y.replace('%','')
+        if '--' in change1y:
+            change1y=0
+        if change1y=='∞':
+            change1y=99999
+
+        if len(elmts)<=28:
+            brand=''
+        else:
+#            print(len(elmts))
+            brand=elmts[28]
+        entry={'keyword':elmts[0],'month':int(month),'change3m':int(change3m),'change1y':int(change1y),'comptetion':elmts[5],'compidx':elmts[6],'low':elmts[7],'high':elmts[8],'brand':brand,'dt':datetime.date.today()}
+        table.insert(entry)
+        print(entry)
+    fr.close()
+
+#get_designer_statistics('關鍵字')
+#get_designer_statistics('影片特效')
+#get_designer_statistics('行銷')
+#get_designer_statistics('生前契約')
+get_designer_statistics('塔位')
+
+proc_latest_file()