3 years ago · adbd0ec348
--- a/choozmo/fetch_content.py
+++ b/choozmo/fetch_content.py
@@ -2,13 +2,41 @@
 
				 from bs4 import BeautifulSoup
			
 
				 import requests
			
 
				 import html2text
			
 
				+import jieba
			
 
				+import dataset
			
 
				+
			
 
				+jieba.load_userdict("c:/tmp/userdict.txt")
			
 
				+stopwords='， 的/-。*.|)(][_!、」「:：jpgmenu有了也gif%stylespnghttpsimagesicogovRSSscript'
			
 
				+db = dataset.connect('sqlite:///c:/tmp/jieba.db')
			
 
				+db.query('delete from tmp')
			
 
				+#db.query('drop table tmp')
			
 
				+
			
 
				+urls=['https://oghome.com.tw/%E4%B9%B3%E8%86%A0%E5%BA%8A%E5%A2%8A/']
			
 
				+
			
 
				+
			
 
				+#db = dataset.connect('sqlite:///:memory:')
			
 
				+table=db['tmp']
			
 
				 
			
 
				 # request web page
			
 
				-resp = requests.get("https://casino543.com/2021%E5%B9%B4%E5%8D%81%E5%A4%A7%E7%B7%9A%E4%B8%8A%E5%A8%9B%E6%A8%82%E5%9F%8E%E6%8E%92%E5%90%8D%E6%8E%A8%E8%96%A6-%E5%A8%9B%E6%A8%82%E5%9F%8E%E5%89%8D100%E5%90%8D%E5%A8%9B%E6%A8%82%E5%9F%8E%E9%82%84/")
			
 
				-html = resp.content
			
 
				-html=html.decode('utf-8')
			
 
				-h = html2text.HTML2Text()
			
 
				+#resp = requests.get("https://casino543.com/2021%E5%B9%B4%E5%8D%81%E5%A4%A7%E7%B7%9A%E4%B8%8A%E5%A8%9B%E6%A8%82%E5%9F%8E%E6%8E%92%E5%90%8D%E6%8E%A8%E8%96%A6-%E5%A8%9B%E6%A8%82%E5%9F%8E%E5%89%8D100%E5%90%8D%E5%A8%9B%E6%A8%82%E5%9F%8E%E9%82%84/")
			
 
				+#resp = requests.get("https://mort.moi.gov.tw/frontsite/cms/newsAction.do?method=viewContentDetail&iscancel=true&contentId=MjU3NA==")
			
 
				+#resp = requests.get("https://www.memory.com.tw/funeral_ceremony-in.php?i=5&c=3")
			
 
				+for url in urls:
			
 
				+    resp = requests.get(url)
			
 
				+    html = resp.content
			
 
				+    html=html.decode('utf-8')
			
 
				+    h = html2text.HTML2Text()
			
 
				+
			
 
				+    h.ignore_links = True
			
 
				+
			
 
				+    docs=h.handle(html )
			
 
				+    words = jieba.cut(docs, cut_all=False)
			
 
				+    for word in words:
			
 
				+        if word not in stopwords:
			
 
				+            table.insert({'word':word})
			
 
				 
			
 
				-h.ignore_links = True
			
 
				-print (h.handle(html ))
			
 
				 
			
 
				+cursor=db.query('select word,count(word) as cnt from tmp group by word having count(word) >2 order by count(word) desc')
			
 
				+for c in cursor:
			
 
				+    print(c['word'])
			
 
				+    print(c['cnt'])
			
--- a/hhh/Browser_ads_kw.py
+++ b/hhh/Browser_ads_kw.py
@@ -6,6 +6,9 @@ import dataset
 
				 import pickle
			
 
				 import traceback
			
 
				 import codecs
			
 
				+from selenium.webdriver.common.by import By
			
 
				+from selenium.webdriver.support.ui import WebDriverWait
			
 
				+from selenium.webdriver.support import expected_conditions as EC
			
 
				 from selenium.webdriver.common import keys
			
 
				 from selenium.webdriver.common.keys import Keys
			
 
				 import sys
			
@@ -16,29 +19,35 @@ import pandas as pd
 
				 from browser_common import JBrowser
			
 
				 import datetime
			
 
				 import dataset
			
 
				+import glob
			
 
				 
			
 
				 #db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')
			
 
				 
			
 
				 
			
 
				-def get_designer_statistics():
			
 
				+def get_designer_statistics(kw):
			
 
				     global db
			
 
				-    jb=JBrowser()
			
 
				     data=""
			
 
				-    jb.set_profile_path("Profile 7")
			
 
				+    chrome_options = webdriver.ChromeOptions()
			
 
				+    chrome_options.add_argument("--no-sandbox")
			
 
				+    chrome_options.add_argument("--disable-dev-shm-usage")
			
 
				+    chrome_options.add_argument("start-maximized")
			
 
				+    chrome_options.add_argument("user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data")
			
 
				+    chrome_options.add_argument("--user-profile=Profile 7")
			
 
				+    driver = webdriver.Chrome(chrome_options=chrome_options)
			
 
				+
			
 
				+
			
 
				 #    googleurl='https://ads.google.com/aw/keywordplanner/'
			
 
				 #    googleurl='https://ads.google.com/aw/overview?ocid=600024232&euid=459838964&__u=6055300436&uscid=600024232&__c=2195332968&authuser=0&subid=ALL-zh-TW-et-g-aw-c-home-awhp_xin1_signin!o2'
			
 
				 #    googleurl='https://ads.google.com/aw/keywordplanner/home?ocid=600024232&euid=459838964&__u=6055300436&uscid=600024232&__c=2195332968&authuser=0&subid=ALL-zh-TW-et-g-aw-c-home-awhp_xin1_signin%21o2'
			
 
				     googleurl='https://ads.google.com/aw/keywordplanner/home?ocid=600024232&euid=459838964&__u=6055300436&uscid=600024232&__c=2195332968&authuser=0&subid=ALL-zh-TW-et-g-aw-c-home-awhp_xin1_signin%21o2'
			
 
				 #    googleurl='https://ads.google.com/aw/overview?ocid=732105824&euid=459838964&__u=6055300436&uscid=732105824&__c=5922164576&authuser=0'
			
 
				-    jb.get(googleurl)
			
 
				-    driver=jb.get_driver()
			
 
				+    driver.get(googleurl)
			
 
				+
			
 
				     driver.refresh()
			
 
				 
			
 
				     time.sleep(5)
			
 
				     driver.refresh()
			
 
				-
			
 
				-    time.sleep(15)
			
 
				-#    elmts=driver.find_elements_by_xpath("//div[@role='button' and contains(text(),'尋找新的關鍵字')]")
			
 
				+    element = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'尋找新的關鍵字')]/../..")))
			
 
				     elmts=driver.find_elements_by_xpath("//span[contains(text(),'尋找新的關鍵字')]/../..")
			
 
				     print(elmts)
			
 
				     for elmt in elmts:
			
@@ -48,38 +57,50 @@ def get_designer_statistics():
 
				         webdriver.ActionChains(driver).move_to_element(elmt).perform()
			
 
				         webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
			
 
				 
			
 
				-    time.sleep(15)
			
 
				+    elmt=WebDriverWait(driver, 25).until(EC.presence_of_element_located((By.XPATH, "//input[contains(@class,'search-input')]")))
			
 
				 
			
 
				-    elmt=driver.find_element_by_xpath("//input[contains(@class,'search-input')]")
			
 
				+#    elmt=driver.find_element_by_xpath("//input[contains(@class,'search-input')]")
			
 
				 #    elmt.send_keys("紓困")
			
 
				-    elmt.send_keys("疫苗")
			
 
				+#    elmt.send_keys("疫苗")
			
 
				+#    elmt.send_keys("直播")
			
 
				+#    elmt.send_keys("影片製作")
			
 
				+    elmt.send_keys(kw)
			
 
				 
			
 
				-    time.sleep(3)
			
 
				+    time.sleep(2)
			
 
				 
			
 
				-    elmt=driver.find_element_by_xpath("//div[contains(text(),'取得結果')]/..")
			
 
				-    print('clicking....')
			
 
				-    webdriver.ActionChains(driver).move_to_element(elmt).perform()
			
 
				-    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
			
 
				+    elmt.send_keys(Keys.ENTER)
			
 
				+
			
 
				+    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//div[contains(text(),'取得結果')]/..")))
			
 
				 
			
 
				-    time.sleep(5)
			
 
				-    elmt=driver.find_element_by_xpath("//span[contains(text(),'關鍵字檢視畫面')]/../../..")
			
 
				     print('clicking....')
			
 
				     webdriver.ActionChains(driver).move_to_element(elmt).perform()
			
 
				     webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
			
 
				 
			
 
				-#    elmt=driver.find_element_by_xpath("//span[contains(text(),'分組檢視畫面')]/../..")
			
 
				-#    elmt=driver.find_element_by_xpath("//span[contains(text(),'分組檢視畫面')]")
			
 
				+    time.sleep(8)
			
 
				+    try:
			
 
				+        elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'關鍵字檢視畫面')]/../../..")))
			
 
				+
			
 
				+#        elmt=driver.find_element_by_xpath("//span[contains(text(),'關鍵字檢視畫面')]/../../..")
			
 
				+        print('clicking....')
			
 
				+        webdriver.ActionChains(driver).move_to_element(elmt).perform()
			
 
				+        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
			
 
				+    except:
			
 
				+        traceback.print_exc()
			
 
				     try:
			
 
				-        time.sleep(4)
			
 
				+        time.sleep(7)
			
 
				+
			
 
				+#        elmt=driver.find_element_by_xpath("//material-select-item[contains(@aria-label,'分組檢視畫面')]")
			
 
				+#        elmt=driver.find_element_by_xpath("//span[contains(@aria-label,'分組檢視畫面')]")
			
 
				+        elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'分組檢視畫面')]")))
			
 
				+
			
 
				+        webdriver.ActionChains(driver).move_to_element(elmt).perform()
			
 
				+        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
			
 
				 
			
 
				-        elmt=driver.find_element_by_xpath("//material-select-item[contains(@aria-label,'分組檢視畫面')]")
			
 
				     except:
			
 
				         print('except')
			
 
				         traceback.print_exc()
			
 
				     print('clicking....')
			
 
				 
			
 
				-    webdriver.ActionChains(driver).move_to_element(elmt).perform()
			
 
				-    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
			
 
				 
			
 
				 #    try:
			
 
				 #        time.sleep(8)
			
@@ -91,26 +112,115 @@ def get_designer_statistics():
 
				 
			
 
				     print('clicking....')
			
 
				 
			
 
				+    time.sleep(10)
			
 
				+#    elmt=driver.find_element_by_xpath("//material-menu[contains(@class,'download download-menu')]")
			
 
				+#    elmt=driver.find_element_by_xpath("//material-menu[contains(@class,'download download-menu')]//material-button[contains(@class,'trigger-button')]//i[@role='img']")
			
 
				+    elmt=driver.find_element_by_xpath("//material-menu[contains(@class,'download download-menu-compact')]//material-ripple")
			
 
				+    print(elmt)
			
 
				+    print(elmt.text)
			
 
				+
			
 
				+    webdriver.ActionChains(driver).move_to_element(elmt).perform()
			
 
				+    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
			
 
				+#    body=driver.find_element_by_xpath('//body')
			
 
				+    time.sleep(5)
			
 
				+
			
 
				+#    elmt=driver.find_element_by_xpath("//span[contains(@class,'menu-item-label') and contains(text(),'.csv')]")
			
 
				+#    elmt=driver.find_element_by_xpath("/html/body/div[4]/div[6]/div/div/div[2]/div[2]/div/menu-item-groups/div/material-select-item[1]/span/span")
			
 
				+#    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[4]/div[6]/div/div/div[2]/div[2]/div/menu-item-groups/div/material-select-item[1]/span/span")))
			
 
				+#    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(@class,'menu-item-label') and contains(text(),'.csv')]")))
			
 
				+    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//material-select-item[@aria-label='.csv']" )))
			
 
				+#    elmt = WebDriverWait(driver, 25).until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'.csv')]")))
			
 
				+
			
 
				 
			
 
				 
			
 
				 
			
 
				-    time.sleep(40)
			
 
				+#    elmt=driver.find_element_by_xpath("//div/material-select-item[1]/span/span[contains(text(),'.csv')]")
			
 
				+    print(elmt)
			
 
				+    print(elmt.text)
			
 
				+
			
 
				+    webdriver.ActionChains(driver).move_to_element(elmt).perform()
			
 
				+    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
			
 
				+    
			
 
				+#    body.send_keys(Keys.ARROW_DOWN)
			
 
				+##    time.sleep(1)
			
 
				+
			
 
				+#    body.send_keys(Keys.ARROW_DOWN)
			
 
				+#    time.sleep(1)
			
 
				+#    body.send_keys(Keys.ENTER)
			
 
				+
			
 
				+#    elmt=driver.find_element_by_xpath("//span[contains(@class,'menu-item-label') and contains(text(),'.csv')]")
			
 
				+#    elmt=driver.find_element_by_xpath("//span[contains(@class,'menu-item-label') and contains(text(),'.csv')]")
			
 
				+
			
 
				+
			
 
				+#    print(elmt)
			
 
				+#    print(elmt.text)
			
 
				+
			
 
				+
			
 
				+
			
 
				+    time.sleep(10)
			
 
				     print('after sleep')
			
 
				 #    elmts=driver.find_elements_by_xpath("//div[@class='keyword-text _ngcontent-owh-97']")
			
 
				-    elmts=driver.find_elements_by_xpath("//zippy-icon/..//keyword-text")
			
 
				-    for elmt in elmts:
			
 
				-        print(elmt.text)
			
 
				-        data+=elmt.text+"\n"
			
 
				+#    elmts=driver.find_elements_by_xpath("//zippy-icon/..//keyword-text")
			
 
				+#    for elmt in elmts:
			
 
				+#        print(elmt.text)
			
 
				+#        data+=elmt.text+"\n"
			
 
				 
			
 
				-    fw=codecs.open('c:/tmp/out.txt','w','utf-8')
			
 
				-    fw.write(data)
			
 
				-    fw.close()
			
 
				+#    fw=codecs.open('c:/tmp/out.txt','w','utf-8')
			
 
				+#    fw.write(data)
			
 
				+#    fw.close()
			
 
				 
			
 
				 #        print(elmt)
			
 
				 
			
 
				-    time.sleep(9999)
			
 
				+#    time.sleep(9999)
			
 
				 
			
 
				     return 'ok'
			
 
				 
			
 
				 
			
 
				-get_designer_statistics()
			
 
				+def proc_latest_file():
			
 
				+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
			
 
				+    table=db['wordprice']
			
 
				+    list_of_files = glob.glob('C:\\Users\\jared\\Downloads\\*.csv')
			
 
				+#    print(list_of_files)
			
 
				+    latest_file = max(list_of_files, key=os.path.getmtime)
			
 
				+    print(latest_file)
			
 
				+    fr=codecs.open(latest_file,'r','utf-16')
			
 
				+    lines=fr.readlines()
			
 
				+    for l in lines[3:]:
			
 
				+        elmts=l.split('\t')
			
 
				+        month=elmts[2]
			
 
				+        if '--' in month:
			
 
				+            month=0
			
 
				+
			
 
				+        if len(month)<=0:
			
 
				+            month=0
			
 
				+
			
 
				+        change3m=elmts[3]
			
 
				+        change3m=change3m.replace('%','')
			
 
				+        if '--' in change3m:
			
 
				+            change3m=0
			
 
				+        if change3m=='∞':
			
 
				+            change3m=99999
			
 
				+        change1y=elmts[4]
			
 
				+        change1y=change1y.replace('%','')
			
 
				+        if '--' in change1y:
			
 
				+            change1y=0
			
 
				+        if change1y=='∞':
			
 
				+            change1y=99999
			
 
				+
			
 
				+        if len(elmts)<=28:
			
 
				+            brand=''
			
 
				+        else:
			
 
				+#            print(len(elmts))
			
 
				+            brand=elmts[28]
			
 
				+        entry={'keyword':elmts[0],'month':int(month),'change3m':int(change3m),'change1y':int(change1y),'comptetion':elmts[5],'compidx':elmts[6],'low':elmts[7],'high':elmts[8],'brand':brand,'dt':datetime.date.today()}
			
 
				+        table.insert(entry)
			
 
				+        print(entry)
			
 
				+    fr.close()
			
 
				+
			
 
				+#get_designer_statistics('關鍵字')
			
 
				+#get_designer_statistics('影片特效')
			
 
				+#get_designer_statistics('行銷')
			
 
				+#get_designer_statistics('生前契約')
			
 
				+get_designer_statistics('塔位')
			
 
				+
			
 
				+proc_latest_file()