"""Count the words that appear in Google search results for one keyword.

The script opens a Google search for ``kw`` in a browser obtained from
``JBrowser``, walks up to ``PAGES_TO_SCAN`` result pages, segments the text of
every result block with jieba (full mode), stores each token in an in-memory
SQLite table, and finally writes the per-token counts, most frequent first,
to ``c:/tmp/exp.csv``.
"""

import codecs
import os
import pickle
import re
import sys
import time
from urllib.parse import quote_plus

import dataset
import jieba
import jinja2
import networkx as nx
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common import keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import df2sheet
from browser_common import JBrowser

# Number of result pages to walk through.
PAGES_TO_SCAN = 3

# Tokens are only needed for the duration of one run, so an in-memory
# database is used.  A file-backed alternative used previously:
# db = dataset.connect('sqlite:///c:/tmp/gspace.db')
db = dataset.connect('sqlite:///:memory:')

# Custom dictionaries so jieba keeps domain terms together.
jieba.load_userdict('C:\\src\\farmcodes\\jared\\browser\\dict.txt')
jieba.load_userdict('C:\\tmp\\dict.txt')

table = db['tmp']

jb = JBrowser()
# NOTE(review): presumably selects a browser user profile - confirm in
# browser_common.JBrowser.
jb.set_profile_path("Profile 7")

# Keywords used in earlier runs, kept for reference:
#   收納櫃, 收納櫃推薦, 收納櫃ikea, 收納櫃塑膠, 收納櫃設計, 收納, 系統櫃, 抽屜,
#   系統櫃推薦, 系統櫃價格, 系統櫃廠商, 系統櫃ptt, 系統櫃材質, 系統櫃樣式,
#   系統櫃品牌, 系統櫃衣櫃尺寸, 系統櫃板材, 合砌設計, 富億空間
kw = '空間 設計'

# The keyword contains non-ASCII characters and a space, so it is
# percent-encoded instead of being concatenated into the URL verbatim.
googleurl = 'https://www.google.com/search?q=' + quote_plus(kw)
jb.get(googleurl)

for _page in range(PAGES_TO_SCAN):
    driver = jb.get_driver()
    time.sleep(3)  # give the result page time to render

    # One anchor per organic result.
    result_links = driver.find_elements(
        By.XPATH, "//div[@class='g']//div[@class='yuRUbf']//a")

    for link in result_links:
        try:
            # Climb from the anchor to the enclosing result block and take
            # its first <div>, whose text is what gets segmented.
            block = link.find_element(By.XPATH, "../../../div")
            fulldesc = block.text
            print(fulldesc)
            for token in jieba.cut(fulldesc, cut_all=True):
                print(token)
                table.insert({'kw': token})
        except Exception as exc:
            # Best effort: one bad result must not stop the crawl, but
            # Ctrl-C / SystemExit are no longer swallowed.
            print('except', exc)

    try:
        next_link = driver.find_element(By.XPATH, "//a[@id='pnnext']")
    except NoSuchElementException:
        # No further result page: stop paging but still export what was
        # collected so far.
        break

    # Move the pointer onto the link first, then click it.
    webdriver.ActionChains(driver).move_to_element(next_link).perform()
    webdriver.ActionChains(driver).move_to_element(next_link).click().perform()
    time.sleep(1)

# Aggregate token frequencies, most frequent first.
cursor = db.query(
    'select kw as kw,count(*) as cnt from tmp '
    'group by kw having count(*)>=1 order by count(*) desc')

rows = []
for record in cursor:
    print(record['kw'])
    print(record['cnt'])
    rows.append((record['kw'], record['cnt']))

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(rows, columns=['keywords', 'cnt'])

content = df.to_csv(index=False)
# The context manager guarantees the file is closed even if writing fails.
with codecs.open('c:/tmp/exp.csv', 'w', 'utf-8') as fw:
    fw.write(content)

# Optional export to a spreadsheet (disabled):
# df2sheet.save_sheet(df, 'May-Event', kw, startpos='A1')