123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
import codecs
import os
import pickle
import re
import sys
import time

import dataset
import jieba
import jinja2
import networkx as nx
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common import keys
from selenium.webdriver.common.keys import Keys

import df2sheet
from browser_common import JBrowser
# Script: open a Google search for one keyword, walk up to MAX_PAGES result
# pages, segment the text found next to every result link with jieba, and
# write a token-frequency table to CSV_PATH.

# Number of result pages to visit.
MAX_PAGES = 3
# Seconds to wait before reading a result page.
PAGE_LOAD_WAIT = 3
# Seconds to wait after clicking the "next page" link.
NEXT_CLICK_WAIT = 1
# Where the keyword/count report is written.
CSV_PATH = 'c:/tmp/exp.csv'

# Tokens are collected in an in-memory SQLite database through `dataset`;
# switch to the file-backed URL below to keep them between runs.
#db = dataset.connect('sqlite:///c:/tmp/gspace.db')
db = dataset.connect('sqlite:///:memory:')
jieba.load_userdict('C:\\src\\farmcodes\\jared\\browser\\dict.txt')
jieba.load_userdict('C:\\tmp\\dict.txt')
table = db['tmp']
#db.query('delete from ranking')
#db.query('delete from hhh_ranking')

jb = JBrowser()
jb.set_profile_path("Profile 7")

# Search keyword. Previously used candidates are kept for reference.
#kw='收納櫃'
#kw='收納櫃推薦'
#kw='收納櫃ikea'
#kw='收納櫃塑膠'
#kw='收納櫃設計'
#kw='收納'
#kw='系統櫃'
#kw='抽屜'
#kw='系統櫃推薦'
#kw='系統櫃價格'
#kw='系統櫃廠商'
#kw='系統櫃ptt'
#kw='系統櫃材質'
#kw='系統櫃樣式'
#kw='系統櫃品牌'
#kw='系統櫃衣櫃尺寸'
#kw='系統櫃板材'
#kw='合砌設計'
#kw='富億空間'
kw = '空間 設計'
googleurl = 'https://www.google.com/search?q=' + kw
jb.get(googleurl)

for _page in range(MAX_PAGES):
    driver = jb.get_driver()
    time.sleep(PAGE_LOAD_WAIT)

    result_links = driver.find_elements_by_xpath(
        "//div[@class='g']//div[@class='yuRUbf']//a"
    )
    for link in result_links:
        # Best effort per result: one failing entry must not stop the run.
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        try:
            # First <div> under the link's great-grandparent; presumably the
            # container holding the result's title and snippet -- TODO confirm
            # against the current Google markup.
            container = link.find_element_by_xpath("../../../div")
            fulldesc = container.text
            print(fulldesc)
            # cut_all=True: jieba "full mode" segmentation.
            for token in jieba.cut(fulldesc, cut_all=True):
                print(token)
                table.insert({'kw': token})
        except Exception as exc:
            print('except', repr(exc))

    # Go to the next result page; stop cleanly when there is none instead of
    # aborting the script before the report is written.
    try:
        next_link = driver.find_element_by_xpath("//a[@id='pnnext']")
    except NoSuchElementException:
        print('no next page link; stopping pagination')
        break
    # Hover over the link first, then move and click in a second chain.
    webdriver.ActionChains(driver).move_to_element(next_link).perform()
    webdriver.ActionChains(driver).move_to_element(next_link).click().perform()
    time.sleep(NEXT_CLICK_WAIT)

## time.sleep(9999)
# time.sleep(2)
#from jinja2 import Environment, FileSystemLoader
#THIS_DIR = os.path.dirname(os.path.abspath(__file__))
#j2_env = Environment(loader=FileSystemLoader('c:/tmp'))
#'http://icons-for-free.com/free-icons/png/128/1312099.png'

# Count how often each token was seen, most frequent first.
# NOTE(review): if no token was inserted above, the 'tmp' table may not exist
# yet and this query may fail -- confirm against the installed dataset version.
cursor = db.query('select kw as kw,count(*) as cnt from tmp group by kw having count(*)>=1 order by count(*) desc')

# `data` is the list-of-dicts form consumed by the (currently disabled)
# template rendering at the bottom of the script.
data = []
for c in cursor:
    print(c['kw'])
    print(c['cnt'])
    data.append({'kw': c['kw'], 'cnt': c['cnt']})

# Build the frame in one go rather than growing it row by row.
df = pd.DataFrame(
    [(row['kw'], row['cnt']) for row in data],
    columns=['keywords', 'cnt'],
)

content = df.to_csv(index=False)
# Context manager closes the file even if the write fails.
with codecs.open(CSV_PATH, 'w', 'utf-8') as fw:
    fw.write(content)

#df2sheet.save_sheet(df,'May-Event',kw,startpos='A1')
#output=j2_env.get_template('hhh_kw.tmpl').render(data=data)
#import codecs
#fw=codecs.open('c:/tmp/hhh_kw.html', 'w','utf-8')
#fw.write(output)
#fw.close()
|