"""Collect keyword frequencies from Google search result pages.

The script opens the Google results for ``kw`` in a ``JBrowser`` session,
walks ``PAGES_TO_SCAN`` result pages, segments the text of every matched
result block with jieba (full mode), stores each token in an in-memory
SQLite table and finally writes the per-token counts to ``EXPORT_PATH``.
"""
import codecs
import os
import pickle
import re
import sys
import time
from urllib.parse import urlencode

import dataset
import jieba
import jinja2
import networkx as nx
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common import keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import df2sheet
from browser_common import JBrowser

PAGES_TO_SCAN = 3
RESULT_LINK_XPATH = "//div[@class='g']//div[@class='yuRUbf']//a"
RESULT_BLOCK_XPATH = "../../../div"
NEXT_PAGE_XPATH = "//a[@id='pnnext']"
EXPORT_PATH = 'c:/tmp/exp.csv'
COUNT_QUERY = (
    'select kw as kw,count(*) as cnt from tmp '
    'group by kw having count(*)>=1 order by count(*) desc'
)


def scrape_current_page(driver, table):
    """Tokenise every result block on the page currently shown by *driver*.

    For each link matched by ``RESULT_LINK_XPATH`` the enclosing block
    (``RESULT_BLOCK_XPATH`` relative to the link) is read, printed, cut with
    jieba in full mode, and one ``{'kw': token}`` row is inserted into
    *table* per token. A failure on one result is reported and skipped so
    the remaining results are still processed.
    """
    for link in driver.find_elements(By.XPATH, RESULT_LINK_XPATH):
        try:
            block = link.find_element(By.XPATH, RESULT_BLOCK_XPATH)
            fulldesc = block.text
            print(fulldesc)
            for token in jieba.cut(fulldesc, cut_all=True):
                print(token)
                table.insert({'kw': token})
        except Exception as exc:
            # Best-effort per result: report the cause instead of hiding it,
            # while still letting Ctrl-C / SystemExit propagate.
            print('except', exc)


def go_to_next_page(driver):
    """Click the "next page" link; return ``False`` when there is none."""
    next_links = driver.find_elements(By.XPATH, NEXT_PAGE_XPATH)
    if not next_links:
        return False
    webdriver.ActionChains(driver).move_to_element(next_links[0]).click().perform()
    return True


db = dataset.connect('sqlite:///:memory:')
# Both user dictionaries are loaded from absolute Windows paths.
jieba.load_userdict('C:\\src\\farmcodes\\jared\\browser\\dict.txt')
jieba.load_userdict('C:\\tmp\\dict.txt')
table = db['tmp']

jb = JBrowser()
jb.set_profile_path("Profile 7")

kw = '空間 設計'
# urlencode escapes the space and the non-ASCII characters in the keyword.
googleurl = 'https://www.google.com/search?' + urlencode({'q': kw})
jb.get(googleurl)

for _ in range(PAGES_TO_SCAN):
    driver = jb.get_driver()
    time.sleep(3)  # fixed wait before reading the page (presumably for loading)
    scrape_current_page(driver, table)
    if not go_to_next_page(driver):
        # No further result page: keep what was collected and export it.
        break
    time.sleep(1)

# Aggregate the stored tokens, most frequent first.
data = []
for record in db.query(COUNT_QUERY):
    print(record['kw'])
    print(record['cnt'])
    data.append({'kw': record['kw'], 'cnt': record['cnt']})

# Build the frame in one step; the CSV header is "keywords,cnt".
df = pd.DataFrame(
    [(row['kw'], row['cnt']) for row in data],
    columns=['keywords', 'cnt'],
)
content = df.to_csv(index=False)
with codecs.open(EXPORT_PATH, 'w', 'utf-8') as fw:
    fw.write(content)