# gspace_keywords.py
#
# Collects text from Google search result pages for one keyword, segments it
# with jieba, and exports per-token counts to a CSV file.
import codecs
import os
import pickle
import re
import sys
import time
from urllib.parse import quote_plus

import dataset
import jieba
import jinja2
import networkx as nx
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common import keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import df2sheet
from browser_common import JBrowser
  18. #db = dataset.connect('sqlite:///c:/tmp/gspace.db')
  19. db = dataset.connect('sqlite:///:memory:')
  20. jieba.load_userdict('C:\\src\\farmcodes\\jared\\browser\\dict.txt')
  21. jieba.load_userdict('C:\\tmp\\dict.txt')
  22. table=db['tmp']
  23. #db.query('delete from ranking')
  24. #db.query('delete from hhh_ranking')
  25. jb=JBrowser()
  26. jb.set_profile_path("Profile 7")
  27. #kw='收納櫃'
  28. #kw='收納櫃推薦'
  29. #kw='收納櫃ikea'
  30. #kw='收納櫃塑膠'
  31. #kw='收納櫃設計'
  32. #kw='收納'
  33. #kw='系統櫃'
  34. #kw='抽屜'
  35. #kw='系統櫃推薦'
  36. #kw='系統櫃價格'
  37. #kw='系統櫃廠商'
  38. #kw='系統櫃ptt'
  39. #kw='系統櫃材質'
  40. #kw='系統櫃樣式'
  41. #kw='系統櫃品牌'
  42. #kw='系統櫃衣櫃尺寸'
  43. #kw='系統櫃板材'
  44. #kw='合砌設計'
  45. #kw='富億空間'
  46. kw='空間 設計'
  47. googleurl='https://www.google.com/search?q='+kw
  48. jb.get(googleurl)
  49. for i in range(3):
  50. driver=jb.get_driver()
  51. time.sleep(3)
  52. elmts=driver.find_elements_by_xpath("//div[@class='g']//div[@class='yuRUbf']//a")
  53. idx=1
  54. ranking=-1
  55. for elmt in elmts:
  56. href=elmt.get_attribute('href')
  57. txt=elmt.text
  58. # print(href)
  59. # print(txt)
  60. try:
  61. elmt2=elmt.find_element_by_xpath("../../../div")
  62. fulldesc=elmt2.text
  63. print(fulldesc)
  64. seg_list = jieba.cut(fulldesc, cut_all=True)
  65. for s in seg_list:
  66. print(s)
  67. table.insert({'kw':s})
  68. except:
  69. print('except')
  70. idx+=1
  71. elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
  72. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  73. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  74. time.sleep(1)
  75. ## time.sleep(9999)
  76. # time.sleep(2)
  77. #from jinja2 import Environment, FileSystemLoader
  78. #THIS_DIR = os.path.dirname(os.path.abspath(__file__))
  79. #j2_env = Environment(loader=FileSystemLoader('c:/tmp'))
  80. #'http://icons-for-free.com/free-icons/png/128/1312099.png'
  81. data=[]
  82. df = pd.DataFrame(columns=('keywords','cnt'))
  83. cursor=db.query('select kw as kw,count(*) as cnt from tmp group by kw having count(*)>=1 order by count(*) desc')
  84. for c in cursor:
  85. print(c['kw'])
  86. print(c['cnt'])
  87. data.append({'kw':c['kw'],'cnt':c['cnt']})
  88. df.loc[idx]=[c['kw'],c['cnt']]
  89. idx+=1
  90. content=df.to_csv(index=False)
  91. fw=codecs.open('c:/tmp/exp.csv','w','utf-8')
  92. fw.write(content)
  93. fw.close()
  94. #df2sheet.save_sheet(df,'May-Event',kw,startpos='A1')
  95. #output=j2_env.get_template('hhh_kw.tmpl').render(data=data)
  96. #import codecs
  97. #fw=codecs.open('c:/tmp/hhh_kw.html', 'w','utf-8')
  98. #fw.write(output)
  99. #fw.close()