|
@@ -5,14 +5,15 @@ import html2text
|
|
|
import jieba
|
|
|
import dataset
|
|
|
|
|
|
+
|
|
|
jieba.load_userdict("c:/tmp/userdict.txt")
|
|
|
-stopwords=', 的/-。*.|)(][_!、」「::jpgmenu有了也gif%stylespnghttpsimagesicogovRSSscript'
|
|
|
+stopwords=', 的/-。*.|)(][_!、」「::|)』『(xmlimgursvgbase64jpgmenuMenu有了也gif%stylespnghttpsimagesicogovRSSscript'
|
|
|
db = dataset.connect('sqlite:///c:/tmp/jieba.db')
|
|
|
db.query('delete from tmp')
|
|
|
#db.query('drop table tmp')
|
|
|
|
|
|
-urls=['https://www.nightnight.tw/%E5%BA%8A%E5%A2%8A%E6%8E%A8%E8%96%A6/']
|
|
|
-
|
|
|
+#urls=['https://www.dcard.tw/f/house/p/232318765?cid=BBDFB720-BAE4-406E-8449-D2F12EA11241']
|
|
|
+urls=['https://www.gold-kirin.com.tw/about']
|
|
|
|
|
|
#db = dataset.connect('sqlite:///:memory:')
|
|
|
table=db['tmp']
|
|
@@ -36,7 +37,7 @@ for url in urls:
|
|
|
table.insert({'word':word})
|
|
|
|
|
|
|
|
|
-cursor=db.query('select word,count(word) as cnt from tmp group by word having count(word) >2 order by count(word) desc')
|
|
|
+cursor=db.query('select word,count(word) as cnt from tmp group by word having ( count(word) >1 or length(word)>2) order by count(word) desc')
|
|
|
for c in cursor:
|
|
|
print(c['word'])
|
|
|
print(c['cnt'])
|