Jared пре 2 година
родитељ
комит
ef1be0bed4
1 измењен фајл са 5 додато и 4 уклоњено
  1. 5 4
      choozmo/fetch_content.py

+ 5 - 4
choozmo/fetch_content.py

@@ -5,14 +5,15 @@ import html2text
 import jieba
 import dataset
 
+
 jieba.load_userdict("c:/tmp/userdict.txt")
-stopwords=', 的/-。*.|)(][_!、」「::jpgmenu有了也gif%stylespnghttpsimagesicogovRSSscript'
+stopwords=', 的/-。*.|)(][_!、」「::|)』『(xmlimgursvgbase64jpgmenuMenu有了也gif%stylespnghttpsimagesicogovRSSscript'
 db = dataset.connect('sqlite:///c:/tmp/jieba.db')
 db.query('delete from tmp')
 #db.query('drop table tmp')
 
-urls=['https://www.nightnight.tw/%E5%BA%8A%E5%A2%8A%E6%8E%A8%E8%96%A6/']
-
+#urls=['https://www.dcard.tw/f/house/p/232318765?cid=BBDFB720-BAE4-406E-8449-D2F12EA11241']
+urls=['https://www.gold-kirin.com.tw/about']
 
 #db = dataset.connect('sqlite:///:memory:')
 table=db['tmp']
@@ -36,7 +37,7 @@ for url in urls:
             table.insert({'word':word})
 
 
-cursor=db.query('select word,count(word) as cnt from tmp group by word having count(word) >2 order by count(word) desc')
+cursor=db.query('select word,count(word) as cnt from tmp group by word having ( count(word) >1 or length(word)>2) order by count(word) desc')
 for c in cursor:
     print(c['word'])
     print(c['cnt'])