"""Count word frequencies on web pages using jieba segmentation.

For every URL in ``urls`` the page is downloaded, converted from HTML to
plain text with html2text, segmented with jieba, filtered against
``stopwords`` and stored one word per row in the ``tmp`` table of a SQLite
database.  Finally every word that occurs more than once, or is longer than
two characters, is printed together with its count, most frequent first.
"""

import sys

from bs4 import BeautifulSoup
import requests
import html2text
import jieba
import dataset


# Custom jieba dictionary loaded before segmentation.
USERDICT_PATH = "c:/tmp/userdict.txt"

# Database that holds the per-run word table ``tmp``.
DB_URL = 'sqlite:///c:/tmp/jieba.db'
# Alternative kept from the original script: 'sqlite:///:memory:'

# Seconds to wait for a server before giving up on a URL.
REQUEST_TIMEOUT = 30

# NOTE: this is a single string, so ``word in stopwords`` is a *substring*
# test: a token is dropped when it appears anywhere inside this string
# (e.g. "xml", "img", "https", but also fragments such as "age").
stopwords=', 的/-。*.|)(][_!、」「::|)』『(xmlimgursvgbase64jpgmenuMenu有了也gif%stylespnghttpsimagesicogovRSSscript'

# Pages to analyse.  Other pages that were commented out in the original
# script:
#   https://www.dcard.tw/f/house/p/232318765?cid=BBDFB720-BAE4-406E-8449-D2F12EA11241
#   https://www.gold-kirin.com.tw/about
#   https://casino543.com/2021%E5%B9%B4%E5%8D%81%E5%A4%A7%E7%B7%9A%E4%B8%8A%E5%A8%9B%E6%A8%82%E5%9F%8E%E6%8E%92%E5%90%8D%E6%8E%A8%E8%96%A6-%E5%A8%9B%E6%A8%82%E5%9F%8E%E5%89%8D100%E5%90%8D%E5%A8%9B%E6%A8%82%E5%9F%8E%E9%82%84/
#   https://mort.moi.gov.tw/frontsite/cms/newsAction.do?method=viewContentDetail&iscancel=true&contentId=MjU3NA==
#   https://www.memory.com.tw/funeral_ceremony-in.php?i=5&c=3
urls=['https://www.xingcai-beauty.com/service.html']


def fetch_html(url):
    """Download *url* and return its body as text.

    The body is decoded as UTF-8 first (what the script always assumed).
    If that fails, the encoding detected by requests is used with
    replacement characters, so a non-UTF-8 page no longer aborts the run.

    Raises:
        requests.RequestException: on connection errors, timeouts or an
            HTTP error status.
    """
    resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    raw = resp.content
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        pass
    try:
        return raw.decode(resp.apparent_encoding or 'utf-8', errors='replace')
    except LookupError:
        # The detected codec name is unknown to Python; degrade gracefully.
        return raw.decode('utf-8', errors='replace')


def extract_words(html):
    """Return the list of non-stopword tokens found in the *html* text.

    Links are ignored while converting to plain text.  Tokens consisting
    only of whitespace (e.g. the newlines html2text emits) are dropped in
    addition to everything matched by ``stopwords``.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    text = converter.handle(html)
    return [
        word
        for word in jieba.cut(text, cut_all=False)
        if word.strip() and word not in stopwords
    ]


def main():
    """Rebuild the ``tmp`` word table from ``urls`` and print the counts."""
    jieba.load_userdict(USERDICT_PATH)

    db = dataset.connect(DB_URL)
    table = db['tmp']
    # Clear the previous run.  Unlike a raw 'delete from tmp' this is a
    # no-op when the table does not exist yet (first run).
    table.delete()

    stored = 0
    for url in urls:
        try:
            html = fetch_html(url)
        except requests.RequestException as err:
            print("skipping {}: {}".format(url, err), file=sys.stderr)
            continue
        rows = [{'word': word} for word in extract_words(html)]
        if rows:
            # One batched insert instead of one statement per token.
            table.insert_many(rows)
            stored += len(rows)

    if not stored:
        # Nothing was inserted, so the table/column may not even exist.
        return

    cursor = db.query('select word,count(word) as cnt from tmp group by word having ( count(word) >1 or length(word)>2) order by count(word) desc')
    for row in cursor:
        print(row['word'])
        print(row['cnt'])


if __name__ == "__main__":
    main()