extract_content.py 927 B

12345678910111213141516171819
  1. from newspaper import Article
  2. from chinese_keybert import Chinese_Extractor
  3. kw_extractor = Chinese_Extractor()
  4. #url='https://www.momoshop.com.tw/category/MgrpCategory.jsp?m_code=1803900396&cateLevel=2'
  5. #url='https://www.100.com.tw/article/3471'
  6. url='http://www.fingermedia.tw/?tag=%E8%91%A3%E4%BA%8B%E9%95%B7%E9%99%B3%E7%99%BE%E6%AC%BD'
  7. #url='https://www.decorations.com.tw/'
  8. #url = 'https://www.decorations.com.tw/'
  9. #url='https://dctdesign.tw/taipei-house-design-top10/'
  10. #url='https://tw.stock.yahoo.com/news/ccs-insight%E9%A0%90%E6%B8%ACaigc%E8%A2%AB%E9%81%8E%E5%BA%A6%E7%82%92%E4%BD%9C-%E6%98%8E%E5%B9%B4%E5%B0%87-%E9%99%8D%E6%BA%AB-003743296.html'
  11. #url='https://www.flexclip.com/tw/create/artificial-intelligence-video.html'
  12. article = Article(url)
  13. article.download()
  14. article.parse()
  15. txt=article.text
  16. print(txt)
  17. text=[txt]
  18. result = kw_extractor.generate_keywords(text,top_k=50,rank_methods="mmr",diversity=0.6)
  19. print(result)