import operator
import os
from typing import List, Optional

import jieba
import jieba.posseg as pseg

from .word import Word
from .utils import notNumStr


class Rake:
    def __init__(self):
        # Becomes True only after both word lists have been loaded.
        self.initialized = False
        self.stopWordList = list()
        self.delimWordList = list()

    def initializeFromPath(self, stopwordPath: str = "", delimWordPath: str = ""):
        # Load stop words and delimiter words from two UTF-8 files,
        # one entry per line, then hand them to initializeFromList().
        if not os.path.exists(stopwordPath):
            print("Stop word path invalid")
            return
        if not os.path.exists(delimWordPath):
            print("Delim word path invalid")
            return
        with open(stopwordPath, 'r', encoding="utf-8") as fp:
            swLibList = [line.rstrip('\n') for line in fp]
        with open(delimWordPath, 'r', encoding="utf-8") as fp:
            conjLibList = [line.rstrip('\n') for line in fp]
        self.initializeFromList(swLibList, conjLibList)

    def initializeFromList(self, swList: Optional[List[str]] = None, dwList: Optional[List[str]] = None):
        # Accept in-memory word lists; both must be non-empty before the
        # extractor can be used.
        self.stopWordList = swList or []
        self.delimWordList = dwList or []

        if len(self.stopWordList) == 0 or len(self.delimWordList) == 0:
            print("Empty stop word list or delimiter word list, uninitialized")
            return
        self.initialized = True

    def extractKeywordFromPath(self, text: str, num_kw: int = 10):
        # Here `text` is the path of a UTF-8 text file to read and analyze.
        if not self.initialized:
            print("Not initialized")
            return
        with open(text, 'r', encoding="utf-8") as fp:
            text = fp.read()
        return self.extractKeywordFromString(text, num_kw=num_kw)

    def extractKeywordFromString(self, text: str, num_kw: int = 10):
        rawtextList = pseg.cut(text)

        # Walk the POS-tagged tokens and mark phrase boundaries with '|':
        # delimiter words, stop words, numbers, newlines and the POS tags
        # below all end the current candidate phrase.
        textList = []
        listofSingleWord = dict()
        lastWord = ''
        poSPrty = ['m', 'x', 'uj', 'ul', 'mq', 'u', 'v', 'f']
        meaningfulCount = 0
        for eachWord, flag in rawtextList:
            if (eachWord in self.delimWordList or not notNumStr(eachWord)
                    or eachWord in self.stopWordList or flag in poSPrty or eachWord == '\n'):
                if lastWord != '|':
                    textList.append("|")
                    lastWord = "|"
            elif eachWord not in self.stopWordList and eachWord != '\n':
                textList.append(eachWord)
                meaningfulCount += 1
                if eachWord not in listofSingleWord:
                    listofSingleWord[eachWord] = Word(eachWord)
                lastWord = ''
        # Terminate the final phrase so it is not dropped below.
        if lastWord != '|':
            textList.append('|')

        # Group the tokens between boundaries into candidate phrases.
        newList = []
        tempList = []
        for everyWord in textList:
            if everyWord != '|':
                tempList.append(everyWord)
            else:
                if tempList:
                    newList.append(tempList)
                tempList = []

        # Register every multi-word phrase (members joined by '|') as a Word entry.
        tempStr = ''
        for everyWord in textList:
            if everyWord != '|':
                tempStr += everyWord + '|'
            elif tempStr:
                if tempStr[:-1] not in listofSingleWord:
                    listofSingleWord[tempStr[:-1]] = Word(tempStr[:-1])
                tempStr = ''

        # For each phrase, update every member word's occurrence statistics with
        # the phrase length, and track how often the full phrase itself occurs.
        for everyPhrase in newList:
            res = ''
            for everyWord in everyPhrase:
                listofSingleWord[everyWord].updateOccur(len(everyPhrase))
                res += everyWord + '|'
            phraseKey = res[:-1]
            if phraseKey not in listofSingleWord:
                listofSingleWord[phraseKey] = Word(phraseKey)
            else:
                listofSingleWord[phraseKey].updateFreq()

        # Score each phrase as the sum of its member words' scores, skipping
        # phrases that are overly long or too rare.
        outputList = dict()
        for everyPhrase in newList:
            if len(everyPhrase) > 5:
                continue
            score = 0
            phraseString = ''
            outStr = ''
            for everyWord in everyPhrase:
                score += listofSingleWord[everyWord].returnScore()
                phraseString += everyWord + '|'
                outStr += everyWord
            phraseKey = phraseString[:-1]
            freq = listofSingleWord[phraseKey].getFreq()
            if freq / meaningfulCount < 0.01 and freq < 3:
                continue
            outputList[outStr] = score

        # Highest-scoring phrases first.
        sorted_list = sorted(outputList.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_list[:num_kw]
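

# A minimal usage sketch (not part of the library): the lexicon paths below are
# placeholders for your own one-entry-per-line UTF-8 stop word and delimiter
# word files. Because of the relative imports above, run this as a module,
# e.g. `python -m <package>.rake`.
if __name__ == "__main__":
    rake = Rake()
    rake.initializeFromPath(stopwordPath="path/to/stopwords.txt",
                            delimWordPath="path/to/delimiters.txt")
    if rake.initialized:
        # Placeholder input; any Chinese text works here.
        sample = "自然语言处理是人工智能领域的一个重要研究方向。"
        for phrase, score in rake.extractKeywordFromString(sample, num_kw=5):
            print(phrase, score)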