# ming / Podcast
import operator
from typing import List, Tuple, Optional
import os
import jieba
import jieba.posseg as pseg
from .word import Word
from .utils import notNumStr

class Rake:
    """Rank candidate keyword phrases found in a text.

    The text is tokenised with ``jieba.posseg``; stop words, delimiter words,
    numeric strings, newlines and tokens carrying certain part-of-speech flags
    act as phrase boundaries.  The runs of words between boundaries are the
    candidate phrases, which are scored by summing the per-word scores
    reported by ``Word.returnScore()``.

    The instance must be given a stop-word list and a delimiter-word list via
    ``initializeFromPath`` or ``initializeFromList`` before
    ``extractKeywordFromPath`` will do any work.
    """

    # Part-of-speech flags (as yielded by pseg.cut) treated as phrase boundaries.
    _BOUNDARY_POS_FLAGS = frozenset(('m', 'x', 'uj', 'ul', 'mq', 'u', 'v', 'f'))
    # Phrases with more words than this are never reported.
    _MAX_PHRASE_WORDS = 5
    # A phrase is dropped only when it is rare both relative to the number of
    # meaningful words in the text and in absolute terms.
    _MIN_RELATIVE_FREQ = 0.01
    _MIN_ABSOLUTE_FREQ = 3
    # Joins the words of a phrase into its lookup key.
    _KEY_SEPARATOR = '|'

    def __init__(self):
        # True only once non-empty stop-word and delimiter lists are installed.
        self.initialized = False
        self.stopWordList = list()
        self.delimWordList = list()

    @staticmethod
    def _readLines(path: str) -> List[str]:
        """Return the lines of the UTF-8 file at *path* without trailing newlines."""
        with open(path, 'r', encoding="utf-8") as fp:
            return [line.rstrip('\n') for line in fp]

    def initializeFromPath(self, stopwordPath: str = "", delimWordPath: str = ""):
        """Load the stop-word and delimiter-word lists from two text files.

        Each file holds one entry per line.  If either path does not exist a
        message is printed and the instance is left untouched.

        Args:
            stopwordPath: Path of the stop-word file.
            delimWordPath: Path of the delimiter-word file.
        """
        if not os.path.exists(stopwordPath):
            print("Stop Word Path invalid")
            return

        if not os.path.exists(delimWordPath):
            print("Delim Word Path Invalid")
            return

        # Read the delimiter list from the path the caller supplied, not from
        # a fixed location relative to the working directory.
        self.initializeFromList(
            self._readLines(stopwordPath),
            self._readLines(delimWordPath),
        )

    def initializeFromList(self, swList: Optional[List] = None, dwList: Optional[List] = None):
        """Install the stop-word and delimiter-word lists.

        Args:
            swList: Stop words; ``None`` is treated as an empty list.
            dwList: Delimiter words; ``None`` is treated as an empty list.

        The instance counts as initialized only when both lists are
        non-empty; otherwise a message is printed and ``initialized`` is
        False.
        """
        self.stopWordList = [] if swList is None else swList
        self.delimWordList = [] if dwList is None else dwList

        # Keep the flag in step with the lists, also when re-initialising.
        self.initialized = bool(self.stopWordList) and bool(self.delimWordList)
        if not self.initialized:
            print("Empty Stop word list or deliminator word list, uninitialized")

    def extractKeywordFromPath(self, text: str, num_kw: int = 10):
        """Extract keywords from the UTF-8 text file located at *text*.

        Args:
            text: Path of the file to read (not the text itself).
            num_kw: Maximum number of keywords to return.

        Returns:
            The result of ``extractKeywordFromString`` on the file content, or
            ``None`` (after printing a message) if the instance has not been
            initialized.
        """
        if not self.initialized:
            print("Not initialized")
            return

        with open(text, 'r', encoding="utf-8") as fp:
            content = fp.read()
        return self.extractKeywordFromString(content, num_kw=num_kw)

    def extractKeywordFromString(self, text: str, num_kw: int = 10) -> List[Tuple[str, float]]:
        """Return the highest scoring keyword phrases of *text*.

        Args:
            text: The text to analyse.
            num_kw: Maximum number of keywords to return.

        Returns:
            Up to *num_kw* ``(phrase, score)`` pairs sorted by descending
            score, where ``phrase`` is the words of the phrase concatenated
            without a separator.  Empty when the text has no meaningful word.
        """
        # Membership is tested once per token, so use sets for the lookups.
        stopWords = set(self.stopWordList)
        delimWords = set(self.delimWordList)

        # Split the token stream into phrases at every boundary token and
        # register one Word per distinct meaningful token.
        wordTable = dict()
        phrases = []
        currentPhrase = []
        meaningfulCount = 0
        for eachWord, flag in pseg.cut(text):
            isBoundary = (
                eachWord in delimWords
                or not notNumStr(eachWord)
                or eachWord in stopWords
                or flag in self._BOUNDARY_POS_FLAGS
                or eachWord == '\n'
            )
            if isBoundary:
                # Consecutive or leading boundaries must not yield empty phrases.
                if currentPhrase:
                    phrases.append(currentPhrase)
                    currentPhrase = []
                continue
            currentPhrase.append(eachWord)
            meaningfulCount += 1
            if eachWord not in wordTable:
                wordTable[eachWord] = Word(eachWord)
        # Flush the last phrase when the text does not end on a boundary.
        if currentPhrase:
            phrases.append(currentPhrase)

        # Nothing to rank; also protects the relative-frequency division below.
        if meaningfulCount == 0:
            return []

        # Update per-word statistics and count every occurrence of each phrase.
        for phrase in phrases:
            phraseKey = self._KEY_SEPARATOR.join(phrase)
            if phraseKey not in wordTable:
                wordTable[phraseKey] = Word(phraseKey)
            for eachWord in phrase:
                wordTable[eachWord].updateOccur(len(phrase))
            wordTable[phraseKey].updateFreq()

        # Score every phrase that is short enough and frequent enough.
        scores = dict()
        for phrase in phrases:
            if len(phrase) > self._MAX_PHRASE_WORDS:
                continue
            freq = wordTable[self._KEY_SEPARATOR.join(phrase)].getFreq()
            if (freq / meaningfulCount < self._MIN_RELATIVE_FREQ
                    and freq < self._MIN_ABSOLUTE_FREQ):
                continue
            scores[''.join(phrase)] = sum(
                wordTable[eachWord].returnScore() for eachWord in phrase
            )

        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        return ranked[:num_kw]