# Rake.py (4.5 KB)
import operator
import os
from typing import List, Tuple, Optional

import jieba
import jieba.posseg as pseg

from .word import Word
from .utils import notNumStr
  8. class Rake:
  9. def __init__(self): # , stopwordPath: str = None, delimWordPath: str = None):
  10. # If both Found and Initialized
  11. self.initialized = False
  12. self.stopWordList = list()
  13. self.delimWordList = list()
  14. def initializeFromPath(self, stopwordPath: str = "", delimWordPath: str = ""):
  15. if not os.path.exists(stopwordPath):
  16. print("Stop Word Path invalid")
  17. return
  18. if not os.path.exists(delimWordPath):
  19. print("Delim Word Path Invalid")
  20. return
  21. swLibList = [line.rstrip('\n') for line in open(stopwordPath,'r',encoding="utf-8")]
  22. conjLibList = [line.rstrip('\n') for line in open("rakedata/stoplist/中文分隔词词库.txt",'r',encoding="utf-8")]
  23. self.initializeFromList(swLibList, conjLibList)
  24. return
  25. def initializeFromList(self, swList : List = None, dwList : List = None):
  26. self.stopWordList = swList
  27. self.delimWordList = dwList
  28. if len(self.stopWordList) == 0 or len(self.delimWordList) == 0:
  29. print("Empty Stop word list or deliminator word list, uninitialized")
  30. return
  31. else:
  32. self.initialized = True
  33. def extractKeywordFromPath(self, text : str, num_kw : int = 10):
  34. if not self.initialized:
  35. print("Not initialized")
  36. return
  37. with open(text,'r',encoding="utf-8") as fp:
  38. text = fp.read()
  39. return self.extractKeywordFromString(text, num_kw = num_kw)
  40. def extractKeywordFromString(self, text : str, num_kw : int = 10):
  41. rawtextList = pseg.cut(text)
  42. # Construct List of Phrases and Preliminary textList
  43. textList = []
  44. listofSingleWord = dict()
  45. lastWord = ''
  46. poSPrty = ['m','x','uj','ul','mq','u','v','f']
  47. meaningfulCount = 0
  48. checklist = []
  49. for eachWord, flag in rawtextList:
  50. checklist.append([eachWord,flag])
  51. if eachWord in self.delimWordList or not notNumStr(eachWord) or eachWord in self.stopWordList or flag in poSPrty or eachWord == '\n':
  52. if lastWord != '|':
  53. textList.append("|")
  54. lastWord = "|"
  55. elif eachWord not in self.stopWordList and eachWord != '\n':
  56. textList.append(eachWord)
  57. meaningfulCount += 1
  58. if eachWord not in listofSingleWord:
  59. listofSingleWord[eachWord] = Word(eachWord)
  60. lastWord = ''
  61. # Construct List of list that has phrases as wrds
  62. newList = []
  63. tempList = []
  64. for everyWord in textList:
  65. if everyWord != '|':
  66. tempList.append(everyWord)
  67. else:
  68. newList.append(tempList)
  69. tempList = []
  70. tempStr = ''
  71. for everyWord in textList:
  72. if everyWord != '|':
  73. tempStr += everyWord + '|'
  74. else:
  75. if tempStr[:-1] not in listofSingleWord:
  76. listofSingleWord[tempStr[:-1]] = Word(tempStr[:-1])
  77. tempStr = ''
  78. # Update the entire List
  79. for everyPhrase in newList:
  80. res = ''
  81. for everyWord in everyPhrase:
  82. listofSingleWord[everyWord].updateOccur(len(everyPhrase))
  83. res += everyWord + '|'
  84. phraseKey = res[:-1]
  85. if phraseKey not in listofSingleWord:
  86. listofSingleWord[phraseKey] = Word(phraseKey)
  87. else:
  88. listofSingleWord[phraseKey].updateFreq()
  89. # Get score for entire Set
  90. outputList = dict()
  91. for everyPhrase in newList:
  92. if len(everyPhrase) > 5:
  93. continue
  94. score = 0
  95. phraseString = ''
  96. outStr = ''
  97. for everyWord in everyPhrase:
  98. score += listofSingleWord[everyWord].returnScore()
  99. phraseString += everyWord + '|'
  100. outStr += everyWord
  101. phraseKey = phraseString[:-1]
  102. freq = listofSingleWord[phraseKey].getFreq()
  103. if freq / meaningfulCount < 0.01 and freq < 3 :
  104. continue
  105. outputList[outStr] = score
  106. sorted_list = sorted(outputList.items(), key = operator.itemgetter(1), reverse = True)
  107. return sorted_list[:num_kw]