|
@@ -0,0 +1,71 @@
|
|
|
+import jieba
|
|
|
+from pypinyin import pinyin, Style
|
|
|
+from config import CORRECT_TERMS, ERROR_CORRECTION
|
|
|
+
|
|
|
# Phonetic classes for pinyin initials and finals.  Sounds that share a
# symbol are treated as interchangeable when two codes are compared.
# NOTE(review): 't' is grouped with g/k/h rather than with d/n/l -- confirm
# that this is intended.
_SOUNDEX_MAP = {
    'b': '1', 'p': '1', 'm': '1', 'f': '1',
    'd': '2', 'n': '2', 'l': '2',
    'g': '3', 'k': '3', 'h': '3', 't': '3',
    'j': '4', 'q': '4', 'x': '4',
    'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
    'z': '6', 'c': '6', 's': '6',
    'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
    'ong': '9', 'un': '9', 'uan': '9',
    'i': 'A', 'u': 'A', 'v': 'A',
    'e': 'B', 'o': 'B',
}

# Candidate key lengths, longest first, so that three-letter finals such as
# 'ang' or 'ong' win over their two- and one-letter prefixes.
_SOUNDEX_KEY_LENGTHS = (3, 2, 1)


def chinese_soundex(pinyin_str):
    """Return a 4-character phonetic code for a pinyin string.

    The string is scanned left to right.  At each position the longest key
    of the phonetic table that matches is replaced by its class symbol;
    digits are read as tone marks (the last digit seen wins) and any other
    character is ignored.

    The result always has the layout ``CCCT``: three class symbols (the
    first symbol seen, followed by the remaining distinct symbols in sorted
    order, right-padded with ``'0'``) and one tone character (``'0'`` when
    the input contains no digit).  Keeping the tone in a fixed position
    means ``code[:3]`` never contains tone information.

    Args:
        pinyin_str: Pinyin text with optional tone digits, e.g. ``"zhong1"``.

    Returns:
        The 4-character code as a ``str``.
    """
    symbols = []
    tone = '0'
    i = 0
    while i < len(pinyin_str):
        for length in _SOUNDEX_KEY_LENGTHS:
            chunk = pinyin_str[i:i + length]
            if chunk in _SOUNDEX_MAP:
                symbols.append(_SOUNDEX_MAP[chunk])
                # Near the end of the string the slice may be shorter than
                # `length`; advance by what was actually consumed.
                i += len(chunk)
                break
        else:
            # No table entry starts here: either a tone digit or a
            # character that carries no phonetic class.
            if pinyin_str[i].isdigit():
                tone = pinyin_str[i]
            i += 1

    code = ''.join(symbols)
    # Keep the leading symbol in place; the rest is reduced to a sorted set
    # of distinct symbols.
    code = code[:1] + ''.join(sorted(set(code[1:])))
    # Pad the code *before* appending the tone so the tone is always the
    # fourth character, even when fewer than three symbols were found.
    return code[:3].ljust(3, '0') + tone
|
|
|
+
|
|
|
def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Tell whether two words have the same phonetic code.

    Each word is converted to pinyin (TONE3 style, neutral tone written as
    5), the first reading of every character is concatenated, and the
    result is passed to ``chinese_soundex``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        tone_sensitive: When true the complete codes must be equal; when
            false only the first three characters of each code are compared.

    Returns:
        ``True`` if the codes match under the chosen mode, else ``False``.
    """
    def _phonetic_code(word):
        # One entry per character; each entry is a list of readings, of
        # which only the first is used.
        syllables = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return chinese_soundex(''.join(readings[0] for readings in syllables))

    code_a = _phonetic_code(word1)
    code_b = _phonetic_code(word2)

    if not tone_sensitive:
        return code_a[:3] == code_b[:3]
    return code_a == code_b
|
|
|
+
|
|
|
def fuzzy_correct_chinese(text):
    """Replace mis-written words in *text* with their known correct form.

    The text is segmented with ``jieba.lcut`` and each token is handled as
    follows:

    1. A purely ASCII alphabetic token (e.g. an English word) is kept as is.
    2. A token that is a key of ``ERROR_CORRECTION`` is replaced by the
       mapped value.
    3. Otherwise the token is replaced by the first entry of
       ``CORRECT_TERMS`` that ``compare_chinese_words`` reports as a
       tone-sensitive match; if there is none the token is kept unchanged.

    Args:
        text: The input string to correct.

    Returns:
        The corrected tokens joined back into a single string.
    """
    corrected_words = []
    for word in jieba.lcut(text):
        # str.isalpha() is also True for CJK characters, so the ASCII check
        # is required: without it every Chinese word would be skipped and
        # never corrected.
        if word.isascii() and word.isalpha():
            corrected_words.append(word)
            continue

        # Exact, hand-maintained corrections take priority over fuzzy ones.
        if word in ERROR_CORRECTION:
            corrected_words.append(ERROR_CORRECTION[word])
            continue

        for term in CORRECT_TERMS:
            if compare_chinese_words(word, term, tone_sensitive=True):
                corrected_words.append(term)
                break
        else:
            # No term sounds like this token; leave it untouched.
            corrected_words.append(word)

    return ''.join(corrected_words)
|