import jieba
from pypinyin import pinyin, Style

from api.openai_scripts_chinese.config import CORRECT_TERMS, ERROR_CORRECTION

# Pinyin fragments grouped into phonetic classes. Fragments that share a
# symbol are treated as interchangeable by the soundex code below.
_SOUNDEX_MAP = {
    'b': '1', 'p': '1', 'm': '1', 'f': '1',
    'd': '2', 'n': '2', 'l': '2',
    'g': '3', 'k': '3', 'h': '3', 't': '3',
    'j': '4', 'q': '4', 'x': '4',
    'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
    'z': '6', 'c': '6', 's': '6',
    'an': '7', 'ang': '7',
    'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
    'ong': '9', 'un': '9', 'uan': '9',
    'i': 'A', 'u': 'A', 'v': 'A',
    'e': 'B', 'o': 'B',
}

# Longest fragment in the table; the scanner tries fragments of this length
# first so that e.g. 'ang' is not consumed as 'an' followed by 'g'.
_MAX_FRAGMENT_LEN = max(len(fragment) for fragment in _SOUNDEX_MAP)

# Code point ranges treated as Chinese characters: CJK Unified Ideographs
# Extension A and the basic CJK Unified Ideographs block.
_CHINESE_RANGES = (('\u3400', '\u4dbf'), ('\u4e00', '\u9fff'))


def chinese_soundex(pinyin_str):
    """Return a 4-character phonetic code for a tone-numbered pinyin string.

    The string is scanned left to right. At each position the longest
    fragment found in ``_SOUNDEX_MAP`` is replaced by its class symbol; a
    digit is remembered as the tone (the last digit seen wins); any other
    character is ignored.

    The symbols after the first one are de-duplicated and sorted, so their
    original order and multiplicity do not influence the code.

    Args:
        pinyin_str: Pinyin text with tone digits, e.g. ``'zhang1'``.

    Returns:
        A string of exactly four characters: three phonetic symbols
        (right-padded with ``'0'`` when fewer were produced) followed by the
        tone digit, or ``'0'`` when no digit was present.
    """
    symbols = []
    tone = '0'
    position = 0
    length = len(pinyin_str)
    while position < length:
        # Longest match first, so the three-letter finals in the table are
        # actually reachable.
        for size in range(_MAX_FRAGMENT_LEN, 0, -1):
            fragment = pinyin_str[position:position + size]
            if fragment in _SOUNDEX_MAP:
                symbols.append(_SOUNDEX_MAP[fragment])
                position += len(fragment)
                break
        else:
            current = pinyin_str[position]
            if current.isdigit():
                tone = current
            position += 1

    code = ''.join(symbols)
    code = code[:1] + ''.join(sorted(set(code[1:])))
    # Pad the phonetic part BEFORE appending the tone so the tone always sits
    # at index 3 and never leaks into the first three characters.
    return code[:3].ljust(3, '0') + tone


def _phonetic_code(word):
    """Convert *word* to tone-numbered pinyin and return its soundex code."""
    syllables = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
    return chinese_soundex(''.join(syllable[0] for syllable in syllables))


def _contains_chinese(word):
    """Return True if *word* has a character inside ``_CHINESE_RANGES``."""
    return any(
        low <= char <= high
        for char in word
        for low, high in _CHINESE_RANGES
    )


def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Report whether two words share the same phonetic code.

    Args:
        word1: First word.
        word2: Second word.
        tone_sensitive: When True the full 4-character codes must match,
            including the tone digit. When False only the three phonetic
            symbols are compared.

    Returns:
        True if the codes match under the selected mode, otherwise False.
    """
    code1 = _phonetic_code(word1)
    code2 = _phonetic_code(word2)
    if tone_sensitive:
        return code1 == code2
    return code1[:3] == code2[:3]


def fuzzy_correct_chinese(text):
    """Replace mis-recognised words in *text* with known-correct terms.

    The text is segmented with ``jieba.lcut`` and each token is handled as
    follows:

    1. Alphabetic tokens that contain no Chinese character are kept as-is.
    2. Tokens that are keys of ``ERROR_CORRECTION`` are replaced by the
       mapped value.
    3. Otherwise the token is replaced by the first entry of
       ``CORRECT_TERMS`` whose tone-sensitive phonetic code equals the
       token's code; when none matches the token is kept.

    Args:
        text: Input text to correct.

    Returns:
        The corrected tokens joined back into a single string.
    """
    # Term codes do not depend on the token, so compute them once per call.
    term_codes = [(term, _phonetic_code(term)) for term in CORRECT_TERMS]

    corrected_words = []
    for word in jieba.lcut(text):
        # str.isalpha() is True for Chinese characters as well, so on its own
        # it would skip every Chinese token; only skip non-Chinese words.
        if word.isalpha() and not _contains_chinese(word):
            corrected_words.append(word)
            continue

        if word in ERROR_CORRECTION:
            corrected_words.append(ERROR_CORRECTION[word])
            continue

        word_code = _phonetic_code(word)
        replacement = next(
            (term for term, code in term_codes if code == word_code),
            word,
        )
        corrected_words.append(replacement)

    return ''.join(corrected_words)