ling
/
innolux


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
							import jieba
from pypinyin import pinyin, Style
from api.openai_scripts_tai_gi.config import CORRECT_TERMS, ERROR_CORRECTION

# Phonetic classes for pinyin initials and finals: sounds that share a symbol
# are treated as interchangeable by chinese_soundex().
_SOUNDEX_MAP = {
    'b': '1', 'p': '1', 'm': '1', 'f': '1',
    'd': '2', 'n': '2', 'l': '2',
    'g': '3', 'k': '3', 'h': '3', 't': '3',
    'j': '4', 'q': '4', 'x': '4',
    'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
    'z': '6', 'c': '6', 's': '6',
    'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
    'ong': '9', 'un': '9', 'uan': '9',
    'i': 'A', 'u': 'A', 'v': 'A',
    'e': 'B', 'o': 'B',
}
# Longest key in the table; bounds the look-ahead of the scanner below.
_SOUNDEX_MAX_KEY_LEN = max(map(len, _SOUNDEX_MAP))


def chinese_soundex(pinyin_str):
    """Return a 4-character phonetic code for a pinyin string.

    The string is scanned left to right. At each position the longest key of
    the sound table that matches is consumed and its class symbol recorded;
    a digit is remembered as the tone (the last digit seen wins); any other
    character is ignored. The first symbol is kept in place, the remaining
    symbols are de-duplicated and sorted, and the result is cut to three
    symbols.

    Args:
        pinyin_str: Pinyin text, optionally with tone digits (e.g. ``"xing1"``).

    Returns:
        A string of exactly four characters: three code characters
        (right-padded with ``'0'``) followed by the tone character, which is
        ``'0'`` when the input contains no digit.

    Examples:
        >>> chinese_soundex('xing1')
        '4801'
        >>> chinese_soundex('xin1')
        '4801'
        >>> chinese_soundex('')
        '0000'
    """
    symbols = []
    tone = '0'
    pos = 0
    length = len(pinyin_str)
    while pos < length:
        # Try the longest candidate first so that three-letter finals such as
        # "ang"/"ing"/"ong" are matched as a unit instead of being split into
        # "an" + a stray "g" that would be coded as an initial.
        # NOTE: "uang" is not in the table, so it still splits as "uan" + "g".
        for width in range(_SOUNDEX_MAX_KEY_LEN, 0, -1):
            chunk = pinyin_str[pos:pos + width]
            if chunk in _SOUNDEX_MAP:
                symbols.append(_SOUNDEX_MAP[chunk])
                pos += len(chunk)
                break
        else:
            if pinyin_str[pos].isdigit():
                tone = pinyin_str[pos]
            pos += 1

    code = ''.join(symbols)
    # Keep the leading symbol; the rest only record which classes occur.
    code = code[:1] + ''.join(sorted(set(code[1:])))
    # Pad the code *before* appending the tone so the tone always sits at
    # index 3 and the first three characters never contain it.
    return code[:3].ljust(3, '0') + tone

def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Tell whether two words produce the same phonetic code.

    Each word is converted to pinyin with trailing tone digits (neutral tone
    written as 5), the syllables are concatenated, and the result is encoded
    with ``chinese_soundex``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        tone_sensitive: When true, the complete codes must be equal; when
            false, only the first three characters of each code are compared.

    Returns:
        ``True`` if the codes (or their three-character prefixes) are equal,
        otherwise ``False``.
    """
    def _toned_pinyin(word):
        # pinyin() yields one list of readings per character; use the first.
        readings = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return ''.join(candidates[0] for candidates in readings)

    spellings = [_toned_pinyin(word) for word in (word1, word2)]
    left_code, right_code = (chinese_soundex(spelling) for spelling in spellings)

    if not tone_sensitive:
        return left_code[:3] == right_code[:3]
    return left_code == right_code

def fuzzy_correct_chinese(text):
    """Segment *text* and replace mis-recognised words with known terms.

    The text is split with ``jieba.lcut``. Each segment is then handled as
    follows, in order:

    1. Segments made only of ASCII letters are kept unchanged.
    2. Segments found in ``ERROR_CORRECTION`` are replaced by the mapped value
       (presumably a dict of wrong -> right spellings; verify against config).
    3. Segments that already are one of ``CORRECT_TERMS`` are kept unchanged.
    4. Otherwise the first entry of ``CORRECT_TERMS`` that
       ``compare_chinese_words`` reports as sounding the same (tone-sensitive)
       replaces the segment; if none matches, the segment is kept.

    Args:
        text: The string to correct.

    Returns:
        The corrected segments joined back into a single string.
    """
    corrected_words = []
    for word in jieba.lcut(text):
        # str.isalpha() is also True for CJK ideographs (Unicode category
        # "Lo"), so testing isalpha() alone would send every Chinese word down
        # this pass-through branch and no correction would ever be applied.
        # Only plain ASCII-letter words are skipped.
        if word.isalpha() and all(ord(ch) < 128 for ch in word):
            corrected_words.append(word)
            continue

        if word in ERROR_CORRECTION:
            corrected_words.append(ERROR_CORRECTION[word])
            continue

        # A word that is already a correct term must not be swapped for a
        # different term that merely shares its phonetic code.
        if word in CORRECT_TERMS:
            corrected_words.append(word)
            continue

        # First phonetically matching term wins; fall back to the word itself.
        replacement = next(
            (
                term
                for term in CORRECT_TERMS
                if compare_chinese_words(word, term, tone_sensitive=True)
            ),
            word,
        )
        corrected_words.append(replacement)
    return ''.join(corrected_words)