# sherry / ASR
							import os
import argparse
from openai import OpenAI
from dotenv import load_dotenv
import tiktoken
from pypinyin import pinyin, Style
import jieba
from datetime import datetime

def tone_aware_chinese_soundex(pinyin):
    """Encode a tone-numbered pinyin string as a 4-character phonetic key.

    The string is scanned left to right, greedily matching the longest
    (3, then 2, then 1 character) entry of the confusion table below, so
    that similar-sounding initials and finals collapse onto the same symbol.
    Digits are treated as tone marks; when several occur, the last one wins.
    Characters that are neither in the table nor digits are skipped.

    Args:
        pinyin: Pinyin text with numeric tones, e.g. ``"tan4"`` or
            ``"pan2cha2"``. The parameter name shadows any module-level
            ``pinyin`` callable inside this function only.

    Returns:
        A 4-character string. The first three characters are sound symbols:
        the first matched symbol followed by the sorted, de-duplicated
        remaining symbols, truncated to three and right-padded with ``'0'``.
        The fourth character is the tone digit (``'0'`` if none was seen).
        Because the tone always sits in the last position, ``key[:3]`` is a
        tone-independent prefix.
    """
    soundex_map = {
        # Initials (consonants), grouped by place of articulation
        'b': '1', 'p': '1', 'm': '1', 'f': '1',
        'd': '2', 't': '2', 'n': '2', 'l': '2',
        'g': '3', 'k': '3', 'h': '3',
        'j': '4', 'q': '4', 'x': '4',
        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
        'z': '6', 'c': '6', 's': '6',
        # Finals that are easily confused with each other
        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
        'ong': '9', 'un': '9', 'uan': '9',
        # Commonly confused vowels ('v' is the usual ASCII stand-in for 'ü')
        'i': 'A', 'ü': 'A', 'v': 'A', 'u': 'A',
        'e': 'B', 'o': 'B',
    }

    symbols = []
    tone = '0'
    i = 0
    while i < len(pinyin):
        # Longest match first so that e.g. 'ang' is not consumed as 'an'.
        for width in (3, 2, 1):
            chunk = pinyin[i:i + width]
            if chunk in soundex_map:
                symbols.append(soundex_map[chunk])
                i += width
                break
        else:
            if pinyin[i].isdigit():
                tone = pinyin[i]
            # Unknown characters (and the digit just recorded) are skipped.
            i += 1

    code = ''.join(symbols)
    if code:
        # Keep the first symbol in place; de-duplicate and sort the rest.
        code = code[0] + ''.join(sorted(set(code[1:])))

    # Pad the sound part BEFORE appending the tone so the tone never leaks
    # into the first three characters (which callers compare tone-blind).
    return code[:3].ljust(3, '0') + tone

def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Report whether two words map to the same phonetic key.

    Each word is converted with ``pinyin(..., style=Style.TONE3,
    neutral_tone_with_five=True)``; the first reading of every character is
    concatenated and passed to ``tone_aware_chinese_soundex``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        tone_sensitive: When true, the full keys must match. When false,
            only the first three characters of each key are compared.

    Returns:
        ``True`` if the (possibly truncated) keys are equal, else ``False``.
    """
    # Spell out both words first, then encode them.
    spelled = [
        ''.join(
            readings[0]
            for readings in pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        )
        for word in (word1, word2)
    ]
    key_a, key_b = (tone_aware_chinese_soundex(text) for text in spelled)

    if not tone_sensitive:
        # Compare without the trailing part of the key (tone ignored).
        return key_a[:3] == key_b[:3]
    return key_a == key_b

# Demo / smoke-test routine
def test_tone_aware_chinese_soundex():
    """Print the phonetic keys and comparison verdict for sample word pairs.

    Every pair is printed twice: once under the tone-sensitive comparison
    and once under the tone-insensitive one. Nothing is asserted or
    returned; the output is meant for manual inspection.
    """
    # Word pairs to compare.
    test_cases = [
        ("碳", "看"),
        ("權", "圈"),
        ("盤查", "盤插"),
        ("排放", "拍放"),
        ("溫室", "文室"),
        ("氣體", "汽體"),
        ("管理", "觀理"),
        ("足跡", "族跡"),
        ("淨零", "精零"),
        ("變遷", "便遷"),
    ]

    def _key_for(word):
        # First reading of each character, joined, then encoded.
        readings = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return tone_aware_chinese_soundex(''.join(r[0] for r in readings))

    passes = (
        ("聲調敏感比較:", True),
        ("\n聲調不敏感比較:", False),
    )
    for heading, sensitive in passes:
        print(heading)
        for left, right in test_cases:
            left_key = _key_for(left)
            right_key = _key_for(right)
            matched = compare_chinese_words(left, right, tone_sensitive=sensitive)
            verdict = '相同' if matched else '不同'
            print(f"{left} ({left_key}) vs {right} ({right_key}): {verdict}")

# Run the printed comparison demo only when this file is executed as a script.
if __name__ == "__main__":
    test_tone_aware_chinese_soundex()