import argparse
import os
from datetime import datetime

import jieba
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI
from pypinyin import pinyin, Style

# Phonetic classes used by the Soundex-style encoder. Sounds that are easily
# confused with each other share one code symbol.
_SOUNDEX_MAP = {
    # Initials (consonants)
    'b': '1', 'p': '1', 'm': '1', 'f': '1',
    'd': '2', 't': '2', 'n': '2', 'l': '2',
    'g': '3', 'k': '3', 'h': '3',
    'j': '4', 'q': '4', 'x': '4',
    'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
    'z': '6', 'c': '6', 's': '6',
    # Easily confused finals
    'an': '7', 'ang': '7',
    'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
    'ong': '9', 'un': '9', 'uan': '9',
    # Commonly confused vowels
    'i': 'A', 'ü': 'A', 'u': 'A',
    # pypinyin's tone-number styles spell "ü" as "v" unless v_to_u=True is
    # passed, so treat "v" as an alias of "ü".
    'v': 'A',
    'e': 'B', 'o': 'B',
}

# Number of phonetic symbols kept in front of the tone digit.
_CODE_WIDTH = 3


def tone_aware_chinese_soundex(pinyin: str) -> str:
    """Encode a tone-numbered pinyin string as a 4-character Soundex-like key.

    The input is scanned left to right, always trying the longest known
    spelling first (3, then 2, then 1 characters). Digits are read as tone
    numbers; any other unknown character is skipped.

    The result is always ``CCCT``: three phonetic symbols (right-padded with
    ``'0'``) followed by one tone digit. Because the tone always sits at
    index 3, ``key[:3]`` is a tone-free key.

    Args:
        pinyin: Pinyin with trailing tone digits, e.g. ``"tan4"`` or
            ``"pan2cha2"``. (The parameter name shadows the imported
            ``pinyin`` function inside this body only; it is kept for
            backward compatibility with keyword callers.)

    Returns:
        The 4-character key, e.g. ``"2704"`` for ``"tan4"``. If several tone
        digits are present only the last one is kept; if none is present the
        tone is ``'0'``. Input that yields no phonetic symbol at all (such
        as ``""`` or ``"a1"``) returns ``"000"`` plus the tone instead of
        raising.
    """
    text = pinyin
    symbols = []
    tone = '0'
    pos = 0
    while pos < len(text):
        for width in (3, 2, 1):
            chunk = text[pos:pos + width]
            if chunk in _SOUNDEX_MAP:
                symbols.append(_SOUNDEX_MAP[chunk])
                pos += len(chunk)
                break
        else:
            # Not a known spelling: either a tone digit or a character we
            # deliberately ignore.
            if text[pos].isdigit():
                tone = text[pos]
            pos += 1

    if symbols:
        # Keep the leading symbol in place, then append the distinct
        # remaining symbols in sorted order.
        code = symbols[0] + ''.join(sorted(set(symbols[1:])))
    else:
        code = ''

    # Pad the phonetic part BEFORE appending the tone so the tone can never
    # leak into the first three characters.
    return code[:_CODE_WIDTH].ljust(_CODE_WIDTH, '0') + tone


def _to_tone3_pinyin(word: str) -> str:
    """Return the concatenated TONE3 pinyin (first reading per character) of *word*."""
    readings = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
    return ''.join(reading[0] for reading in readings)


def compare_chinese_words(word1: str, word2: str, tone_sensitive: bool = True) -> bool:
    """Tell whether two Chinese words share the same phonetic key.

    Args:
        word1: First word (Chinese characters).
        word2: Second word (Chinese characters).
        tone_sensitive: When true the full 4-character keys must match; when
            false only the first three (tone-free) characters are compared.

    Returns:
        ``True`` if the keys match under the chosen mode, else ``False``.
    """
    soundex1 = tone_aware_chinese_soundex(_to_tone3_pinyin(word1))
    soundex2 = tone_aware_chinese_soundex(_to_tone3_pinyin(word2))

    if tone_sensitive:
        return soundex1 == soundex2
    # Ignore the trailing tone digit.
    return soundex1[:_CODE_WIDTH] == soundex2[:_CODE_WIDTH]


# Demo / smoke-test routine
def test_tone_aware_chinese_soundex() -> None:
    """Print the keys and comparison results for a fixed list of word pairs.

    Each pair is shown twice: first with tone-sensitive comparison, then with
    tone-insensitive comparison.
    """
    test_cases = [
        ("碳", "看"),
        ("權", "圈"),
        ("盤查", "盤插"),
        ("排放", "拍放"),
        ("溫室", "文室"),
        ("氣體", "汽體"),
        ("管理", "觀理"),
        ("足跡", "族跡"),
        ("淨零", "精零"),
        ("變遷", "便遷"),
    ]

    modes = (
        ("聲調敏感比較:", True),
        ("\n聲調不敏感比較:", False),
    )
    for heading, tone_sensitive in modes:
        print(heading)
        for word1, word2 in test_cases:
            soundex1 = tone_aware_chinese_soundex(_to_tone3_pinyin(word1))
            soundex2 = tone_aware_chinese_soundex(_to_tone3_pinyin(word2))
            result = compare_chinese_words(word1, word2, tone_sensitive=tone_sensitive)
            print(f"{word1} ({soundex1}) vs {word2} ({soundex2}): {'相同' if result else '不同'}")


if __name__ == "__main__":
    test_tone_aware_chinese_soundex()