|
@@ -0,0 +1,92 @@
|
|
|
+import os
|
|
|
+import argparse
|
|
|
+from openai import OpenAI
|
|
|
+from dotenv import load_dotenv
|
|
|
+import tiktoken
|
|
|
+from pypinyin import pinyin, Style
|
|
|
+import jieba
|
|
|
+from datetime import datetime
|
|
|
+
|
|
|
# Phonetic class for each pinyin fragment. Fragments that are easily confused
# with each other share the same one-character class symbol.
_SOUNDEX_MAP = {
    # Initials (consonants)
    'b': '1', 'p': '1', 'm': '1', 'f': '1',
    'd': '2', 't': '2', 'n': '2', 'l': '2',
    'g': '3', 'k': '3', 'h': '3',
    'j': '4', 'q': '4', 'x': '4',
    'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
    'z': '6', 'c': '6', 's': '6',
    # Easily confused finals
    'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
    'ong': '9', 'un': '9', 'uan': '9',
    # Commonly confused vowels
    # 'v' is the usual ASCII stand-in for 'ü' (pypinyin appears to emit it by
    # default -- confirm against the pypinyin docs), so it gets the same class.
    'i': 'A', 'ü': 'A', 'v': 'A', 'u': 'A',
    'e': 'B', 'o': 'B',
}


def tone_aware_chinese_soundex(pinyin):
    """Return a 4-character soundex-style code for a pinyin string.

    The input is scanned left to right. At each position the longest fragment
    (3, then 2, then 1 characters) found in the lookup table contributes its
    class symbol; a digit is recorded as the tone; anything else is skipped.

    Args:
        pinyin: Pinyin text with optional numeric tone marks, e.g. ``"zhang1"``.
            (The parameter name shadows the imported ``pinyin`` function inside
            this function only; it is kept for backward compatibility.)

    Returns:
        A string of exactly four characters: up to three class symbols,
        right-padded with ``'0'`` to three characters, followed by the tone
        digit. If several digits occur, the last one wins; with no digit the
        tone is ``'0'``. Input that yields no symbols returns ``'000'`` plus
        the tone instead of raising ``IndexError``.
    """
    symbols = []
    tone = '0'
    i = 0
    while i < len(pinyin):
        # Longest match first so that 'zh' / 'ang' win over 'z' / 'an'.
        for width in (3, 2, 1):
            fragment = pinyin[i:i + width]
            if fragment in _SOUNDEX_MAP:
                symbols.append(_SOUNDEX_MAP[fragment])
                i += width
                break
        else:
            if pinyin[i].isdigit():
                tone = pinyin[i]
            # Digits and unknown characters both advance by one position.
            i += 1

    code = ''.join(symbols)
    if code:
        # Keep the first symbol; de-duplicate the rest. Sorting makes the
        # result deterministic, so the tail is order-insensitive.
        code = code[0] + ''.join(sorted(set(code[1:])))

    # Pad the symbols to three characters BEFORE appending the tone, so the
    # tone always sits at index 3 and code[:3] never contains it.
    return code[:3].ljust(3, '0') + tone
|
|
|
+
|
|
|
def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Report whether two Chinese words produce the same soundex code.

    Each word is converted with ``pinyin(..., style=Style.TONE3,
    neutral_tone_with_five=True)``; the first element of every returned entry
    is concatenated and passed to ``tone_aware_chinese_soundex``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        tone_sensitive: When true, the full codes must match. When false,
            only the first three characters of each code are compared.

    Returns:
        ``True`` if the (possibly truncated) codes are equal, else ``False``.
    """
    codes = []
    for word in (word1, word2):
        entries = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        joined = ''.join(entry[0] for entry in entries)
        codes.append(tone_aware_chinese_soundex(joined))
    first, second = codes

    if not tone_sensitive:
        # Compare without the tone: only the leading three characters count.
        return first[:3] == second[:3]
    return first == second
|
|
|
+
|
|
|
+# 測試函數
|
|
|
def test_tone_aware_chinese_soundex():
    """Print a comparison table for a fixed list of word pairs.

    The list is printed twice: first with tone-sensitive comparison, then
    with tone-insensitive comparison. Each line shows both words, their
    soundex codes, and whether ``compare_chinese_words`` judged them equal.
    Nothing is asserted or returned; this is a manual demo.
    """
    word_pairs = [
        ("碳", "看"),
        ("權", "圈"),
        ("盤查", "盤插"),
        ("排放", "拍放"),
        ("溫室", "文室"),
        ("氣體", "汽體"),
        ("管理", "觀理"),
        ("足跡", "族跡"),
        ("淨零", "精零"),
        ("變遷", "便遷"),
    ]

    def encode(word):
        # Same conversion that compare_chinese_words applies internally.
        entries = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return tone_aware_chinese_soundex(''.join(entry[0] for entry in entries))

    modes = (
        ("聲調敏感比較:", True),
        ("\n聲調不敏感比較:", False),
    )
    for heading, sensitive in modes:
        print(heading)
        for left, right in word_pairs:
            left_code = encode(left)
            right_code = encode(right)
            matched = compare_chinese_words(left, right, tone_sensitive=sensitive)
            verdict = '相同' if matched else '不同'
            print(f"{left} ({left_code}) vs {right} ({right_code}): {verdict}")
|
|
|
+
|
|
|
+if __name__ == "__main__":  # Run the demo only when executed as a script, not on import.
|
|
|
+    test_tone_aware_chinese_soundex()  # Prints the comparison table to stdout.
|