10 months ago · 70fff6defb
--- a/soundex.py
+++ b/soundex.py
@@ -0,0 +1,92 @@
 
				+import os
			
 
				+import argparse
			
 
				+from openai import OpenAI
			
 
				+from dotenv import load_dotenv
			
 
				+import tiktoken
			
 
				+from pypinyin import pinyin, Style
			
 
				+import jieba
			
 
				+from datetime import datetime
			
 
				+
			
 
				+def tone_aware_chinese_soundex(pinyin):
			
 
				+    soundex_map = {
			
 
				+        # 聲母（輔音）
			
 
				+        'b': '1', 'p': '1', 'm': '1', 'f': '1',
			
 
				+        'd': '2', 't': '2', 'n': '2', 'l': '2',
			
 
				+        'g': '3', 'k': '3', 'h': '3',
			
 
				+        'j': '4', 'q': '4', 'x': '4',
			
 
				+        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
			
 
				+        'z': '6', 'c': '6', 's': '6',
			
 
				+        # 容易混淆的音
			
 
				+        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
			
 
				+        'ong': '9', 'un': '9', 'uan': '9',
			
 
				+        # 常見的元音混淆
			
 
				+        'i': 'A', 'ü': 'A', 'u': 'A',
			
 
				+        'e': 'B', 'o': 'B',
			
 
				+    }
			
 
				+    
			
 
				+    code = ''
			
 
				+    tone = '0'
			
 
				+    i = 0
			
 
				+    while i < len(pinyin):
			
 
				+        if pinyin[i:i+3] in soundex_map:
			
 
				+            code += soundex_map[pinyin[i:i+3]]
			
 
				+            i += 3
			
 
				+        elif pinyin[i:i+2] in soundex_map:
			
 
				+            code += soundex_map[pinyin[i:i+2]]
			
 
				+            i += 2
			
 
				+        elif pinyin[i] in soundex_map:
			
 
				+            code += soundex_map[pinyin[i]]
			
 
				+            i += 1
			
 
				+        elif pinyin[i].isdigit():
			
 
				+            tone = pinyin[i]
			
 
				+            i += 1
			
 
				+        else:
			
 
				+            i += 1  # 跳過未知字符
			
 
				+    
			
 
				+    # 保留第一個字符，去除重複，並填充到3個字符
			
 
				+    code = code[0] + ''.join(sorted(set(code[1:])))
			
 
				+    return (code[:3] + tone).ljust(4, '0')
			
 
				+
			
 
				+def compare_chinese_words(word1, word2, tone_sensitive=True):
			
 
				+    pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)])
			
 
				+    pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)])
			
 
				+    
			
 
				+    soundex1 = tone_aware_chinese_soundex(pinyin1)
			
 
				+    soundex2 = tone_aware_chinese_soundex(pinyin2)
			
 
				+    
			
 
				+    if tone_sensitive:
			
 
				+        return soundex1 == soundex2
			
 
				+    else:
			
 
				+        return soundex1[:3] == soundex2[:3]  # 忽略聲調比較
			
 
				+
			
 
				+# 測試函數
			
 
				+def test_tone_aware_chinese_soundex():
			
 
				+    test_cases = [
			
 
				+        ("碳", "看"),
			
 
				+        ("權", "圈"),
			
 
				+        ("盤查", "盤插"),
			
 
				+        ("排放", "拍放"),
			
 
				+        ("溫室", "文室"),
			
 
				+        ("氣體", "汽體"),
			
 
				+        ("管理", "觀理"),
			
 
				+        ("足跡", "族跡"),
			
 
				+        ("淨零", "精零"),
			
 
				+        ("變遷", "便遷"),
			
 
				+    ]
			
 
				+    
			
 
				+    print("聲調敏感比較:")
			
 
				+    for word1, word2 in test_cases:
			
 
				+        soundex1 = tone_aware_chinese_soundex(''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)]))
			
 
				+        soundex2 = tone_aware_chinese_soundex(''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)]))
			
 
				+        result = compare_chinese_words(word1, word2, tone_sensitive=True)
			
 
				+        print(f"{word1} ({soundex1}) vs {word2} ({soundex2}): {'相同' if result else '不同'}")
			
 
				+    
			
 
				+    print("\n聲調不敏感比較:")
			
 
				+    for word1, word2 in test_cases:
			
 
				+        soundex1 = tone_aware_chinese_soundex(''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)]))
			
 
				+        soundex2 = tone_aware_chinese_soundex(''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)]))
			
 
				+        result = compare_chinese_words(word1, word2, tone_sensitive=False)
			
 
				+        print(f"{word1} ({soundex1}) vs {word2} ({soundex2}): {'相同' if result else '不同'}")
			
 
				+
			
 
				+# Run the demo comparisons only when executed as a script, not on import.
				+if __name__ == "__main__":
			
 
				+    test_tone_aware_chinese_soundex()