Prechádzať zdrojové kódy

improved soundex index and error handling

SherryLiu pred 6 mesiacmi
rodič
commit
70fff6defb
1 zmenený súbor, 92 pridaní a 0 odobraní
  1. 92 0
      soundex.py

+ 92 - 0
soundex.py

@@ -0,0 +1,92 @@
+import os
+import argparse
+from openai import OpenAI
+from dotenv import load_dotenv
+import tiktoken
+from pypinyin import pinyin, Style
+import jieba
+from datetime import datetime
+
+def tone_aware_chinese_soundex(pinyin):
+    """Encode a tone-numbered pinyin string as a 4-character phonetic key.
+
+    The string is scanned left to right. At each position the longest
+    matching entry of the sound table (3, then 2, then 1 characters) is
+    replaced by its class symbol, a digit is recorded as the tone, and any
+    other character is skipped.
+
+    Args:
+        pinyin: Pinyin text with tone digits, e.g. ``"guan3li3"``. When the
+            text contains several digits, the last one is used as the tone.
+            (Inside this function the name shadows the imported ``pinyin``
+            helper; it is kept so keyword callers keep working.)
+
+    Returns:
+        A 4-character string: three sound symbols, right-padded with
+        ``'0'``, followed by the tone digit (``'0'`` if no digit was seen).
+        Input that yields no sound symbol at all gives ``'000'`` plus the
+        tone instead of raising.
+    """
+    soundex_map = {
+        # Initials (consonants)
+        'b': '1', 'p': '1', 'm': '1', 'f': '1',
+        'd': '2', 't': '2', 'n': '2', 'l': '2',
+        'g': '3', 'k': '3', 'h': '3',
+        'j': '4', 'q': '4', 'x': '4',
+        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
+        'z': '6', 'c': '6', 's': '6',
+        # Finals that are easily confused with each other
+        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
+        'ong': '9', 'un': '9', 'uan': '9',
+        # Commonly confused vowels
+        'i': 'A', 'ü': 'A', 'u': 'A',
+        'e': 'B', 'o': 'B',
+    }
+
+    symbols = []
+    tone = '0'
+    i = 0
+    while i < len(pinyin):
+        # Longest match first so that e.g. 'ang' wins over 'an'.
+        for width in (3, 2, 1):
+            chunk = pinyin[i:i + width]
+            if chunk in soundex_map:
+                symbols.append(soundex_map[chunk])
+                i += width
+                break
+        else:
+            if pinyin[i].isdigit():
+                tone = pinyin[i]
+            # Digits and unknown characters both consume one position.
+            i += 1
+
+    if symbols:
+        # Keep the first symbol, then the distinct remaining symbols in
+        # sorted order.
+        code = symbols[0] + ''.join(sorted(set(symbols[1:])))
+    else:
+        # Nothing was encoded (empty input or only unmapped characters);
+        # indexing symbols[0] here would raise IndexError.
+        code = ''
+
+    # Pad the sound part to exactly 3 characters BEFORE appending the tone,
+    # so the tone always sits at index 3 and a [:3] slice never contains it.
+    return code[:3].ljust(3, '0') + tone
+
+def compare_chinese_words(word1, word2, tone_sensitive=True):
+    """Report whether two words produce the same phonetic key.
+
+    Each word is converted to tone-numbered pinyin (``Style.TONE3`` with the
+    neutral tone written as 5), the syllables are concatenated, and the
+    result is encoded with ``tone_aware_chinese_soundex``.
+
+    Args:
+        word1: First word to compare.
+        word2: Second word to compare.
+        tone_sensitive: When true, the complete keys must be equal. When
+            false, only the first three characters of each key are
+            compared, which is meant to leave the tone out.
+
+    Returns:
+        ``True`` if the keys match under the selected mode, else ``False``.
+    """
+    spellings = [
+        ''.join(
+            syllable[0]
+            for syllable in pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
+        )
+        for word in (word1, word2)
+    ]
+    key_a, key_b = [tone_aware_chinese_soundex(text) for text in spellings]
+
+    if not tone_sensitive:
+        # Compare only the sound prefix of each key.
+        return key_a[:3] == key_b[:3]
+    return key_a == key_b
+
+# Demo routine
+def test_tone_aware_chinese_soundex():
+    """Print the key of each sample word pair and whether the pair matches.
+
+    The sample pairs are listed twice: first compared with
+    ``tone_sensitive=True``, then with ``tone_sensitive=False``. Nothing is
+    asserted or returned; the results are only written to standard output.
+    """
+    test_cases = [
+        ("碳", "看"),
+        ("權", "圈"),
+        ("盤查", "盤插"),
+        ("排放", "拍放"),
+        ("溫室", "文室"),
+        ("氣體", "汽體"),
+        ("管理", "觀理"),
+        ("足跡", "族跡"),
+        ("淨零", "精零"),
+        ("變遷", "便遷"),
+    ]
+
+    def encode(word):
+        # Same conversion that compare_chinese_words applies to each word.
+        syllables = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
+        return tone_aware_chinese_soundex(''.join(s[0] for s in syllables))
+
+    # (heading, tone_sensitive) for each pass; the second heading carries
+    # the blank line that separates the two listings.
+    passes = (
+        ("聲調敏感比較:", True),
+        ("\n聲調不敏感比較:", False),
+    )
+    for heading, sensitive in passes:
+        print(heading)
+        for word1, word2 in test_cases:
+            soundex1 = encode(word1)
+            soundex2 = encode(word2)
+            result = compare_chinese_words(word1, word2, tone_sensitive=sensitive)
+            print(f"{word1} ({soundex1}) vs {word2} ({soundex2}): {'相同' if result else '不同'}")
+
+# Script entry point: run the demo comparisons when this file is executed
+# directly rather than imported.
+if __name__ == "__main__":
+    test_tone_aware_chinese_soundex()