1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- import os
- import argparse
- from openai import OpenAI
- from dotenv import load_dotenv
- import tiktoken
- from pypinyin import pinyin, Style
- import jieba
- from datetime import datetime
def tone_aware_chinese_soundex(pinyin: str) -> str:
    """Encode a tone-numbered pinyin string as a 4-character phonetic key.

    The input is scanned left to right, greedily matching the longest known
    fragment (3, then 2, then 1 characters) against a table that places
    easily-confused sounds in the same class. Digits are read as tone marks;
    when several digits occur (multi-syllable input) only the last one is
    kept. Any other unknown character is skipped.

    The key has a fixed layout so callers can slice it reliably:

    * characters 0-2: the first class symbol as encountered, followed by the
      distinct remaining symbols in sorted order, truncated to three
      characters and right-padded with ``'0'``;
    * character 3: the tone (``'0'`` when the input carries no digit).

    Args:
        pinyin: Pinyin text with tone digits, e.g. ``"tan4"`` or
            ``"pan2cha2"``.

    Returns:
        A 4-character key; ``"0000"`` for empty input.
    """
    soundex_map = {
        # Initials (consonants)
        'b': '1', 'p': '1', 'm': '1', 'f': '1',
        'd': '2', 't': '2', 'n': '2', 'l': '2',
        'g': '3', 'k': '3', 'h': '3',
        'j': '4', 'q': '4', 'x': '4',
        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
        'z': '6', 'c': '6', 's': '6',
        # Easily confused finals
        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
        'ong': '9', 'un': '9', 'uan': '9',
        # Commonly confused vowels
        'i': 'A', 'ü': 'A', 'u': 'A',
        # 'v' is the ASCII stand-in for 'ü' (pypinyin emits it unless
        # v_to_u=True is requested), so it must land in the same class.
        'v': 'A',
        'e': 'B', 'o': 'B',
    }

    symbols = []
    tone = '0'
    pos = 0
    while pos < len(pinyin):
        # Longest match first so that e.g. "ang" wins over "an" and "zh" over "z".
        for width in (3, 2, 1):
            fragment = pinyin[pos:pos + width]
            if fragment in soundex_map:
                symbols.append(soundex_map[fragment])
                pos += width
                break
        else:
            if pinyin[pos].isdigit():
                tone = pinyin[pos]
            # Digits and unknown characters both consume exactly one position.
            pos += 1

    code = ''.join(symbols)
    if code:
        # Keep the first symbol, then the distinct remaining symbols (sorted).
        code = code[0] + ''.join(sorted(set(code[1:])))

    # Pad the code *before* appending the tone so the tone always sits at
    # index 3 and never leaks into the first three characters.
    return code[:3].ljust(3, '0') + tone
def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Report whether two Chinese words share the same phonetic key.

    Each word is converted to tone-numbered pinyin (``Style.TONE3``, with the
    neutral tone written as 5), the first reading of every character is
    joined, and the result is encoded with ``tone_aware_chinese_soundex``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        tone_sensitive: When True the tone recorded in the key must match as
            well. When False tones are ignored.

    Returns:
        True when the two keys are equal under the chosen mode.
    """
    def _phonetic_key(word):
        readings = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        spelled = ''.join(candidates[0] for candidates in readings)
        if not tone_sensitive:
            # Remove tone digits up front rather than slicing the key
            # afterwards: the position of the tone inside the key is an
            # encoder detail, and slicing [:3] can still include it when the
            # code part is shorter than three symbols.
            spelled = ''.join(ch for ch in spelled if not ch.isdigit())
        return tone_aware_chinese_soundex(spelled)

    return _phonetic_key(word1) == _phonetic_key(word2)
# Test function
def test_tone_aware_chinese_soundex():
    """Print the keys and comparison verdict for a fixed list of word pairs.

    The list is printed twice: first with tone-sensitive comparison, then
    with tone-insensitive comparison. Nothing is asserted; the output is
    meant for visual inspection.
    """
    word_pairs = [
        ("碳", "看"),
        ("權", "圈"),
        ("盤查", "盤插"),
        ("排放", "拍放"),
        ("溫室", "文室"),
        ("氣體", "汽體"),
        ("管理", "觀理"),
        ("足跡", "族跡"),
        ("淨零", "精零"),
        ("變遷", "便遷"),
    ]

    def encode(word):
        # Same conversion the comparison function performs internally.
        readings = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return tone_aware_chinese_soundex(''.join(r[0] for r in readings))

    passes = (
        ("聲調敏感比較:", True),
        ("\n聲調不敏感比較:", False),
    )
    for heading, sensitive in passes:
        print(heading)
        for left, right in word_pairs:
            key_left = encode(left)
            key_right = encode(right)
            same = compare_chinese_words(left, right, tone_sensitive=sensitive)
            verdict = '相同' if same else '不同'
            print(f"{left} ({key_left}) vs {right} ({key_right}): {verdict}")
if __name__ == "__main__":
    # Run the demonstration only when executed as a script, not on import.
    test_tone_aware_chinese_soundex()
|