# soundex.py
  1. import os
  2. import argparse
  3. from openai import OpenAI
  4. from dotenv import load_dotenv
  5. import tiktoken
  6. from pypinyin import pinyin, Style
  7. import jieba
  8. from datetime import datetime
  9. def tone_aware_chinese_soundex(pinyin):
  10. soundex_map = {
  11. # 聲母(輔音)
  12. 'b': '1', 'p': '1', 'm': '1', 'f': '1',
  13. 'd': '2', 't': '2', 'n': '2', 'l': '2',
  14. 'g': '3', 'k': '3', 'h': '3',
  15. 'j': '4', 'q': '4', 'x': '4',
  16. 'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
  17. 'z': '6', 'c': '6', 's': '6',
  18. # 容易混淆的音
  19. 'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
  20. 'ong': '9', 'un': '9', 'uan': '9',
  21. # 常見的元音混淆
  22. 'i': 'A', 'ü': 'A', 'u': 'A',
  23. 'e': 'B', 'o': 'B',
  24. }
  25. code = ''
  26. tone = '0'
  27. i = 0
  28. while i < len(pinyin):
  29. if pinyin[i:i+3] in soundex_map:
  30. code += soundex_map[pinyin[i:i+3]]
  31. i += 3
  32. elif pinyin[i:i+2] in soundex_map:
  33. code += soundex_map[pinyin[i:i+2]]
  34. i += 2
  35. elif pinyin[i] in soundex_map:
  36. code += soundex_map[pinyin[i]]
  37. i += 1
  38. elif pinyin[i].isdigit():
  39. tone = pinyin[i]
  40. i += 1
  41. else:
  42. i += 1 # 跳過未知字符
  43. # 保留第一個字符,去除重複,並填充到3個字符
  44. code = code[0] + ''.join(sorted(set(code[1:])))
  45. return (code[:3] + tone).ljust(4, '0')
  46. def compare_chinese_words(word1, word2, tone_sensitive=True):
  47. pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)])
  48. pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)])
  49. soundex1 = tone_aware_chinese_soundex(pinyin1)
  50. soundex2 = tone_aware_chinese_soundex(pinyin2)
  51. if tone_sensitive:
  52. return soundex1 == soundex2
  53. else:
  54. return soundex1[:3] == soundex2[:3] # 忽略聲調比較
  55. # 測試函數
  56. def test_tone_aware_chinese_soundex():
  57. test_cases = [
  58. ("碳", "看"),
  59. ("權", "圈"),
  60. ("盤查", "盤插"),
  61. ("排放", "拍放"),
  62. ("溫室", "文室"),
  63. ("氣體", "汽體"),
  64. ("管理", "觀理"),
  65. ("足跡", "族跡"),
  66. ("淨零", "精零"),
  67. ("變遷", "便遷"),
  68. ]
  69. print("聲調敏感比較:")
  70. for word1, word2 in test_cases:
  71. soundex1 = tone_aware_chinese_soundex(''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)]))
  72. soundex2 = tone_aware_chinese_soundex(''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)]))
  73. result = compare_chinese_words(word1, word2, tone_sensitive=True)
  74. print(f"{word1} ({soundex1}) vs {word2} ({soundex2}): {'相同' if result else '不同'}")
  75. print("\n聲調不敏感比較:")
  76. for word1, word2 in test_cases:
  77. soundex1 = tone_aware_chinese_soundex(''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)]))
  78. soundex2 = tone_aware_chinese_soundex(''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)]))
  79. result = compare_chinese_words(word1, word2, tone_sensitive=False)
  80. print(f"{word1} ({soundex1}) vs {word2} ({soundex2}): {'相同' if result else '不同'}")
  81. if __name__ == "__main__":
  82. test_tone_aware_chinese_soundex()