text_processing.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. import jieba
  2. from pypinyin import pinyin, Style
  3. from api.openai_scripts_chinese.config import CORRECT_TERMS, ERROR_CORRECTION
  4. def chinese_soundex(pinyin_str):
  5. soundex_map = {
  6. 'b': '1', 'p': '1', 'm': '1', 'f': '1',
  7. 'd': '2', 'n': '2', 'l': '2',
  8. 'g': '3', 'k': '3', 'h': '3', 't': '3',
  9. 'j': '4', 'q': '4', 'x': '4',
  10. 'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
  11. 'z': '6', 'c': '6', 's': '6',
  12. 'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
  13. 'ong': '9', 'un': '9', 'uan': '9',
  14. 'i': 'A', 'u': 'A', 'v': 'A',
  15. 'e': 'B', 'o': 'B',
  16. }
  17. code = ''
  18. tone = '0'
  19. i = 0
  20. while i < len(pinyin_str):
  21. if pinyin_str[i:i+2] in soundex_map:
  22. code += soundex_map[pinyin_str[i:i+2]]
  23. i += 2
  24. elif pinyin_str[i] in soundex_map:
  25. code += soundex_map[pinyin_str[i]]
  26. i += 1
  27. elif pinyin_str[i].isdigit():
  28. tone = pinyin_str[i]
  29. i += 1
  30. else:
  31. i += 1
  32. code = code[:1] + ''.join(sorted(set(code[1:])))
  33. return (code[:3] + tone).ljust(4, '0')
  34. def compare_chinese_words(word1, word2, tone_sensitive=True):
  35. pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)])
  36. pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)])
  37. soundex1 = chinese_soundex(pinyin1)
  38. # print(soundex1)
  39. soundex2 = chinese_soundex(pinyin2)
  40. # print('soundex2', soundex2)
  41. if tone_sensitive:
  42. return soundex1 == soundex2
  43. else:
  44. return soundex1[:3] == soundex2[:3]
  45. def fuzzy_correct_chinese(text):
  46. words = jieba.lcut(text)
  47. corrected_words = []
  48. for word in words:
  49. if word.isalpha():
  50. corrected_words.append(word)
  51. continue
  52. word_pinyin = ''.join([p[0] for p in pinyin(word, style=Style.NORMAL)])
  53. # print(f"Term: {word}, Pinyin: {word_pinyin}")
  54. if word in ERROR_CORRECTION:
  55. corrected_words.append(ERROR_CORRECTION[word])
  56. else:
  57. for term in CORRECT_TERMS:
  58. if compare_chinese_words(word, term, tone_sensitive=True):
  59. # print(f"corrected: {word} -> {term}")
  60. corrected_words.append(term)
  61. break
  62. else:
  63. corrected_words.append(word)
  64. return ''.join(corrected_words)