cmudict.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. """ from https://github.com/keithito/tacotron """
  2. import re
  3. valid_symbols = [
  4. "AA",
  5. "AA0",
  6. "AA1",
  7. "AA2",
  8. "AE",
  9. "AE0",
  10. "AE1",
  11. "AE2",
  12. "AH",
  13. "AH0",
  14. "AH1",
  15. "AH2",
  16. "AO",
  17. "AO0",
  18. "AO1",
  19. "AO2",
  20. "AW",
  21. "AW0",
  22. "AW1",
  23. "AW2",
  24. "AY",
  25. "AY0",
  26. "AY1",
  27. "AY2",
  28. "B",
  29. "CH",
  30. "D",
  31. "DH",
  32. "EH",
  33. "EH0",
  34. "EH1",
  35. "EH2",
  36. "ER",
  37. "ER0",
  38. "ER1",
  39. "ER2",
  40. "EY",
  41. "EY0",
  42. "EY1",
  43. "EY2",
  44. "F",
  45. "G",
  46. "HH",
  47. "IH",
  48. "IH0",
  49. "IH1",
  50. "IH2",
  51. "IY",
  52. "IY0",
  53. "IY1",
  54. "IY2",
  55. "JH",
  56. "K",
  57. "L",
  58. "M",
  59. "N",
  60. "NG",
  61. "OW",
  62. "OW0",
  63. "OW1",
  64. "OW2",
  65. "OY",
  66. "OY0",
  67. "OY1",
  68. "OY2",
  69. "P",
  70. "R",
  71. "S",
  72. "SH",
  73. "T",
  74. "TH",
  75. "UH",
  76. "UH0",
  77. "UH1",
  78. "UH2",
  79. "UW",
  80. "UW0",
  81. "UW1",
  82. "UW2",
  83. "V",
  84. "W",
  85. "Y",
  86. "Z",
  87. "ZH",
  88. ]
  89. _valid_symbol_set = set(valid_symbols)
  90. class CMUDict:
  91. """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
  92. def __init__(self, file_or_path, keep_ambiguous=True):
  93. if isinstance(file_or_path, str):
  94. with open(file_or_path, encoding="latin-1") as f:
  95. entries = _parse_cmudict(f)
  96. else:
  97. entries = _parse_cmudict(file_or_path)
  98. if not keep_ambiguous:
  99. entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
  100. self._entries = entries
  101. def __len__(self):
  102. return len(self._entries)
  103. def lookup(self, word):
  104. """Returns list of ARPAbet pronunciations of the given word."""
  105. return self._entries.get(word.upper())
  106. _alt_re = re.compile(r"\([0-9]+\)")
  107. def _parse_cmudict(file):
  108. cmudict = {}
  109. for line in file:
  110. if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
  111. parts = line.split(" ")
  112. word = re.sub(_alt_re, "", parts[0])
  113. pronunciation = _get_pronunciation(parts[1])
  114. if pronunciation:
  115. if word in cmudict:
  116. cmudict[word].append(pronunciation)
  117. else:
  118. cmudict[word] = [pronunciation]
  119. return cmudict
  120. def _get_pronunciation(s):
  121. parts = s.strip().split(" ")
  122. for part in parts:
  123. if part not in _valid_symbol_set:
  124. return None
  125. return " ".join(parts)