123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- """ from https://github.com/keithito/tacotron """
- import re
# Every symbol accepted in a pronunciation, in the order this module exposes
# them. Vowel symbols are listed bare and with a 0/1/2 digit suffix; the
# remaining symbols have no suffix. Kept as a whitespace-separated table so
# the full inventory can be read at a glance.
valid_symbols = (
    "AA AA0 AA1 AA2 AE AE0 AE1 AE2 AH AH0 AH1 AH2 "
    "AO AO0 AO1 AO2 AW AW0 AW1 AW2 AY AY0 AY1 AY2 "
    "B CH D DH "
    "EH EH0 EH1 EH2 ER ER0 ER1 ER2 EY EY0 EY1 EY2 "
    "F G HH "
    "IH IH0 IH1 IH2 IY IY0 IY1 IY2 "
    "JH K L M N NG "
    "OW OW0 OW1 OW2 OY OY0 OY1 OY2 "
    "P R S SH T TH "
    "UH UH0 UH1 UH2 UW UW0 UW1 UW2 "
    "V W Y Z ZH"
).split()

# Set form of the same symbols, for constant-time membership tests.
_valid_symbol_set = set(valid_symbols)
class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict

    Entries are stored in a dict mapping each word to a list of pronunciation
    strings, as returned by ``_parse_cmudict``.
    """

    def __init__(self, file_or_path, keep_ambiguous=True):
        """Load dictionary entries from a path or from an iterable of lines.

        Args:
            file_or_path: Either a filesystem path (``str`` or any
                ``os.PathLike``), which is opened as latin-1 text, or an
                object that yields text lines when iterated (for example an
                already-open file).
            keep_ambiguous: When False, keep only the words that have exactly
                one pronunciation.
        """
        # Function-scope import so this block does not depend on the file's
        # top-level import list.
        import os

        # Path objects are accepted alongside plain strings; anything else is
        # assumed to be an iterable of lines and handed to the parser as-is.
        if isinstance(file_or_path, (str, os.PathLike)):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        """Return the number of words held in the dictionary."""
        return len(self._entries)

    def lookup(self, word):
        """Return the list of ARPAbet pronunciations of ``word``, or None if unknown.

        The word is upper-cased before it is looked up.
        """
        return self._entries.get(word.upper())
# Matches a parenthesised number such as "(1)". It is stripped from the word
# column so that numbered variants of a word share a single key (presumably
# the marker for alternate pronunciations, per the name).
_alt_re = re.compile(r"\([0-9]+\)")


def _parse_cmudict(file):
    """Build a word -> list-of-pronunciations dict from an iterable of lines.

    Args:
        file: Any iterable yielding text lines (for example an open file).

    Returns:
        A dict mapping each word to the list of its accepted pronunciation
        strings, in the order they were encountered.

    Lines are skipped silently when they do not start with ``A``-``Z`` or an
    apostrophe, when they lack the two-space separator between the word and
    its pronunciation, or when ``_get_pronunciation`` rejects the
    pronunciation.
    """
    cmudict = {}
    for line in file:
        # Anything not starting with an upper-case letter or apostrophe is
        # not treated as an entry.
        if not line or not ("A" <= line[0] <= "Z" or line[0] == "'"):
            continue
        parts = line.split("  ")
        if len(parts) < 2:
            # No two-space separator, so there is no pronunciation column;
            # skip the line instead of raising IndexError.
            continue
        word = _alt_re.sub("", parts[0])
        pronunciation = _get_pronunciation(parts[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
def _get_pronunciation(s):
    """Validate and normalise one pronunciation string.

    Surrounding whitespace is stripped and the rest is split on single
    spaces. If every resulting token is in ``_valid_symbol_set`` the tokens
    are returned joined by single spaces; otherwise None is returned.
    """
    symbols = s.strip().split(" ")
    if all(symbol in _valid_symbol_set for symbol in symbols):
        return " ".join(symbols)
    return None
|