ljspeech.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. import os
  2. import librosa
  3. import numpy as np
  4. from scipy.io import wavfile
  5. from tqdm import tqdm
  6. from text import _clean_text
  7. def prepare_align(config):
  8. in_dir = config["path"]["corpus_path"]
  9. out_dir = config["path"]["raw_path"]
  10. sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
  11. max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]
  12. cleaners = config["preprocessing"]["text"]["text_cleaners"]
  13. speaker = "LJSpeech"
  14. with open(os.path.join(in_dir, "metadata.csv"), encoding="utf-8") as f:
  15. for line in tqdm(f):
  16. parts = line.strip().split("|")
  17. base_name = parts[0]
  18. text = parts[2]
  19. text = _clean_text(text, cleaners)
  20. wav_path = os.path.join(in_dir, "wavs", "{}.wav".format(base_name))
  21. if os.path.exists(wav_path):
  22. os.makedirs(os.path.join(out_dir, speaker), exist_ok=True)
  23. wav, _ = librosa.load(wav_path, sampling_rate)
  24. wav = wav / max(abs(wav)) * max_wav_value
  25. wavfile.write(
  26. os.path.join(out_dir, speaker, "{}.wav".format(base_name)),
  27. sampling_rate,
  28. wav.astype(np.int16),
  29. )
  30. with open(
  31. os.path.join(out_dir, speaker, "{}.lab".format(base_name)),
  32. "w",
  33. ) as f1:
  34. f1.write(text)