aishell3.py

import os

import librosa
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm


def prepare_align(config):
    """Resample AISHELL-3 wavs and write per-utterance .lab transcripts for alignment."""
    in_dir = config["path"]["corpus_path"]
    out_dir = config["path"]["raw_path"]
    sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
    max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]
    for dataset in ["train", "test"]:
        print("Processing {}ing set...".format(dataset))
        # Each line of content.txt is "<wav_name>\t<transcript>", where the
        # transcript alternates Hanzi characters with their pinyin.
        with open(os.path.join(in_dir, dataset, "content.txt"), encoding="utf-8") as f:
            for line in tqdm(f):
                wav_name, text = line.strip("\n").split("\t")
                speaker = wav_name[:7]  # speaker ID is the first 7 characters of the file name
                text = text.split(" ")[1::2]  # keep only the pinyin tokens
                # AISHELL-3 stores audio under <dataset>/wav/<speaker>/<wav_name>.
                wav_path = os.path.join(in_dir, dataset, "wav", speaker, wav_name)
                if os.path.exists(wav_path):
                    os.makedirs(os.path.join(out_dir, speaker), exist_ok=True)
                    # Load at the target rate, peak-normalize, and save as 16-bit PCM.
                    wav, _ = librosa.load(wav_path, sr=sampling_rate)
                    wav = wav / max(abs(wav)) * max_wav_value
                    wavfile.write(
                        os.path.join(out_dir, speaker, wav_name),
                        sampling_rate,
                        wav.astype(np.int16),
                    )
                    # Write the pinyin transcript alongside the wav; wav_name[:11]
                    # drops the ".wav" extension.
                    with open(
                        os.path.join(out_dir, speaker, "{}.lab".format(wav_name[:11])),
                        "w",
                    ) as f1:
                        f1.write(" ".join(text))
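

if __name__ == "__main__":
    # Minimal usage sketch: the nested dict below mirrors the config keys read by
    # prepare_align. The paths and numeric values are illustrative assumptions,
    # not the project's actual defaults.
    config = {
        "path": {
            "corpus_path": "./AISHELL-3",        # hypothetical corpus location
            "raw_path": "./raw_data/AISHELL3",   # hypothetical output location
        },
        "preprocessing": {
            "audio": {
                "sampling_rate": 22050,          # assumed target sampling rate
                "max_wav_value": 32768.0,        # int16 full-scale amplitude
            }
        },
    }
    prepare_align(config)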