#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2019/11/30
  5. """
  6. ### audio_normalizer
  7. 语音正则化,去除音量低的音频段(去除静音),调节音量。
  8. 语音正则化方法基于VAD的方法。
  9. """
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(Path(__name__).stem)

import struct
from typing import Optional, Union

import librosa
import numpy as np
from scipy.ndimage import binary_dilation

from .audio_io import Dict2Obj, _sr, _int16_max

try:
    import webrtcvad
except ImportError as e:
    logger.info("ImportError: {}".format(e))

# Default hyperparameters
default_hparams = Dict2Obj(dict(
    int16_max=(2 ** 15) - 1,
    ## Mel-filterbank
    mel_window_length=25,  # In milliseconds
    mel_window_step=10,  # In milliseconds
    mel_n_channels=40,
    ## Audio
    sample_rate=16000,  # sampling_rate
    # Number of spectrogram frames in a partial utterance
    partials_n_frames=160,  # 1600 ms
    # Number of spectrogram frames at inference
    inference_n_frames=80,  # 800 ms
    ## Voice Activation Detection
    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
    # This sets the granularity of the VAD. Should not need to be changed.
    vad_window_length=30,  # In milliseconds
    # Number of frames to average together when performing the moving average smoothing.
    # The larger this value, the larger the VAD variations must be to not get smoothed out.
    vad_moving_average_width=8,
    # Maximum number of consecutive silent frames a segment can have.
    vad_max_silence_length=6,
    ## Audio volume normalization
    audio_norm_target_dBFS=-30,
))
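
# A minimal sketch of overriding a hyperparameter (an assumption: the Dict2Obj
# wrapper from .audio_io exposes the wrapped dict's keys through its __dict__,
# which is not guaranteed by anything in this module):
#   my_hparams = Dict2Obj(dict(default_hparams.__dict__, audio_norm_target_dBFS=-24))
#   wav = preprocess_wav("speech.wav", hparams=my_hparams)  # "speech.wav" is a placeholder
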
def remove_silence(wav, sr=_sr, max_silence_ms=20):
    """
    Remove silence from audio using WebRTC VAD.
    :param wav: waveform as a float numpy array.
    :param sr: sample rate of wav.
    :param max_silence_ms: unit: ms.
    :return: waveform with the silent segments removed.
    """
    # Resample to the rate expected by the VAD
    wav = librosa.resample(wav, orig_sr=sr, target_sr=_sr)
    # Compute the voice detection window size
    vad_window_length = 20
    vad_moving_average_width = 10
    samples_per_window = (vad_window_length * _sr) // 1000
    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * _int16_max)).astype(np.int16))
    # Perform voice activation detection; byte offsets are doubled because
    # each int16 sample occupies two bytes in the PCM buffer
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], sample_rate=_sr))
    voice_flags = np.array(voice_flags)
    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)
    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(max_silence_ms + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    out = wav[audio_mask]
    # Resample back to the caller's sample rate
    out = librosa.resample(out, orig_sr=_sr, target_sr=sr)
    return out
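
# A minimal usage sketch (hypothetical; "speech.wav" is a placeholder path):
#   wav, sr = librosa.load("speech.wav", sr=None)
#   trimmed = remove_silence(wav, sr=sr)
#   assert len(trimmed) <= len(wav)  # silent windows have been dropped
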
def tune_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """
    Adjust the volume.
    :param wav: waveform as a float numpy array.
    :param target_dBFS: target volume in dBFS.
    :param increase_only: whether to only allow the volume to increase.
    :param decrease_only: whether to only allow the volume to decrease.
    :return: the rescaled waveform.
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    rms = np.sqrt(np.mean((wav * _int16_max) ** 2))
    wave_dBFS = 20 * np.log10(rms / _int16_max)
    dBFS_change = target_dBFS - wave_dBFS
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
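
# Worked example: a full-scale sine wave has rms = _int16_max / sqrt(2), so
# wave_dBFS = 20 * log10(1 / sqrt(2)) ≈ -3.01 dBFS. Reaching the default target
# of -30 dBFS therefore scales the waveform by 10 ** ((-30 + 3.01) / 20) ≈ 0.045:
#   t = np.linspace(0, 1, 16000, endpoint=False)
#   quiet = tune_volume(np.sin(2 * np.pi * 440 * t), target_dBFS=-30)
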
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None, hparams=None):
    """
    Preprocess audio: remove silence and set the volume.
    :param fpath_or_wav: path of an audio file, or a waveform as a numpy array.
    :param source_sr: sample rate of the waveform, if a waveform is passed.
    :param hparams: hyperparameters; defaults to default_hparams.
    :return: the preprocessed waveform.
    """
    hparams = hparams or default_hparams
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(fpath_or_wav, sr=None)
    else:
        wav = fpath_or_wav
    # Resample the wav if needed
    if source_sr is not None and source_sr != hparams.sample_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=hparams.sample_rate)
    # Apply the preprocessing: normalize volume and shorten long silences
    wav = tune_volume(wav, hparams.audio_norm_target_dBFS, increase_only=True)
    wav = trim_long_silences(wav, hparams=hparams)
    return wav

# Smooth the voice detection with a moving average
def moving_average(array, width):
    array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
    ret = np.cumsum(array_padded, dtype=float)
    ret[width:] = ret[width:] - ret[:-width]
    return ret[width - 1:] / width
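
# Worked example: width=3 pads one zero on each side, so
#   moving_average(np.array([0, 1, 1, 1, 0]), 3)
# returns approximately [0.333, 0.667, 1.0, 0.667, 0.333] (a centered average
# that preserves the input length).
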
def trim_long_silences(wav, hparams):
    """Remove silence from audio. (Kept for compatibility with older versions.)"""
    hparams = hparams or default_hparams
    wav = remove_silence(wav,
                         sr=hparams.sample_rate,
                         max_silence_ms=hparams.vad_max_silence_length)
    return wav

if __name__ == "__main__":
    print(__file__)
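    # A minimal usage sketch (hypothetical; "speech.wav" is a placeholder path):
    # wav = preprocess_wav("speech.wav")
    # print(wav.shape, wav.dtype)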