#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: kuangdd
# date: 2019/11/30
- """
- ### audio_normalizer
- 语音正则化,去除音量低的音频段(去除静音),调节音量。
- 语音正则化方法基于VAD的方法。
- """
from pathlib import Path
from typing import Optional, Union
import logging
import struct

import numpy as np
import librosa
from scipy.ndimage import binary_dilation

from .audio_io import Dict2Obj, _sr, _int16_max

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(Path(__file__).stem)

try:
    import webrtcvad
except ImportError as e:
    logger.info("ImportError: {}".format(e))
    webrtcvad = None  # remove_silence() needs webrtcvad; fail there, not at import time

# Default hyperparameters
default_hparams = Dict2Obj(dict(
    int16_max=(2 ** 15) - 1,
    ## Mel-filterbank
    mel_window_length=25,  # In milliseconds
    mel_window_step=10,  # In milliseconds
    mel_n_channels=40,
    ## Audio
    sample_rate=16000,  # sampling_rate
    # Number of spectrogram frames in a partial utterance
    partials_n_frames=160,  # 1600 ms
    # Number of spectrogram frames at inference
    inference_n_frames=80,  # 800 ms
    ## Voice Activation Detection
    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
    # This sets the granularity of the VAD. Should not need to be changed.
    vad_window_length=30,  # In milliseconds
    # Number of frames to average together when performing the moving average smoothing.
    # The larger this value, the larger the VAD variations must be to not get smoothed out.
    vad_moving_average_width=8,
    # Maximum number of consecutive silent frames a segment can have.
    vad_max_silence_length=6,
    ## Audio volume normalization
    audio_norm_target_dBFS=-30,
))
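
# A minimal sketch of custom hyperparameters (assuming only what is shown above:
# Dict2Obj wraps a plain dict); pass the result as preprocess_wav(..., hparams=...).
# preprocess_wav() reads sample_rate, audio_norm_target_dBFS and vad_max_silence_length:
#
#     my_hparams = Dict2Obj(dict(sample_rate=16000,
#                                audio_norm_target_dBFS=-25,
#                                vad_max_silence_length=6))
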
def remove_silence(wav, sr=_sr, max_silence_ms=20):
    """
    Remove silent segments from speech.
    :param wav: waveform as a float numpy array.
    :param sr: sample rate of the input waveform.
    :param max_silence_ms: maximum silence to keep; despite the name, this is
        measured in VAD windows (each window is vad_window_length milliseconds).
    :return: waveform with silence removed, resampled back to sr.
    """
    # Resample to the rate expected by webrtcvad
    wav = librosa.resample(wav, orig_sr=sr, target_sr=_sr)
    vad_window_length = 20  # in milliseconds; webrtcvad accepts 10, 20 or 30
    vad_moving_average_width = 10
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * _sr) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * _int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        # Each sample is 2 bytes in the packed PCM stream
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], sample_rate=_sr))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions so short pauses between them are kept
    audio_mask = binary_dilation(audio_mask, np.ones(max_silence_ms + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    out = wav[audio_mask]
    # Resample back to the caller's sample rate
    out = librosa.resample(out, orig_sr=_sr, target_sr=sr)
    return out
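
# Usage sketch for remove_silence (hypothetical file name "speech.wav";
# requires the optional webrtcvad dependency):
#
#     wav, sr = librosa.load("speech.wav", sr=None)
#     trimmed = remove_silence(wav, sr=sr, max_silence_ms=6)
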
def tune_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """
    Adjust the volume of a waveform.
    :param wav: waveform as a float numpy array.
    :param target_dBFS: target volume in dBFS.
    :param increase_only: only allow the volume to increase.
    :param decrease_only: only allow the volume to decrease.
    :return: waveform scaled to the target volume.
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    rms = np.sqrt(np.mean((wav * _int16_max) ** 2))
    wave_dBFS = 20 * np.log10(rms / _int16_max)
    dBFS_change = target_dBFS - wave_dBFS
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
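
# Worked example: a full-scale sine has RMS 1/sqrt(2), i.e. roughly -3 dBFS, so
# reaching the default target of -30 dBFS means dBFS_change = -27 and a gain of
# 10 ** (-27 / 20) ≈ 0.045 (illustrative values, not from the source):
#
#     t = np.linspace(0, 1, 16000, endpoint=False)
#     quiet = tune_volume(np.sin(2 * np.pi * 440 * t), target_dBFS=-30)
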
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None, hparams=None):
    """
    Preprocess speech: remove silence and normalize the volume.
    :param fpath_or_wav: path to an audio file, or a waveform as a numpy array.
    :param source_sr: sample rate of the waveform, if a numpy array is given.
    :param hparams: hyperparameters; defaults to default_hparams.
    :return: preprocessed waveform.
    """
    hparams = hparams or default_hparams

    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    if source_sr is not None and source_sr != hparams.sample_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=hparams.sample_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    wav = tune_volume(wav, hparams.audio_norm_target_dBFS, increase_only=True)
    wav = trim_long_silences(wav, hparams=hparams)
    return wav
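
# Usage sketch for preprocess_wav (hypothetical path and variable names; hparams
# falls back to default_hparams, so numpy input is resampled to 16 kHz when
# source_sr differs):
#
#     clean = preprocess_wav("recordings/sample.wav")
#     clean = preprocess_wav(raw_wav, source_sr=22050)
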
# Smooth the voice detection flags with a centered moving average
def moving_average(array, width):
    # Zero-pad so the output has the same length as the input
    array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
    # Windowed sums via a cumulative sum, then normalize to an average
    ret = np.cumsum(array_padded, dtype=float)
    ret[width:] = ret[width:] - ret[:-width]
    return ret[width - 1:] / width
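
# For example, moving_average(np.array([0, 0, 1, 1, 1, 0]), 3) gives
# [0, 1/3, 2/3, 1, 2/3, 1/3]: the binary VAD flags become fractional averages
# that np.round() later turns back into a boolean mask.
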
def trim_long_silences(wav, hparams):
    """Remove silence from speech. (Kept for backward compatibility.)"""
    hparams = hparams or default_hparams
    wav = remove_silence(wav,
                         sr=hparams.sample_rate,
                         max_silence_ms=hparams.vad_max_silence_length)
    return wav

if __name__ == "__main__":
    print(__file__)
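    # Smoke-test sketch: a synthetic quiet tone, so no audio file is needed
    # (webrtcvad must be installed; a pure tone may be rejected outright by the
    # VAD, in which case the output is empty).
    demo_t = np.linspace(0, 1, _sr, endpoint=False)
    demo_wav = (0.01 * np.sin(2 * np.pi * 220 * demo_t)).astype(np.float32)
    demo_out = preprocess_wav(demo_wav, source_sr=_sr)
    print("input: {} samples, output: {} samples".format(len(demo_wav), len(demo_out)))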