audio_editor.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. # author: kdd
  2. # date:
  3. """
  4. ### audio_editor
  5. 语音编辑,切分音频,去除语音中的较长静音,去除语音首尾静音,设置采样率,设置通道数。
  6. 音频格式相互转换,例如wav格式转为mp3格式。
  7. 切分音频,去除静音,去除首尾静音输入输出都支持wav格式。
  8. 语音编辑功能基于pydub的方法,增加了数据格式支持。
  9. """
from pathlib import Path
import logging
# Configure the root logger when this module is imported so INFO records are
# emitted. basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)
# NOTE(review): Path(...).stem drops everything after the last dot, so for a
# dotted module name such as "pkg.audio_editor" the logger is named "pkg".
logger = logging.getLogger(Path(__name__).stem)
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from .audio_io import anything2bytesio, _sr, _int16_max
import numpy as np
import io
  19. def convert_channels(wav, sr=_sr, value=1):
  20. aud = wav2audiosegment(wav, sr=sr)
  21. aud = aud.set_channels(channels=value)
  22. wav = audiosegment2wav(aud)
  23. return wav
  24. def convert_sample_rate(wav, sr=_sr, value=_sr):
  25. aud = wav2audiosegment(wav, sr=sr)
  26. aud = aud.set_frame_rate(frame_rate=value)
  27. wav = audiosegment2wav(aud)
  28. return wav
  29. def convert_sample_width(wav, sr=_sr, value=4):
  30. aud = wav2audiosegment(wav, sr=sr)
  31. aud = aud.set_sample_width(sample_width=value)
  32. wav = audiosegment2wav(aud)
  33. return wav
  34. def convert_format(wav, sr=_sr, format='mp3'):
  35. """
  36. 语音信号转为指定音频格式的bytes。
  37. :param wav:
  38. :param sr:
  39. :param format:
  40. :return:
  41. """
  42. aud = wav2audiosegment(wav, sr=sr)
  43. out = io.BytesIO()
  44. aud.export(out, format=format)
  45. return out.getvalue()
  46. def convert_format_os(inpath, outpath, out_format='mp3', in_format=None):
  47. """
  48. 音频格式转换。
  49. :param inpath:
  50. :param outpath:
  51. :param in_format:
  52. :param out_format:
  53. :return:
  54. """
  55. src = AudioSegment.from_file(inpath, format=in_format)
  56. src.export(outpath, format=out_format)
  57. def audiosegment2wav(data: AudioSegment):
  58. """
  59. pydub.AudioSegment格式转为音频信号wav。
  60. :param data:
  61. :return:
  62. """
  63. wav = np.array(data.get_array_of_samples()) / _int16_max
  64. return wav
  65. def wav2audiosegment(wav: np.ndarray, sr):
  66. """
  67. 音频信号wav转为pydub.AudioSegment格式。
  68. :param wav:
  69. :param sr:
  70. :return:
  71. """
  72. tmp = anything2bytesio(wav, sr=sr)
  73. out = AudioSegment.from_wav(tmp)
  74. return out
  75. def strip_silence_wave(wav: np.ndarray, sr=_sr, keep_silence_len=20, min_silence_len=100, silence_thresh=-32, **kwargs):
  76. """
  77. 去除语音前后静音。
  78. :param wav:
  79. :param sr:
  80. :param keep_silence_len:
  81. :param min_silence_len:
  82. :param silence_thresh:
  83. :param kwargs:
  84. :return:
  85. """
  86. data = wav2audiosegment(wav, sr=sr)
  87. out = strip_audio(data,
  88. keep_silence_len=keep_silence_len,
  89. min_silence_len=min_silence_len,
  90. silence_thresh=silence_thresh,
  91. **kwargs)
  92. out = audiosegment2wav(out)
  93. return out
  94. def strip_audio(data: AudioSegment, keep_silence_len=20, min_silence_len=100, silence_thresh=-32, **kwargs):
  95. nsils = detect_nonsilent(data, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
  96. if len(nsils) >= 1:
  97. return data[max(0, nsils[0][0] - keep_silence_len): min(len(data), nsils[-1][1] + keep_silence_len)]
  98. else:
  99. return AudioSegment.empty()
  100. def strip_audio_os(inpath, outpath, **kwargs):
  101. try:
  102. data = AudioSegment.from_file(inpath, kwargs.get('format', 'wav'))
  103. out = strip_audio(data, **kwargs)
  104. out.export(outpath, kwargs.get('format', 'wav'))
  105. except Exception as e:
  106. logger.info('Error path: {}'.format(inpath))
  107. logger.info('Error info: {}'.format(e))
  108. def split_silence_wave(wav, sr=_sr, keep_silence_len=20, min_silence_len=100, silence_thresh=-32, **kwargs):
  109. """
  110. 根据静音切分音频。
  111. :param wav:
  112. :param sr:
  113. :param keep_silence_len:
  114. :param min_silence_len:
  115. :param silence_thresh:
  116. :param kwargs:
  117. :return:
  118. """
  119. data = wav2audiosegment(wav, sr=sr)
  120. outs = split_audio(data,
  121. keep_silence_len=keep_silence_len,
  122. min_silence_len=min_silence_len,
  123. silence_thresh=silence_thresh,
  124. **kwargs)
  125. out_wavs = []
  126. for out in outs:
  127. wav = audiosegment2wav(out)
  128. out_wavs.append(wav)
  129. return out_wavs
  130. def split_audio(data: AudioSegment, keep_silence_len=20, min_silence_len=100, silence_thresh=-32, **kwargs):
  131. nsils = detect_nonsilent(data, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
  132. if len(nsils) >= 1:
  133. outs = []
  134. for ab in nsils:
  135. out = data[max(0, ab[0] - keep_silence_len): min(len(data), ab[1] + keep_silence_len)]
  136. outs.append(out)
  137. else:
  138. outs = [AudioSegment.empty()]
  139. return outs
  140. def remove_silence_wave(wav, sr=_sr, keep_silence_len=20, min_silence_len=100, silence_thresh=-32, **kwargs):
  141. """
  142. 去除音频中的静音段。
  143. :param wav:
  144. :param sr:
  145. :param keep_silence_len:
  146. :param min_silence_len:
  147. :param silence_thresh:
  148. :param kwargs:
  149. :return:
  150. """
  151. data = wav2audiosegment(wav, sr=sr)
  152. out = remove_silence_audio(data,
  153. keep_silence_len=keep_silence_len,
  154. min_silence_len=min_silence_len,
  155. silence_thresh=silence_thresh,
  156. **kwargs)
  157. out = audiosegment2wav(out)
  158. return out
  159. def remove_silence_audio(data: AudioSegment, keep_silence_len=20, min_silence_len=100, silence_thresh=-32, **kwargs):
  160. nsils = detect_nonsilent(data, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
  161. out = AudioSegment.empty()
  162. sf = 0
  163. for i, ab in enumerate(nsils):
  164. si = max(ab[0] - keep_silence_len, sf)
  165. ei = ab[1] + keep_silence_len
  166. out = out + data[si: ei]
  167. sf = ei
  168. return out
# Script entry point: running this module directly only prints its file path.
if __name__ == "__main__":
    print(__file__)