util.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. # !pip install librosa==0.7.2
  2. # !pip install numba==0.48.0
  3. # !pip install pydub
  4. import pandas as pd
  5. import numpy as np
  6. import re
  7. import jieba
  8. import jieba.posseg as pseg
  9. import urllib
  10. import urllib.request
  11. import librosa
  12. from pydub import AudioSegment
  13. from pydub.silence import split_on_silence
  14. import itertools
  15. def import_data():
  16. drop_id = [14,4806,5024]
  17. word_data = pd.DataFrame()
  18. for i in ['si3-1','si3-2','siw']:
  19. tmp_word_data = pd.read_csv('csv_imtong/{}.csv'.format(i))
  20. gender_list = []
  21. tmp_word_data_class = list(tmp_word_data.分類.unique())
  22. for i in range(len(tmp_word_data_class)):
  23. if i%2 == 0:
  24. gender_list += ['Male']
  25. else:
  26. gender_list += ['Female']
  27. gender_df = pd.DataFrame({'分類':tmp_word_data_class,'gender':gender_list})
  28. tmp_word_data = pd.merge(tmp_word_data,gender_df,on='分類')
  29. word_data = word_data.append(tmp_word_data)
  30. word_data = word_data.reset_index(drop=True)
  31. word_data = word_data.loc[:,['客家語','客語標音','客語音檔','華語詞義','gender']]
  32. # 丟掉有問題音檔
  33. word_data = word_data.drop(drop_id,axis=0).reset_index(drop=True)
  34. # 整理資料,產生華語詞義集
  35. # 以下是要被取代的字眼
  36. repleace_list = ['引申為','...','或形容','…','形容','?','例如:','亦可說','「','」','例如','猶如華語中的','相當於華語的']
  37. ch_word_list = []
  38. for i in word_data.華語詞義:
  39. i_ = i
  40. for repleace_word in repleace_list:
  41. i_ = i_.replace(repleace_word,'')
  42. tmp_ch_word = re.sub(r'[,、;]','/',i_)
  43. if tmp_ch_word.find('。')>=0:
  44. tmp_ch_word = re.sub('\。\S+','',tmp_ch_word).replace('。','')
  45. if tmp_ch_word.find('(')>=0:
  46. tmp_ch_word = re.sub('\(\S+\)','',tmp_ch_word)
  47. ch_word_list.append(tmp_ch_word.split('/'))
  48. word_data['華語詞義集'] = ch_word_list
  49. # 找出重複的音檔
  50. multi_sound = word_data.loc[[i.find('(')>=0 or i.find('【')>=0 or i.find('/')>=0 for i in word_data.客語標音],:]
  51. return word_data,multi_sound
  52. # 下載客語單詞
  53. def download_mp3(word_data,multi_sound):
  54. print('Run download_mp3')
  55. for j in range(len(word_data)):
  56. if j%500==0:
  57. print(j,'/',len(word_data))
  58. urllib.request.urlretrieve(word_data.loc[j,:]['客語音檔'], "mp3/{}.mp3".format(j))
  59. #刪除一些重複地念法到mp3_uni
  60. print('Generate unin mp3')
  61. for i in list(multi_sound.index):
  62. sound = AudioSegment.from_mp3("mp3/{}.mp3".format(i))
  63. loudness = sound.dBFS
  64. chunks = split_on_silence(sound,
  65. # must be silent for at least half a second,沉默半秒
  66. min_silence_len=200,
  67. # consider it silent if quieter than -16 dBFS
  68. silence_thresh=-50,
  69. keep_silence=100
  70. )
  71. if len(chunks)==1:
  72. print(i)
  73. chunks[0].export("mp3_uni/{}.mp3".format(i), format="mp3")
  74. def import_hakka_100():
  75. tmp_word_100 = pd.read_csv('csv_imtong/hakka100.csv')
  76. chinese_clean_list = []
  77. for i in tmp_word_100.chinese:
  78. chinese_clean = i[:-1].replace('。','').replace('?','').replace('!','/').replace(',','/').replace('、','/')
  79. if chinese_clean.find('(')>=0:
  80. chinese_clean = re.sub('\(\S+\)','',chinese_clean)
  81. chinese_clean_list += [chinese_clean.split('/')]
  82. tmp_word_100['chinese_clean'] = chinese_clean_list
  83. hakka_100 = tmp_word_100.explode('chinese_clean').reset_index(drop=True)
  84. return hakka_100
  85. def download_hakka_100(hakka_100):
  86. #下載客語100句
  87. print('Run download_hakka_100')
  88. for m in range(len(hakka_100)):
  89. print(m,'/',len(hakka_100))
  90. urllib.request.urlretrieve(hakka_100.loc[m,:]['url '], "mp3/hakka_100_{}.mp3".format(j))
  91. #刪除一些重複地念法到mp3_uni
  92. print('Generate unin hakka_100 mp3')
  93. j = 0
  94. for i in list(hakka_100.index):
  95. sound = AudioSegment.from_mp3("mp3/hakka_100_{}.mp3".format(i))
  96. loudness = sound.dBFS
  97. chunks = split_on_silence(sound,
  98. # must be silent for at least half a second,沉默半秒
  99. min_silence_len=300,
  100. # consider it silent if quieter than -16 dBFS
  101. silence_thresh=-45,
  102. keep_silence=400
  103. )
  104. if len(hakka_100.loc[i,:]['chinese_clean'])==len(chunks):
  105. for k in chunks:
  106. k.export("mp3_uni/hakka_100_{}.mp3".format(j), format="mp3")
  107. j += 1
  108. else:
  109. # 印出錯誤的音檔
  110. print('Error',i,chunks)
  111. def import_jieba_userdict(ch_word_list, userDict_path='userDict.txt'):
  112. f = open(userDict_path, 'w')
  113. for i in range(len(ch_word_list)):
  114. f.write(ch_word_list[i]+'\n')
  115. f.close()
  116. jieba.load_userdict(userDict_path)
  117. return jieba
  118. def gen_hakka_tts(word_data,multi_sound,hakka_100,ch_sentence,mp3_path='test1.mp3',verbose=0):
  119. Y = []
  120. print(jieba.lcut(ch_sentence))
  121. for word in jieba.lcut(ch_sentence):
  122. tmp_mapping_100 = hakka_100.loc[[word == i for i in hakka_100.chinese_clean],:].head(1)
  123. if tmp_mapping_100.empty:
  124. tmp_mapping = word_data.loc[[word in i for i in word_data.華語詞義集],:].head(1)
  125. if verbose==1:print(tmp_mapping)
  126. if tmp_mapping.empty:
  127. if verbose==1:print('no mach',word)
  128. pass
  129. else:
  130. if tmp_mapping.index[0] in list(multi_sound.index):
  131. y, sr = librosa.load('mp3_uni/{}.mp3'.format(tmp_mapping.index[0]))
  132. if tmp_mapping.gender.values[0] == 'Male':
  133. y = librosa.effects.pitch_shift(y, sr, n_steps=8)
  134. else:
  135. y, sr = librosa.load('mp3/{}.mp3'.format(tmp_mapping.index[0]))
  136. if tmp_mapping.gender.values[0] == 'Male':
  137. y = librosa.effects.pitch_shift(y, sr, n_steps=8)
  138. Y += list(y[abs(y)>0.0005])
  139. else:
  140. y, sr = librosa.load('mp3_uni/hakka_100_{}.mp3'.format(tmp_mapping_100.index[0]))
  141. y = librosa.effects.pitch_shift(y, sr, n_steps=4)
  142. Y += list(y[abs(y)>0.0005])
  143. librosa.output.write_wav(mp3_path, np.array(Y), sr)