hakkaUtil.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. # !pip install librosa==0.7.2
  2. # !pip install numba==0.48.0
  3. # !pip install pydub
  4. # !sudo yum update
  5. # !sudo yum install epel-release
  6. # !sudo rpm --import http://li.nux.ro/download/nux/RPM-GPG-KEY-nux.ro
  7. # !sudo rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-5.el7.nux.noarch.rpm
  8. # !sudo yum install ffmpeg ffmpeg-devel -y
  9. import pandas as pd
  10. import numpy as np
  11. import re
  12. import jieba
  13. import jieba.posseg as pseg
  14. import urllib
  15. import urllib.request
  16. import librosa
  17. from pydub import AudioSegment
  18. from pydub.silence import split_on_silence
  19. import itertools
  20. def import_data():
  21. drop_id = [14,4806,5024]
  22. word_data = pd.DataFrame()
  23. for i in ['si3-1','si3-2','siw']:
  24. tmp_word_data = pd.read_csv('csv_imtong/{}.csv'.format(i))
  25. gender_list = []
  26. tmp_word_data_class = list(tmp_word_data.分類.unique())
  27. for i in range(len(tmp_word_data_class)):
  28. if i%2 == 0:
  29. gender_list += ['Male']
  30. else:
  31. gender_list += ['Female']
  32. gender_df = pd.DataFrame({'分類':tmp_word_data_class,'gender':gender_list})
  33. tmp_word_data = pd.merge(tmp_word_data,gender_df,on='分類')
  34. word_data = word_data.append(tmp_word_data)
  35. word_data = word_data.reset_index(drop=True)
  36. word_data = word_data.loc[:,['客家語','客語標音','客語音檔','華語詞義','gender']]
  37. # 丟掉有問題音檔
  38. word_data = word_data.drop(drop_id,axis=0).reset_index(drop=True)
  39. # 整理資料,產生華語詞義集
  40. # 以下是要被取代的字眼
  41. repleace_list = ['引申為','...','或形容','…','形容','?','例如:','亦可說','「','」','例如','猶如華語中的','相當於華語的']
  42. ch_word_list = []
  43. for i in word_data.華語詞義:
  44. i_ = i
  45. for repleace_word in repleace_list:
  46. i_ = i_.replace(repleace_word,'')
  47. tmp_ch_word = re.sub(r'[,、;]','/',i_)
  48. if tmp_ch_word.find('。')>=0:
  49. tmp_ch_word = re.sub('\。\S+','',tmp_ch_word).replace('。','')
  50. if tmp_ch_word.find('(')>=0:
  51. tmp_ch_word = re.sub('\(\S+\)','',tmp_ch_word)
  52. ch_word_list.append(tmp_ch_word.split('/'))
  53. word_data['華語詞義集'] = ch_word_list
  54. # 找出重複的音檔
  55. multi_sound = word_data.loc[[i.find('(')>=0 or i.find('【')>=0 or i.find('/')>=0 for i in word_data.客語標音],:]
  56. return word_data,multi_sound
  57. # 下載客語單詞
  58. def download_mp3(word_data,multi_sound):
  59. print('Run download_mp3')
  60. for j in range(len(word_data)):
  61. if j%500==0:
  62. print(j,'/',len(word_data))
  63. urllib.request.urlretrieve(word_data.loc[j,:]['客語音檔'], "mp3/{}.mp3".format(j))
  64. #刪除一些重複地念法到mp3_uni
  65. print('Generate unin mp3')
  66. for i in list(multi_sound.index):
  67. sound = AudioSegment.from_mp3("mp3/{}.mp3".format(i))
  68. loudness = sound.dBFS
  69. chunks = split_on_silence(sound,
  70. # must be silent for at least half a second,沉默半秒
  71. min_silence_len=200,
  72. # consider it silent if quieter than -16 dBFS
  73. silence_thresh=-50,
  74. keep_silence=100
  75. )
  76. if len(chunks)==1:
  77. print(i)
  78. chunks[0].export("mp3_uni/{}.mp3".format(i), format="mp3")
  79. def import_hakka_100():
  80. tmp_word_100 = pd.read_csv('csv_imtong/hakka100.csv')
  81. chinese_clean_list = []
  82. for i in tmp_word_100.chinese:
  83. chinese_clean = i[:-1].replace('。','').replace('?','').replace('!','/').replace(',','/').replace('、','/')
  84. if chinese_clean.find('(')>=0:
  85. chinese_clean = re.sub('\(\S+\)','',chinese_clean)
  86. chinese_clean_list += [chinese_clean.split('/')]
  87. tmp_word_100['chinese_clean'] = chinese_clean_list
  88. hakka_100 = tmp_word_100.explode('chinese_clean').reset_index(drop=True)
  89. return hakka_100
  90. def download_hakka_100(hakka_100):
  91. #下載客語100句
  92. print('Run download_hakka_100')
  93. for m in range(len(hakka_100)):
  94. print(m,'/',len(hakka_100))
  95. urllib.request.urlretrieve(hakka_100.loc[m,:]['url '], "mp3/hakka_100_{}.mp3".format(j))
  96. #刪除一些重複地念法到mp3_uni
  97. print('Generate unin hakka_100 mp3')
  98. j = 0
  99. for i in list(hakka_100.index):
  100. sound = AudioSegment.from_mp3("mp3/hakka_100_{}.mp3".format(i))
  101. loudness = sound.dBFS
  102. chunks = split_on_silence(sound,
  103. # must be silent for at least half a second,沉默半秒
  104. min_silence_len=300,
  105. # consider it silent if quieter than -16 dBFS
  106. silence_thresh=-45,
  107. keep_silence=400
  108. )
  109. if len(hakka_100.loc[i,:]['chinese_clean'])==len(chunks):
  110. for k in chunks:
  111. k.export("mp3_uni/hakka_100_{}.mp3".format(j), format="mp3")
  112. j += 1
  113. else:
  114. # 印出錯誤的音檔
  115. print('Error',i,chunks)
  116. def import_jieba_userdict(ch_word_list, userDict_path='userDict.txt'):
  117. f = open(userDict_path, 'w',encoding="utf-8")
  118. for i in range(len(ch_word_list)):
  119. f.write(ch_word_list[i]+'\n')
  120. f.close()
  121. jieba.load_userdict(userDict_path)
  122. return jieba
  123. def gen_hakka_tts(word_data,multi_sound,hakka_100,ch_sentence,gender,mp3_path='test1.mp3',verbose=0):
  124. Y = []
  125. pitch_step = 0
  126. if gender == 0:
  127. pitch_step=8
  128. else:
  129. pitch_step=0
  130. print(jieba.lcut(ch_sentence))
  131. for word in jieba.lcut(ch_sentence):
  132. tmp_mapping_100 = hakka_100.loc[[word == i for i in hakka_100.chinese_clean],:].head(1)
  133. if tmp_mapping_100.empty:
  134. tmp_mapping = word_data.loc[[word in i for i in word_data.華語詞義集],:].head(1)
  135. if verbose==1:print(tmp_mapping)
  136. if tmp_mapping.empty:
  137. if verbose==1:print('no mach',word)
  138. pass
  139. else:
  140. if tmp_mapping.index[0] in list(multi_sound.index):
  141. y, sr = librosa.load('mp3_uni/{}.mp3'.format(tmp_mapping.index[0]))
  142. if tmp_mapping.gender.values[0] == 'Male':
  143. y = librosa.effects.pitch_shift(y, sr, n_steps=pitch_step)
  144. else:
  145. y, sr = librosa.load('mp3/{}.mp3'.format(tmp_mapping.index[0]))
  146. if tmp_mapping.gender.values[0] == 'Male':
  147. y = librosa.effects.pitch_shift(y, sr, n_steps=pitch_step)
  148. Y += list(y[abs(y)>0.0005])
  149. else:
  150. y, sr = librosa.load('mp3_uni/hakka_100_{}.mp3'.format(tmp_mapping_100.index[0]))
  151. y = librosa.effects.pitch_shift(y, sr, n_steps=pitch_step)
  152. Y += list(y[abs(y)>0.0005])
  153. librosa.output.write_wav(mp3_path, np.array(Y), sr)