|
@@ -0,0 +1,161 @@
|
|
|
+# !pip install librosa==0.7.2
|
|
|
+# !pip install numba==0.48.0
|
|
|
+# !pip install pydub
|
|
|
+# !sudo yum update
|
|
|
+# !sudo yum install epel-release
|
|
|
+# !sudo rpm --import http://li.nux.ro/download/nux/RPM-GPG-KEY-nux.ro
|
|
|
+# !sudo rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-5.el7.nux.noarch.rpm
|
|
|
+# !sudo yum install ffmpeg ffmpeg-devel -y
|
|
|
+
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+import re
|
|
|
+import jieba
|
|
|
+import jieba.posseg as pseg
|
|
|
+import urllib
|
|
|
+import urllib.request
|
|
|
+import librosa
|
|
|
+from pydub import AudioSegment
|
|
|
+from pydub.silence import split_on_silence
|
|
|
+import itertools
|
|
|
def import_data(csv_dir='csv_imtong', drop_id=None):
    """Load the Hakka word tables and derive a Mandarin meaning list per row.

    Reads ``si3-1.csv``, ``si3-2.csv`` and ``siw.csv`` from *csv_dir*, tags
    every row with a ``gender`` label, stacks the three tables, removes the
    rows listed in *drop_id* and adds a ``華語詞義集`` column holding the
    cleaned Mandarin meanings split into a list.

    Args:
        csv_dir: Directory containing the three CSV files.
        drop_id: Row labels (positions in the stacked table) to discard
            because their audio is known to be bad. Defaults to
            ``[14, 4806, 5024]``.

    Returns:
        A ``(word_data, multi_sound)`` tuple. ``word_data`` has the columns
        客家語, 客語標音, 客語音檔, 華語詞義, gender and 華語詞義集.
        ``multi_sound`` is the subset of ``word_data`` whose 客語標音 contains
        ``(``, ``【`` or ``/``.

    Raises:
        KeyError: If a label in *drop_id* is not present in the stacked table.
    """
    if drop_id is None:
        drop_id = [14, 4806, 5024]

    frames = []
    for csv_name in ['si3-1', 'si3-2', 'siw']:
        table = pd.read_csv('{}/{}.csv'.format(csv_dir, csv_name))
        # Gender is assigned by the position of each category among the
        # unique values of 分類: even positions -> Male, odd -> Female.
        # NOTE(review): presumably the categories alternate between a male
        # and a female speaker in the source data -- confirm.
        classes = list(table['分類'].unique())
        genders = ['Male' if pos % 2 == 0 else 'Female'
                   for pos in range(len(classes))]
        gender_df = pd.DataFrame({'分類': classes, 'gender': genders})
        frames.append(pd.merge(table, gender_df, on='分類'))

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    word_data = pd.concat(frames).reset_index(drop=True)
    word_data = word_data.loc[:, ['客家語', '客語標音', '客語音檔', '華語詞義', 'gender']]

    # Drop the rows whose audio files are known to be problematic.
    word_data = word_data.drop(drop_id, axis=0).reset_index(drop=True)

    # Build the Mandarin meaning lists.
    # Filler phrases removed from every meaning; order matters because longer
    # phrases must be removed before their substrings (e.g. '或形容' before
    # '形容').
    replace_list = ['引申為', '...', '或形容', '…', '形容', '?', '例如:', '亦可說',
                    '「', '」', '例如', '猶如華語中的', '相當於華語的']
    ch_word_list = []
    for meaning in word_data['華語詞義']:
        cleaned = meaning
        for filler in replace_list:
            cleaned = cleaned.replace(filler, '')
        # Every list separator becomes '/', which is the final split token.
        cleaned = re.sub(r'[,、;]', '/', cleaned)
        if '。' in cleaned:
            # Keep only the text before the first full stop that is followed
            # by more text, then remove any remaining full stops.
            cleaned = re.sub(r'。\S+', '', cleaned).replace('。', '')
        if '(' in cleaned:
            # Remove parenthesised remarks.
            cleaned = re.sub(r'\(\S+\)', '', cleaned)
        ch_word_list.append(cleaned.split('/'))
    word_data['華語詞義集'] = ch_word_list

    # Rows whose romanisation lists more than one reading.
    has_multiple = [('(' in reading) or ('【' in reading) or ('/' in reading)
                    for reading in word_data['客語標音']]
    multi_sound = word_data.loc[has_multiple, :]
    return word_data, multi_sound
|
|
|
+
|
|
|
+# 下載客語單詞
|
|
|
def download_mp3(word_data, multi_sound):
    """Download every word recording and trim multi-reading recordings.

    Each URL in the 客語音檔 column of *word_data* is saved as
    ``mp3/<row label>.mp3``. For every row of *multi_sound* the downloaded
    file is then split on silence and only the first chunk is written to
    ``mp3_uni/<row label>.mp3``.

    Args:
        word_data: Table with a 客語音檔 column of URLs, labelled
            ``0 .. len(word_data) - 1``.
        multi_sound: Subset of *word_data* whose recordings contain more
            than one reading.

    The ``mp3`` and ``mp3_uni`` directories must already exist.
    """
    print('Run download_mp3')
    total = len(word_data)
    for j in range(total):
        if j % 500 == 0:
            # Progress report every 500 files.
            print(j, '/', total)
        urllib.request.urlretrieve(word_data.loc[j, '客語音檔'],
                                   "mp3/{}.mp3".format(j))

    # Keep only the first reading of recordings that contain several.
    print('Generate unin mp3')
    for i in multi_sound.index:
        sound = AudioSegment.from_mp3("mp3/{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            # A pause must last at least 200 ms to count as a split point.
            min_silence_len=200,
            # Anything quieter than -50 dBFS is treated as silence.
            silence_thresh=-50,
            # Keep 100 ms of silence around each chunk.
            keep_silence=100,
        )
        if not chunks:
            # Nothing louder than the threshold: report instead of crashing
            # on chunks[0].
            print('Error', i, chunks)
            continue
        if len(chunks) == 1:
            # Report recordings that could not be split into several readings.
            print(i)
        chunks[0].export("mp3_uni/{}.mp3".format(i), format="mp3")
|
|
|
+
|
|
|
def import_hakka_100(csv_path='csv_imtong/hakka100.csv'):
    """Load the Hakka sentence table and split sentences into phrases.

    The last character of every ``chinese`` entry is dropped, ``。`` and
    ``?`` are removed, ``!``, ``,`` and ``、`` become phrase separators and
    parenthesised remarks are deleted. The resulting phrases are exploded so
    that the returned table has one row per phrase.

    Args:
        csv_path: Location of the CSV file; it must have a ``chinese``
            column.

    Returns:
        The table with an added ``chinese_clean`` column (one phrase per
        row) and a fresh ``0 .. n-1`` index. The other columns are repeated
        for every phrase of the same sentence.
    """
    tmp_word_100 = pd.read_csv(csv_path)
    chinese_clean_list = []
    for sentence in tmp_word_100['chinese']:
        # NOTE(review): the final character is dropped unconditionally --
        # presumably it is always sentence-ending punctuation; confirm.
        cleaned = sentence[:-1]
        for mark in ('。', '?'):
            cleaned = cleaned.replace(mark, '')
        for separator in ('!', ',', '、'):
            cleaned = cleaned.replace(separator, '/')
        if '(' in cleaned:
            # Remove parenthesised remarks.
            cleaned = re.sub(r'\(\S+\)', '', cleaned)
        chinese_clean_list.append(cleaned.split('/'))
    tmp_word_100['chinese_clean'] = chinese_clean_list
    return tmp_word_100.explode('chinese_clean').reset_index(drop=True)
|
|
|
+
|
|
|
def download_hakka_100(hakka_100):
    """Download the Hakka sentence recordings and split them into phrases.

    Every URL in the ``'url '`` column (the column name ends with a space)
    is saved as ``mp3/hakka_100_<row label>.mp3``. Each file is then split on
    silence; when the number of chunks equals ``len()`` of the row's
    ``chinese_clean`` value, the chunks are written one by one to
    ``mp3_uni/hakka_100_<n>.mp3`` with a running counter ``n``. Otherwise the
    row is reported and nothing is written for it.

    Args:
        hakka_100: Table labelled ``0 .. len(hakka_100) - 1`` with ``'url '``
            and ``chinese_clean`` columns.

    The ``mp3`` and ``mp3_uni`` directories must already exist.
    """
    # Download the Hakka sentences.
    print('Run download_hakka_100')
    total = len(hakka_100)
    for m in range(total):
        print(m, '/', total)
        # The file is named after the row label m; the second loop below
        # reads it back under that same label.
        urllib.request.urlretrieve(hakka_100.loc[m, 'url '],
                                   "mp3/hakka_100_{}.mp3".format(m))

    # Split every sentence recording into its phrases.
    print('Generate unin hakka_100 mp3')
    j = 0
    for i in hakka_100.index:
        sound = AudioSegment.from_mp3("mp3/hakka_100_{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            # A pause must last at least 300 ms to count as a split point.
            min_silence_len=300,
            # Anything quieter than -45 dBFS is treated as silence.
            silence_thresh=-45,
            # Keep 400 ms of silence around each chunk.
            keep_silence=400,
        )
        # NOTE(review): len() counts phrases only if chinese_clean holds a
        # list; for an exploded table it is a string and len() counts
        # characters -- confirm which form callers pass in.
        if len(hakka_100.loc[i, 'chinese_clean']) == len(chunks):
            for chunk in chunks:
                chunk.export("mp3_uni/hakka_100_{}.mp3".format(j), format="mp3")
                j += 1
        else:
            # Report the recording that could not be split as expected.
            # NOTE(review): j is not advanced here, so later output files are
            # numbered consecutively without a gap -- confirm this is intended.
            print('Error', i, chunks)
|
|
|
+
|
|
|
def import_jieba_userdict(ch_word_list, userDict_path='userDict.txt'):
    """Write a jieba user dictionary and load it into jieba.

    Args:
        ch_word_list: Iterable of words (strings); each one is written on
            its own line.
        userDict_path: File to (over)write with the dictionary, encoded as
            UTF-8.

    Returns:
        The ``jieba`` module, after the dictionary has been loaded.
    """
    # The context manager closes the file even if a write fails.
    with open(userDict_path, 'w', encoding="utf-8") as dict_file:
        dict_file.writelines(word + '\n' for word in ch_word_list)
    jieba.load_userdict(userDict_path)
    return jieba
|
|
|
+
|
|
|
def gen_hakka_tts(word_data, multi_sound, hakka_100, ch_sentence, gender, mp3_path='test1.mp3', verbose=0):
    """Assemble Hakka speech for a Mandarin sentence from stored recordings.

    The sentence is segmented with jieba. For every segment the first exact
    match in ``hakka_100.chinese_clean`` is used; otherwise the first row of
    *word_data* whose 華語詞義集 contains the segment is used. Segments with
    no match are skipped. Samples with an absolute amplitude of at most
    0.0005 are removed from each recording before the pieces are joined and
    written with ``librosa.output.write_wav``.

    Args:
        word_data: Word table with 華語詞義集 and gender columns; its row
            labels name the files in ``mp3/`` and ``mp3_uni/``.
        multi_sound: Rows of *word_data* whose trimmed recording lives in
            ``mp3_uni/`` instead of ``mp3/``.
        hakka_100: Phrase table with a ``chinese_clean`` column; its row
            labels name the files ``mp3_uni/hakka_100_<label>.mp3``.
        ch_sentence: Mandarin sentence to convert.
        gender: ``0`` raises the pitch by 8 steps; any other value applies
            a shift of 0 steps.
        mp3_path: Output file. NOTE: the data is written by ``write_wav``
            regardless of the file extension.
        verbose: ``1`` prints the matched row and unmatched segments.

    Raises:
        ValueError: If no segment of the sentence matched any recording.
    """
    pitch_step = 8 if gender == 0 else 0

    words = jieba.lcut(ch_sentence)
    print(words)

    pieces = []
    sr = None
    for word in words:
        tmp_mapping_100 = hakka_100.loc[[word == i for i in hakka_100.chinese_clean], :].head(1)
        if not tmp_mapping_100.empty:
            # Phrase recordings are always pitch-shifted, whatever the speaker.
            y, sr = librosa.load('mp3_uni/hakka_100_{}.mp3'.format(tmp_mapping_100.index[0]))
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_step)
        else:
            tmp_mapping = word_data.loc[[word in i for i in word_data.華語詞義集], :].head(1)
            if verbose == 1:
                print(tmp_mapping)
            if tmp_mapping.empty:
                if verbose == 1:
                    print('no mach', word)
                continue
            label = tmp_mapping.index[0]
            # Multi-reading words use the trimmed copy in mp3_uni/.
            folder = 'mp3_uni' if label in multi_sound.index else 'mp3'
            y, sr = librosa.load('{}/{}.mp3'.format(folder, label))
            # Only recordings labelled 'Male' are pitch-shifted.
            if tmp_mapping.gender.values[0] == 'Male':
                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_step)
        # Drop near-silent samples so the joined words run together.
        pieces.append(y[abs(y) > 0.0005])

    if not pieces:
        # Without any recording there is no sample rate to write with.
        raise ValueError('no recording matched any word of: {}'.format(ch_sentence))

    # NOTE(review): librosa.output exists in the librosa 0.7.2 release pinned
    # by the install notes at the top of this file -- confirm before upgrading.
    librosa.output.write_wav(mp3_path, np.concatenate(pieces), sr)
|