# !pip install librosa==0.7.2
# !pip install numba==0.48.0
# !pip install pydub
# !sudo yum update
# !sudo yum install epel-release
# !sudo rpm --import http://li.nux.ro/download/nux/RPM-GPG-KEY-nux.ro
# !sudo rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-5.el7.nux.noarch.rpm
# !sudo yum install ffmpeg ffmpeg-devel -y
import pandas as pd
import numpy as np
import re
import jieba
import jieba.posseg as pseg
import urllib
import urllib.request
import librosa
from pydub import AudioSegment
from pydub.silence import split_on_silence
import itertools


def import_data():
    """Load the Hakka word CSVs, attach speaker gender, clean the Mandarin
    glosses into synonym lists, and flag entries whose audio file contains
    multiple pronunciations.

    Returns:
        (word_data, multi_sound): the full cleaned DataFrame, and the subset
        of rows whose phonetic field ('客語標音') shows alternative readings
        — those rows share one audio file holding several pronunciations.
    """
    drop_id = [14, 4806, 5024]  # rows with known-bad audio files (dropped below)
    word_data = pd.DataFrame()
    for csv_name in ['si3-1', 'si3-2', 'siw']:
        tmp_word_data = pd.read_csv('csv_imtong/{}.csv'.format(csv_name))
        # Categories ('分類') alternate Male/Female speakers in the source data.
        tmp_word_data_class = list(tmp_word_data.分類.unique())
        gender_list = []
        for class_idx in range(len(tmp_word_data_class)):
            gender_list.append('Male' if class_idx % 2 == 0 else 'Female')
        gender_df = pd.DataFrame({'分類': tmp_word_data_class, 'gender': gender_list})
        tmp_word_data = pd.merge(tmp_word_data, gender_df, on='分類')
        # pd.concat replaces the deprecated (removed in pandas 2.0) DataFrame.append.
        word_data = pd.concat([word_data, tmp_word_data])
    word_data = word_data.reset_index(drop=True)
    word_data = word_data.loc[:, ['客家語', '客語標音', '客語音檔', '華語詞義', 'gender']]
    # Drop the rows whose audio files are broken.
    word_data = word_data.drop(drop_id, axis=0).reset_index(drop=True)
    # Build the Mandarin-meaning sets: strip boilerplate phrases, then split
    # on Chinese punctuation into a list of synonyms per entry.
    repleace_list = ['引申為', '...', '或形容', '…', '形容', '?', '例如:', '亦可說',
                     '「', '」', '例如', '猶如華語中的', '相當於華語的']
    ch_word_list = []
    for meaning in word_data.華語詞義:
        cleaned = meaning
        for repleace_word in repleace_list:
            cleaned = cleaned.replace(repleace_word, '')
        tmp_ch_word = re.sub(r'[,、;]', '/', cleaned)
        if tmp_ch_word.find('。') >= 0:
            # Keep only the text before the first full stop.
            tmp_ch_word = re.sub(r'。\S+', '', tmp_ch_word).replace('。', '')
        if tmp_ch_word.find('(') >= 0:
            tmp_ch_word = re.sub(r'\(\S+\)', '', tmp_ch_word)
        ch_word_list.append(tmp_ch_word.split('/'))
    word_data['華語詞義集'] = ch_word_list
    # Flag entries whose pronunciation audio holds more than one reading.
    multi_sound = word_data.loc[[i.find('(') >= 0 or i.find('【') >= 0 or i.find('/') >= 0
                                 for i in word_data.客語標音], :]
    return word_data, multi_sound


# Download the Hakka word recordings.
def download_mp3(word_data, multi_sound):
    """Download every word's pronunciation mp3 into mp3/, then re-export a
    de-duplicated copy into mp3_uni/ for entries whose file holds several
    readings but splits into a single non-silent chunk."""
    print('Run download_mp3')
    for j in range(len(word_data)):
        if j % 500 == 0:
            print(j, '/', len(word_data))
        urllib.request.urlretrieve(word_data.loc[j, :]['客語音檔'],
                                   "mp3/{}.mp3".format(j))
    # Re-export single-chunk multi-reading files into mp3_uni.
    print('Generate unin mp3')
    for i in list(multi_sound.index):
        sound = AudioSegment.from_mp3("mp3/{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            min_silence_len=200,  # must be silent for at least 0.2 s
            silence_thresh=-50,   # quieter than -50 dBFS counts as silence
            keep_silence=100,
        )
        if len(chunks) == 1:
            print(i)
            chunks[0].export("mp3_uni/{}.mp3".format(i), format="mp3")


def import_hakka_100():
    """Load the Hakka-100 sentence CSV, clean each Mandarin sentence into a
    list of phrases, and explode to one row per phrase.

    Returns:
        DataFrame with a 'chinese_clean' column holding one phrase per row.
    """
    tmp_word_100 = pd.read_csv('csv_imtong/hakka100.csv')
    chinese_clean_list = []
    for sentence in tmp_word_100.chinese:
        # Drop the trailing character, remove sentence-final punctuation and
        # turn phrase separators into '/'.
        chinese_clean = (sentence[:-1].replace('。', '').replace('?', '')
                         .replace('!', '/').replace(',', '/').replace('、', '/'))
        if chinese_clean.find('(') >= 0:
            chinese_clean = re.sub(r'\(\S+\)', '', chinese_clean)
        chinese_clean_list += [chinese_clean.split('/')]
    tmp_word_100['chinese_clean'] = chinese_clean_list
    hakka_100 = tmp_word_100.explode('chinese_clean').reset_index(drop=True)
    return hakka_100


def download_hakka_100(hakka_100):
    """Download the Hakka-100 sentence recordings, then split each recording
    on silence and export one mp3 per phrase into mp3_uni/."""
    # Download the Hakka 100 sentences.
    print('Run download_hakka_100')
    for m in range(len(hakka_100)):
        print(m, '/', len(hakka_100))
        # BUG FIX: the original indexed the output file with `j`, which is
        # undefined at this point (it is initialised only below) — every call
        # raised NameError. The file index must follow the row index `m`.
        urllib.request.urlretrieve(hakka_100.loc[m, :]['url '],
                                   "mp3/hakka_100_{}.mp3".format(m))
    # Split each recording into phrase chunks and export them to mp3_uni.
    print('Generate unin hakka_100 mp3')
    j = 0
    for i in list(hakka_100.index):
        sound = AudioSegment.from_mp3("mp3/hakka_100_{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            min_silence_len=300,  # must be silent for at least 0.3 s
            silence_thresh=-45,   # quieter than -45 dBFS counts as silence
            keep_silence=400,
        )
        # NOTE(review): after explode() 'chinese_clean' is a string, so this
        # compares its character count with the chunk count — looks like this
        # function expects the pre-exploded frame; confirm against caller.
        if len(hakka_100.loc[i, :]['chinese_clean']) == len(chunks):
            for chunk in chunks:
                chunk.export("mp3_uni/hakka_100_{}.mp3".format(j), format="mp3")
                j += 1
        else:
            # Report recordings whose chunk count does not match.
            print('Error', i, chunks)


def import_jieba_userdict(ch_word_list, userDict_path='userDict.txt'):
    """Write the vocabulary to a jieba user-dictionary file and load it so
    segmentation prefers these words.

    Args:
        ch_word_list: iterable of words, one per dictionary line.
        userDict_path: path of the dictionary file to (over)write.

    Returns:
        The configured jieba module.
    """
    # `with` guarantees the file is closed even if a write fails.
    with open(userDict_path, 'w') as f:
        for word in ch_word_list:
            f.write(word + '\n')
    jieba.load_userdict(userDict_path)
    return jieba


def gen_hakka_tts(word_data, multi_sound, hakka_100, ch_sentence,
                  mp3_path='test1.mp3', verbose=0):
    """Segment `ch_sentence` with jieba, look each token up first in the
    Hakka-100 phrases then in the word table, concatenate the matching audio
    (pitch-shifted so voices roughly match) and write a wav to `mp3_path`.

    Raises:
        ValueError: if no token of the sentence matched any audio (the
            original code crashed with NameError on unbound `sr` here).
    """
    Y = []
    sr = None
    print(jieba.lcut(ch_sentence))
    for word in jieba.lcut(ch_sentence):
        # Exact phrase match against the Hakka-100 sentences first.
        tmp_mapping_100 = hakka_100.loc[[word == i for i in hakka_100.chinese_clean], :].head(1)
        if tmp_mapping_100.empty:
            # Fall back to the word table: token contained in a synonym set.
            tmp_mapping = word_data.loc[[word in i for i in word_data.華語詞義集], :].head(1)
            if verbose == 1:
                print(tmp_mapping)
            if tmp_mapping.empty:
                if verbose == 1:
                    print('no match', word)
            else:
                row_idx = tmp_mapping.index[0]
                # Multi-reading entries use the de-duplicated mp3_uni copy.
                if row_idx in list(multi_sound.index):
                    y, sr = librosa.load('mp3_uni/{}.mp3'.format(row_idx))
                else:
                    y, sr = librosa.load('mp3/{}.mp3'.format(row_idx))
                # Shift male voices up so all tokens sound alike.
                if tmp_mapping.gender.values[0] == 'Male':
                    y = librosa.effects.pitch_shift(y, sr, n_steps=8)
                Y += list(y[abs(y) > 0.0005])  # drop near-silent samples
        else:
            y, sr = librosa.load('mp3_uni/hakka_100_{}.mp3'.format(tmp_mapping_100.index[0]))
            y = librosa.effects.pitch_shift(y, sr, n_steps=4)
            Y += list(y[abs(y) > 0.0005])  # drop near-silent samples
    if sr is None:
        # Nothing matched: fail explicitly instead of NameError on `sr`.
        raise ValueError('no audio matched ch_sentence: {}'.format(ch_sentence))
    librosa.output.write_wav(mp3_path, np.array(Y), sr)