# !pip install librosa==0.7.2
# !pip install numba==0.48.0
# !pip install pydub
import pandas as pd
import numpy as np
import re
import jieba
import jieba.posseg as pseg
import urllib
import urllib.request
import librosa
from pydub import AudioSegment
from pydub.silence import split_on_silence
import itertools


def import_data():
    """Load the Hakka word CSVs, tag speaker gender, and clean the glosses.

    Reads ``csv_imtong/si3-1.csv``, ``si3-2.csv`` and ``siw.csv``, drops
    rows with known-broken audio, and builds a ``華語詞義集`` column holding
    the list of Mandarin gloss candidates for each word.

    Returns:
        tuple: (word_data, multi_sound) where ``multi_sound`` is the subset
        of rows whose pronunciation field suggests multiple readings in one
        audio file (contains '(', '【' or '/').
    """
    drop_id = [14, 4806, 5024]  # rows whose audio files are known to be broken
    word_data = pd.DataFrame()
    for csv_name in ['si3-1', 'si3-2', 'siw']:
        tmp_word_data = pd.read_csv('csv_imtong/{}.csv'.format(csv_name))
        # Categories alternate Male/Female speaker in file order —
        # TODO(review): confirm this alternation holds for all three CSVs.
        tmp_word_data_class = list(tmp_word_data.分類.unique())
        gender_list = ['Male' if k % 2 == 0 else 'Female'
                       for k in range(len(tmp_word_data_class))]
        gender_df = pd.DataFrame({'分類': tmp_word_data_class,
                                  'gender': gender_list})
        tmp_word_data = pd.merge(tmp_word_data, gender_df, on='分類')
        # pd.concat replaces DataFrame.append (removed in pandas >= 2.0).
        word_data = pd.concat([word_data, tmp_word_data])
    word_data = word_data.reset_index(drop=True)
    word_data = word_data.loc[:, ['客家語', '客語標音', '客語音檔', '華語詞義', 'gender']]
    # Drop the problem audio rows.
    word_data = word_data.drop(drop_id, axis=0).reset_index(drop=True)

    # Build the Mandarin gloss set: strip boilerplate phrasing, then split
    # on the punctuation that separates alternative glosses.
    replace_list = ['引申為', '...', '或形容', '…', '形容', '?', '例如:', '亦可說',
                    '「', '」', '例如', '猶如華語中的', '相當於華語的']
    ch_word_list = []
    for gloss in word_data.華語詞義:
        cleaned = gloss
        for phrase in replace_list:
            cleaned = cleaned.replace(phrase, '')
        tmp_ch_word = re.sub(r'[,、;]', '/', cleaned)
        if tmp_ch_word.find('。') >= 0:
            # Drop everything after the first full stop (usually an example).
            tmp_ch_word = re.sub(r'\。\S+', '', tmp_ch_word).replace('。', '')
        if tmp_ch_word.find('(') >= 0:
            # Remove parenthesized asides.
            tmp_ch_word = re.sub(r'\(\S+\)', '', tmp_ch_word)
        ch_word_list.append(tmp_ch_word.split('/'))
    word_data['華語詞義集'] = ch_word_list

    # Rows whose pronunciation field indicates several readings recorded in
    # a single audio file.
    multi_sound = word_data.loc[[s.find('(') >= 0 or s.find('【') >= 0
                                 or s.find('/') >= 0
                                 for s in word_data.客語標音], :]
    return word_data, multi_sound


def download_mp3(word_data, multi_sound):
    """Download every word's MP3 into ``mp3/``; de-duplicate multi-reading files.

    For rows listed in ``multi_sound``, the audio is split on silence; files
    that turn out to contain exactly one non-silent chunk are re-exported to
    ``mp3_uni/`` so later lookups get a single clean reading.

    Args:
        word_data: frame from :func:`import_data` with a ``客語音檔`` URL column.
        multi_sound: subset of ``word_data`` with multi-reading pronunciation.
    """
    print('Run download_mp3')
    for j in range(len(word_data)):
        if j % 500 == 0:
            print(j, '/', len(word_data))
        urllib.request.urlretrieve(word_data.loc[j, :]['客語音檔'],
                                   "mp3/{}.mp3".format(j))
    # Export single-chunk multi-reading files to mp3_uni.
    print('Generate unin mp3')
    for i in list(multi_sound.index):
        sound = AudioSegment.from_mp3("mp3/{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            min_silence_len=200,   # must be silent for at least 0.2 s
            silence_thresh=-50,    # quieter than -50 dBFS counts as silence
            keep_silence=100,      # keep 100 ms of padding around each chunk
        )
        if len(chunks) == 1:
            print(i)
            chunks[0].export("mp3_uni/{}.mp3".format(i), format="mp3")


def import_hakka_100():
    """Load ``csv_imtong/hakka100.csv`` and explode it to one row per phrase.

    Each sentence's ``chinese`` text is stripped of terminal punctuation and
    split on '!', ',' and '、' into a ``chinese_clean`` list, then exploded.

    Returns:
        pd.DataFrame: one row per cleaned Mandarin phrase.
    """
    tmp_word_100 = pd.read_csv('csv_imtong/hakka100.csv')
    chinese_clean_list = []
    for sentence in tmp_word_100.chinese:
        # sentence[:-1] drops the trailing punctuation mark.
        chinese_clean = (sentence[:-1]
                         .replace('。', '')
                         .replace('?', '')
                         .replace('!', '/')
                         .replace(',', '/')
                         .replace('、', '/'))
        if chinese_clean.find('(') >= 0:
            chinese_clean = re.sub(r'\(\S+\)', '', chinese_clean)
        chinese_clean_list.append(chinese_clean.split('/'))
    tmp_word_100['chinese_clean'] = chinese_clean_list
    hakka_100 = tmp_word_100.explode('chinese_clean').reset_index(drop=True)
    return hakka_100


def download_hakka_100(hakka_100):
    """Download the 100-sentence MP3s and split each into per-phrase chunks.

    Chunks are written to ``mp3_uni/hakka_100_<j>.mp3`` with a running index
    ``j``; files whose chunk count does not match the expected phrase count
    are reported and skipped.

    Args:
        hakka_100: frame with ``'url '`` (note the trailing space in the
            column name) and ``chinese_clean`` columns.
    """
    print('Run download_hakka_100')
    for m in range(len(hakka_100)):
        print(m, '/', len(hakka_100))
        # BUG FIX: filename previously used undefined variable `j`,
        # raising NameError on the first iteration; index by `m`.
        urllib.request.urlretrieve(hakka_100.loc[m, :]['url '],
                                   "mp3/hakka_100_{}.mp3".format(m))
    # Split each sentence on silence into per-phrase files.
    print('Generate unin hakka_100 mp3')
    j = 0
    for i in list(hakka_100.index):
        sound = AudioSegment.from_mp3("mp3/hakka_100_{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            min_silence_len=300,   # must be silent for at least 0.3 s
            silence_thresh=-45,    # quieter than -45 dBFS counts as silence
            keep_silence=400,      # keep 400 ms of padding around each chunk
        )
        # NOTE(review): after explode() this is len() of a string, i.e. the
        # character count — confirm the intended comparison against chunks.
        if len(hakka_100.loc[i, :]['chinese_clean']) == len(chunks):
            for chunk in chunks:
                chunk.export("mp3_uni/hakka_100_{}.mp3".format(j),
                             format="mp3")
                j += 1
        else:
            # Report files whose split does not match expectations.
            print('Error', i, chunks)


def import_jieba_userdict(ch_word_list, userDict_path='userDict.txt'):
    """Write ``ch_word_list`` to a user-dictionary file and load it into jieba.

    Args:
        ch_word_list: iterable of word strings, one dictionary entry each.
        userDict_path: path of the dictionary file to (over)write.

    Returns:
        The ``jieba`` module, with the user dictionary loaded.
    """
    # `with` guarantees the file is closed even if a write fails.
    with open(userDict_path, 'w') as f:
        for word in ch_word_list:
            f.write(word + '\n')
    jieba.load_userdict(userDict_path)
    return jieba


def gen_hakka_tts(word_data, multi_sound, hakka_100, ch_sentence,
                  mp3_path='test1.mp3', verbose=0):
    """Synthesize Hakka speech for a Mandarin sentence from recorded clips.

    The sentence is segmented with jieba; each token is looked up first in
    the hakka_100 phrase bank, then in the word bank. Matching clips are
    loaded, male-voice clips pitch-shifted up to roughly match the female
    voice, near-silent samples stripped, and the result written to
    ``mp3_path`` as a WAV file.

    Args:
        word_data: word bank from :func:`import_data`.
        multi_sound: multi-reading subset from :func:`import_data`.
        hakka_100: phrase bank from :func:`import_hakka_100`.
        ch_sentence: Mandarin sentence to synthesize.
        mp3_path: output path (written via ``librosa.output.write_wav``).
        verbose: 1 to print per-token match diagnostics.

    Raises:
        ValueError: if no token of the sentence matched any recording
            (previously this surfaced as a NameError on ``sr``).
    """
    Y = []
    sr = None
    print(jieba.lcut(ch_sentence))
    for word in jieba.lcut(ch_sentence):
        phrase_hit = hakka_100.loc[[word == c for c in hakka_100.chinese_clean], :].head(1)
        if phrase_hit.empty:
            word_hit = word_data.loc[[word in g for g in word_data.華語詞義集], :].head(1)
            if verbose == 1:
                print(word_hit)
            if word_hit.empty:
                if verbose == 1:
                    print('no mach', word)
            else:
                idx = word_hit.index[0]
                # Multi-reading words have a de-duplicated copy in mp3_uni/.
                if idx in list(multi_sound.index):
                    y, sr = librosa.load('mp3_uni/{}.mp3'.format(idx))
                else:
                    y, sr = librosa.load('mp3/{}.mp3'.format(idx))
                # Shift male recordings up 8 semitones toward the female voice.
                if word_hit.gender.values[0] == 'Male':
                    y = librosa.effects.pitch_shift(y, sr, n_steps=8)
                # Drop near-silent samples before concatenating.
                Y += list(y[abs(y) > 0.0005])
        else:
            y, sr = librosa.load('mp3_uni/hakka_100_{}.mp3'.format(phrase_hit.index[0]))
            y = librosa.effects.pitch_shift(y, sr, n_steps=4)
            Y += list(y[abs(y) > 0.0005])
    if sr is None:
        # Nothing matched: fail with a clear message instead of a NameError.
        raise ValueError('gen_hakka_tts: no audio matched sentence %r' % ch_sentence)
    librosa.output.write_wav(mp3_path, np.array(Y), sr)