123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161 |
- # !pip install librosa==0.7.2
- # !pip install numba==0.48.0
- # !pip install pydub
- # !sudo yum update
- # !sudo yum install epel-release
- # !sudo rpm --import http://li.nux.ro/download/nux/RPM-GPG-KEY-nux.ro
- # !sudo rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-5.el7.nux.noarch.rpm
- # !sudo yum install ffmpeg ffmpeg-devel -y
- import pandas as pd
- import numpy as np
- import re
- import jieba
- import jieba.posseg as pseg
- import urllib
- import urllib.request
- import librosa
- from pydub import AudioSegment
- from pydub.silence import split_on_silence
- import itertools
def import_data():
    """Load the Hakka vocabulary CSVs and derive cleaned Mandarin gloss lists.

    Reads ``csv_imtong/si3-1.csv``, ``csv_imtong/si3-2.csv`` and
    ``csv_imtong/siw.csv`` (paths are relative to the working directory),
    tags each row with a speaker gender, removes the rows listed in
    ``drop_id`` and adds a ``華語詞義集`` column holding the cleaned Mandarin
    meanings of every entry as a list of strings.

    Returns:
        tuple: ``(word_data, multi_sound)``. ``word_data`` is the combined
        table with a fresh 0..n-1 index; ``multi_sound`` is the subset of
        its rows whose ``客語標音`` value contains ``(``, ``【`` or ``/``.

    Raises:
        KeyError: if any label in ``drop_id`` is missing from the combined
            table (raised by ``DataFrame.drop``).
    """
    # Row labels (after concatenation) of recordings known to be broken.
    drop_id = [14, 4806, 5024]

    frames = []
    for csv_name in ['si3-1', 'si3-2', 'siw']:
        frame = pd.read_csv('csv_imtong/{}.csv'.format(csv_name))
        # Categories alternate Male/Female in order of first appearance.
        # NOTE(review): presumably the recordings alternate speakers per
        # category -- confirm against the data source.
        categories = list(frame['分類'].unique())
        genders = ['Male' if pos % 2 == 0 else 'Female'
                   for pos in range(len(categories))]
        gender_df = pd.DataFrame({'分類': categories, 'gender': genders})
        frames.append(pd.merge(frame, gender_df, on='分類'))

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    word_data = pd.concat(frames).reset_index(drop=True)
    word_data = word_data.loc[:, ['客家語', '客語標音', '客語音檔', '華語詞義', 'gender']]
    # Drop the problematic recordings, then renumber the rows.
    word_data = word_data.drop(drop_id, axis=0).reset_index(drop=True)

    # Filler phrases stripped from each Mandarin meaning before splitting.
    replace_list = ['引申為', '...', '或形容', '…', '形容', '?', '例如:', '亦可說',
                    '「', '」', '例如', '猶如華語中的', '相當於華語的']
    ch_word_list = []
    for meaning in word_data['華語詞義']:
        cleaned = meaning
        for filler in replace_list:
            cleaned = cleaned.replace(filler, '')
        # Normalise every separator to '/' so one split yields the glosses.
        cleaned = re.sub(r'[,、;]', '/', cleaned)
        if '。' in cleaned:
            # Discard whatever follows the first full stop, then the stop itself.
            cleaned = re.sub(r'\。\S+', '', cleaned).replace('。', '')
        if '(' in cleaned:
            # Remove parenthesised remarks.
            cleaned = re.sub(r'\(\S+\)', '', cleaned)
        ch_word_list.append(cleaned.split('/'))
    word_data['華語詞義集'] = ch_word_list

    # Entries whose romanisation lists several readings.
    has_variants = [any(mark in reading for mark in ('(', '【', '/'))
                    for reading in word_data['客語標音']]
    multi_sound = word_data.loc[has_variants, :]
    return word_data, multi_sound
- # 下載客語單詞
def download_mp3(word_data, multi_sound):
    """Download every vocabulary clip and write first-reading copies.

    Each URL in ``word_data['客語音檔']`` is saved as ``mp3/<row label>.mp3``.
    For every row label in ``multi_sound`` the downloaded clip is split on
    silence and its first chunk is exported to ``mp3_uni/<row label>.mp3``.

    Args:
        word_data: table with a ``客語音檔`` column of URLs, labelled
            0..len-1 (rows are looked up by label).
        multi_sound: table whose index selects the clips to split.
    """
    print('Run download_mp3')
    total = len(word_data)
    for j in range(total):
        if j % 500 == 0:
            print(j, '/', total)
        urllib.request.urlretrieve(word_data.loc[j, '客語音檔'], "mp3/{}.mp3".format(j))

    # Keep only the first reading of multi-reading clips, written to mp3_uni.
    print('Generate unin mp3')
    for i in multi_sound.index:
        sound = AudioSegment.from_mp3("mp3/{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            # a pause must last at least 200 ms to count as a split point
            min_silence_len=200,
            # anything quieter than -50 dBFS is treated as silence
            silence_thresh=-50,
            # keep 100 ms of silence around each chunk
            keep_silence=100,
        )
        if len(chunks) == 1:
            # The clip was not split at all; report the row for inspection.
            print(i)
        if not chunks:
            # Nothing rose above the silence threshold: fall back to the
            # whole clip so the mp3_uni file still exists for later lookups.
            print('Error', i, chunks)
            sound.export("mp3_uni/{}.mp3".format(i), format="mp3")
            continue
        chunks[0].export("mp3_uni/{}.mp3".format(i), format="mp3")
def import_hakka_100():
    """Load ``csv_imtong/hakka100.csv`` and split each sentence into phrases.

    For every value of the ``chinese`` column the last character is dropped,
    ``。`` and ``?`` are removed, ``!``, ``,`` and ``、`` become ``/``,
    parenthesised remarks are stripped, and the result is split on ``/``.
    The table is then exploded so each phrase gets its own row.

    Returns:
        pandas.DataFrame: the original columns plus ``chinese_clean`` (one
        phrase per row), with a fresh 0..n-1 index.
    """
    tmp_word_100 = pd.read_csv('csv_imtong/hakka100.csv')
    chinese_clean_list = []
    for sentence in tmp_word_100['chinese']:
        # sentence[:-1] drops the final character before any replacement.
        # NOTE(review): presumably that is the closing punctuation mark -- confirm.
        cleaned = (sentence[:-1]
                   .replace('。', '')
                   .replace('?', '')
                   .replace('!', '/')
                   .replace(',', '/')
                   .replace('、', '/'))
        if '(' in cleaned:
            # Remove parenthesised remarks.
            cleaned = re.sub(r'\(\S+\)', '', cleaned)
        chinese_clean_list.append(cleaned.split('/'))
    tmp_word_100['chinese_clean'] = chinese_clean_list
    hakka_100 = tmp_word_100.explode('chinese_clean').reset_index(drop=True)
    return hakka_100
def download_hakka_100(hakka_100):
    """Download the sentence clips and export their silence-separated chunks.

    Each URL in the ``'url '`` column (the key has a trailing space) is saved
    as ``mp3/hakka_100_<row label>.mp3``. Every clip is then split on silence
    and, when the chunk count passes the check below, the chunks are written
    to ``mp3_uni/hakka_100_<k>.mp3`` using one counter ``k`` that keeps
    increasing across all rows.

    Args:
        hakka_100: table with ``'url '`` and ``chinese_clean`` columns,
            labelled 0..len-1 (rows are looked up by label).
    """
    # Download the sentence recordings.
    print('Run download_hakka_100')
    total = len(hakka_100)
    for m in range(total):
        print(m, '/', total)
        # The file name must use the row label ``m``: the splitting loop
        # below reads the clip back by that same label.
        urllib.request.urlretrieve(hakka_100.loc[m, 'url '], "mp3/hakka_100_{}.mp3".format(m))

    # Split every sentence clip into chunks under mp3_uni.
    print('Generate unin hakka_100 mp3')
    j = 0  # running number of exported chunk files
    for i in hakka_100.index:
        sound = AudioSegment.from_mp3("mp3/hakka_100_{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            # a pause must last at least 300 ms to count as a split point
            min_silence_len=300,
            # anything quieter than -45 dBFS is treated as silence
            silence_thresh=-45,
            # keep 400 ms of silence around each chunk
            keep_silence=400,
        )
        # NOTE(review): if ``chinese_clean`` holds a single string per row
        # (as an exploded table would), this compares a character count with
        # the chunk count -- confirm whether a list of phrases is expected.
        if len(hakka_100.loc[i, 'chinese_clean']) == len(chunks):
            for chunk in chunks:
                chunk.export("mp3_uni/hakka_100_{}.mp3".format(j), format="mp3")
                j += 1
        else:
            # Report the clip whose chunk count did not match.
            print('Error', i, chunks)
def import_jieba_userdict(ch_word_list, userDict_path='userDict.txt'):
    """Write the words to a user dictionary file and load it into jieba.

    Args:
        ch_word_list: strings to register, written one per line.
        userDict_path: file to (over)write as UTF-8 and hand to
            ``jieba.load_userdict``.

    Returns:
        The ``jieba`` module, after the dictionary has been loaded.
    """
    # ``with`` guarantees the file is flushed and closed before jieba reads
    # it, even if one of the writes raises.
    with open(userDict_path, 'w', encoding="utf-8") as dict_file:
        dict_file.writelines(word + '\n' for word in ch_word_list)
    jieba.load_userdict(userDict_path)
    return jieba
def gen_hakka_tts(word_data, multi_sound, hakka_100, ch_sentence, gender, mp3_path='test1.mp3', verbose=0):
    """Assemble Hakka audio for a Mandarin sentence from pre-downloaded clips.

    The sentence is tokenised with ``jieba.lcut``. Each token is first looked
    up as an exact match in ``hakka_100['chinese_clean']``; failing that, the
    first ``word_data`` row whose ``華語詞義集`` list contains the token is
    used. Tokens with no match are skipped. Samples with an absolute value of
    0.0005 or less are removed from every clip, and the remaining samples are
    concatenated and written with ``librosa.output.write_wav``.

    Args:
        word_data: vocabulary table with ``華語詞義集`` and ``gender`` columns;
            its index names the files under ``mp3/`` and ``mp3_uni/``.
        multi_sound: table whose index lists the clips to read from
            ``mp3_uni/`` instead of ``mp3/``.
        hakka_100: phrase table with a ``chinese_clean`` column; its index
            names the ``mp3_uni/hakka_100_<n>.mp3`` files.
        ch_sentence: Mandarin text to convert.
        gender: ``0`` applies a pitch shift of 8 steps; any other value
            applies a shift of 0 steps.
            NOTE(review): presumably 0 requests a higher (female) voice -- confirm.
        mp3_path: output path. The data is written by ``write_wav``
            regardless of the file extension.
        verbose: ``1`` prints the lookup result for each token.

    Raises:
        ValueError: if no token of the sentence matched any clip, so there is
            nothing to write.
    """
    pitch_step = 8 if gender == 0 else 0

    # Tokenise once and reuse the result for both the log and the loop.
    words = jieba.lcut(ch_sentence)
    print(words)

    pieces = []
    sr = None
    for word in words:
        phrase_hit = hakka_100.loc[[word == phrase for phrase in hakka_100['chinese_clean']], :].head(1)
        if not phrase_hit.empty:
            # Phrase clips are always pitch shifted, whatever their speaker.
            y, sr = librosa.load('mp3_uni/hakka_100_{}.mp3'.format(phrase_hit.index[0]))
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_step)
            pieces.append(y[abs(y) > 0.0005])
            continue

        word_hit = word_data.loc[[word in meanings for meanings in word_data['華語詞義集']], :].head(1)
        if verbose == 1:
            print(word_hit)
        if word_hit.empty:
            if verbose == 1:
                print('no mach', word)
            continue

        clip_id = word_hit.index[0]
        # Multi-reading entries use the trimmed copy under mp3_uni.
        folder = 'mp3_uni' if clip_id in multi_sound.index else 'mp3'
        y, sr = librosa.load('{}/{}.mp3'.format(folder, clip_id))
        # Only clips tagged as male speakers are pitch shifted.
        if word_hit['gender'].values[0] == 'Male':
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_step)
        pieces.append(y[abs(y) > 0.0005])

    if not pieces:
        # Without any loaded clip there is no sample rate and no audio; fail
        # with a clear message instead of an unbound-variable error.
        raise ValueError('no audio clip matched any word in: {!r}'.format(ch_sentence))

    # librosa.output was removed in librosa 0.8; this call relies on the
    # 0.7.x series.
    librosa.output.write_wav(mp3_path, np.concatenate(pieces), sr)
|