Hong
/
Hakka_tts


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
							# !pip install librosa==0.7.2
# !pip install numba==0.48.0
# !pip install pydub
import pandas as pd
import numpy as np
import re
import jieba
import jieba.posseg as pseg
import urllib
import urllib.request
import librosa
from pydub import AudioSegment
from pydub.silence import split_on_silence
import itertools
def import_data():
  """Load the Hakka word tables and build the Mandarin gloss lookup column.

  Reads ``csv_imtong/si3-1.csv``, ``csv_imtong/si3-2.csv`` and
  ``csv_imtong/siw.csv`` relative to the current working directory, attaches
  a ``gender`` column to every row, drops the rows listed in ``drop_id`` and
  adds a ``華語詞義集`` column holding the cleaned Mandarin glosses as a list
  of strings per row.

  Returns:
    tuple: ``(word_data, multi_sound)``. ``word_data`` is a DataFrame with the
    columns 客家語, 客語標音, 客語音檔, 華語詞義, gender and 華語詞義集, indexed
    0..n-1. ``multi_sound`` is the subset of ``word_data`` (same index labels)
    whose 客語標音 contains ``(``, ``【`` or ``/``.

  Raises:
    KeyError: if the concatenated table does not contain every label in
      ``drop_id`` (raised by ``DataFrame.drop``).
  """
  # Row labels (after concatenation) of recordings flagged as problematic.
  drop_id = [14, 4806, 5024]
  frames = []
  for csv_name in ['si3-1', 'si3-2', 'siw']:
    table = pd.read_csv('csv_imtong/{}.csv'.format(csv_name))
    # Gender is assigned by alternating Male/Female over the categories in
    # order of first appearance within each file.
    categories = list(table.分類.unique())
    genders = ['Male' if pos % 2 == 0 else 'Female' for pos in range(len(categories))]
    gender_df = pd.DataFrame({'分類': categories, 'gender': genders})
    frames.append(pd.merge(table, gender_df, on='分類'))
  # DataFrame.append was removed in pandas 2.0; pd.concat works on old and
  # new versions alike and also renumbers the index in one step.
  word_data = pd.concat(frames, ignore_index=True)
  word_data = word_data.loc[:, ['客家語', '客語標音', '客語音檔', '華語詞義', 'gender']]
  # Drop the problematic recordings, then renumber so labels stay contiguous.
  word_data = word_data.drop(drop_id, axis=0).reset_index(drop=True)

  # Filler phrases removed from each gloss before it is split. Order matters:
  # longer phrases come before the shorter phrases they contain.
  repleace_list = ['引申為','...','或形容','…','形容','？','例如：','亦可說','「','」','例如','猶如華語中的','相當於華語的']
  ch_word_list = []
  for meaning in word_data.華語詞義:
    for repleace_word in repleace_list:
      meaning = meaning.replace(repleace_word, '')
    # Normalise every list separator to '/'.
    meaning = re.sub(r'[，、；]', '/', meaning)
    if '。' in meaning:
      # Discard whatever follows a full stop, then any leftover full stop.
      meaning = re.sub(r'。\S+', '', meaning).replace('。', '')
    if '（' in meaning:
      # Strip full-width parenthesised remarks.
      meaning = re.sub(r'（\S+）', '', meaning)
    ch_word_list.append(meaning.split('/'))
  word_data['華語詞義集'] = ch_word_list

  # Rows whose romanisation lists more than one reading.
  has_variants = [
      any(mark in reading for mark in ('(', '【', '/'))
      for reading in word_data.客語標音
  ]
  multi_sound = word_data.loc[has_variants, :]
  return word_data, multi_sound

# Download the Hakka single-word recordings
def download_mp3(word_data,multi_sound):
  """Download every word recording and trim the multi-reading ones.

  Each URL in ``word_data['客語音檔']`` is saved as ``mp3/<row label>.mp3``.
  For every row label in ``multi_sound`` the downloaded file is split on
  silence and only the first chunk is written to ``mp3_uni/<row label>.mp3``.

  Args:
    word_data: DataFrame indexed 0..n-1 with a 客語音檔 column of URLs.
    multi_sound: DataFrame whose index labels select the rows of
      ``word_data`` that need trimming.

  Returns:
    None. Files are written under ``mp3/`` and ``mp3_uni/``.
  """
  import os  # local import: the module header does not import os

  # urlretrieve and export both fail if the target directory is missing.
  os.makedirs('mp3', exist_ok=True)
  os.makedirs('mp3_uni', exist_ok=True)

  print('Run download_mp3')
  total = len(word_data)
  for j in range(total):
    if j % 500 == 0:
      print(j, '/', total)
    urllib.request.urlretrieve(word_data.loc[j, '客語音檔'], "mp3/{}.mp3".format(j))

  # Keep only the first reading of recordings that contain several.
  print('Generate unin mp3')
  for i in multi_sound.index:
    sound = AudioSegment.from_mp3("mp3/{}.mp3".format(i))
    chunks = split_on_silence(
        sound,
        # a pause must last at least 200 ms to count as a split point
        min_silence_len=200,
        # anything quieter than -50 dBFS is treated as silence
        silence_thresh=-50,
        # keep 100 ms of silence around each chunk
        keep_silence=100,
    )
    if len(chunks) == 1:
      # Only one chunk was found, so nothing was actually trimmed; print the
      # row label so the file can be checked by hand.
      print(i)
    # split_on_silence returns an empty list when no audible chunk is found;
    # fall back to the unsplit recording so the output file always exists.
    first_reading = chunks[0] if chunks else sound
    first_reading.export("mp3_uni/{}.mp3".format(i), format="mp3")

def import_hakka_100():
  """Load the Hakka sentence table and split each sentence into phrases.

  Reads ``csv_imtong/hakka100.csv`` relative to the current working
  directory. For every value of the ``chinese`` column the final character is
  dropped (presumably sentence-final punctuation -- confirm against the
  data), ``。`` and ``？`` are removed, ``！``, ``，`` and ``、`` become phrase
  separators, and full-width parenthesised remarks are stripped.

  Returns:
    DataFrame: the input table with an added ``chinese_clean`` column,
    exploded so that each row carries one phrase, indexed 0..n-1.
  """
  tmp_word_100 = pd.read_csv('csv_imtong/hakka100.csv')
  # One pass instead of five chained replaces: delete 。 and ？, and turn the
  # remaining punctuation into the '/' separator.
  punctuation = str.maketrans({'。': None, '？': None, '！': '/', '，': '/', '、': '/'})
  chinese_clean_list = []
  for sentence in tmp_word_100.chinese:
    chinese_clean = sentence[:-1].translate(punctuation)
    if '（' in chinese_clean:
      # Raw string: '\（' is an invalid escape sequence in a normal literal.
      chinese_clean = re.sub(r'（\S+）', '', chinese_clean)
    chinese_clean_list.append(chinese_clean.split('/'))
  tmp_word_100['chinese_clean'] = chinese_clean_list
  hakka_100 = tmp_word_100.explode('chinese_clean').reset_index(drop=True)
  return hakka_100

def download_hakka_100(hakka_100):
  """Download the Hakka sentence recordings and split them into phrases.

  Each URL in ``hakka_100['url ']`` (the column name ends with a space) is
  saved as ``mp3/hakka_100_<row label>.mp3``. Every download is then split on
  silence; when the chunk count matches, the chunks are written to
  ``mp3_uni/hakka_100_<k>.mp3`` with ``k`` counting up across all exported
  chunks.

  Args:
    hakka_100: DataFrame indexed 0..n-1 with ``url `` and ``chinese_clean``
      columns.

  Returns:
    None. Files are written under ``mp3/`` and ``mp3_uni/``.
  """
  import os  # local import: the module header does not import os

  # urlretrieve and export both fail if the target directory is missing.
  os.makedirs('mp3', exist_ok=True)
  os.makedirs('mp3_uni', exist_ok=True)

  print('Run download_hakka_100')
  total = len(hakka_100)
  for m in range(total):
    print(m, '/', total)
    # The file name must use the loop index m; the original formatted it with
    # j, which is not assigned until after this loop (UnboundLocalError).
    urllib.request.urlretrieve(hakka_100.loc[m, 'url '], "mp3/hakka_100_{}.mp3".format(m))

  # Split each sentence recording into one file per phrase.
  print('Generate unin hakka_100 mp3')
  j = 0  # running number of exported chunks, used as the output file name
  for i in hakka_100.index:
    sound = AudioSegment.from_mp3("mp3/hakka_100_{}.mp3".format(i))
    chunks = split_on_silence(
        sound,
        # a pause must last at least 300 ms to count as a split point
        min_silence_len=300,
        # anything quieter than -45 dBFS is treated as silence
        silence_thresh=-45,
        # keep 400 ms of silence around each chunk
        keep_silence=400,
    )
    # NOTE(review): this compares len(chinese_clean) with the chunk count. If
    # the frame comes from import_hakka_100 (exploded, one phrase string per
    # row) that is a character count, not a phrase count -- confirm whether
    # the un-exploded list column was intended here.
    if len(hakka_100.loc[i, 'chinese_clean']) == len(chunks):
      for chunk in chunks:
        chunk.export("mp3_uni/hakka_100_{}.mp3".format(j), format="mp3")
        j += 1
    else:
      # Report recordings whose chunk count did not match.
      print('Error', i, chunks)

def import_jieba_userdict(ch_word_list, userDict_path='userDict.txt'):
  """Write a word list to disk and load it as a jieba user dictionary.

  Args:
    ch_word_list: iterable of strings, one dictionary entry each.
    userDict_path: path of the dictionary file to (over)write.

  Returns:
    The ``jieba`` module, with the user dictionary loaded.
  """
  # jieba decodes user dictionaries as UTF-8, so write UTF-8 explicitly rather
  # than the platform default; the with-block closes the file even if a write
  # fails.
  with open(userDict_path, 'w', encoding='utf-8') as f:
    f.writelines(word + '\n' for word in ch_word_list)
  jieba.load_userdict(userDict_path)
  return jieba

def gen_hakka_tts(word_data,multi_sound,hakka_100,ch_sentence,mp3_path='test1.mp3',verbose=0):
  """Synthesise Hakka speech for a Mandarin sentence by splicing recordings.

  The sentence is tokenised with jieba. Each token is looked up first as an
  exact match in ``hakka_100.chinese_clean`` and otherwise as a member of a
  ``word_data.華語詞義集`` list; only the first matching row is used. The
  matching recording is loaded, pitch-shifted, stripped of samples with
  amplitude <= 0.0005 and appended to the output. Tokens without a match are
  skipped.

  Args:
    word_data: word table with 華語詞義集 and gender columns.
    multi_sound: rows of ``word_data`` whose audio lives in ``mp3_uni/``
      instead of ``mp3/``; only its index is used.
    hakka_100: phrase table with a ``chinese_clean`` column.
    ch_sentence: Mandarin sentence to convert.
    mp3_path: output path handed to ``librosa.output.write_wav``.
      NOTE(review): that writer emits WAV data, so the default ``.mp3``
      name looks misleading -- confirm.
    verbose: when 1, print each word lookup and every unmatched token.

  Returns:
    None. The audio is written to ``mp3_path``.

  Raises:
    ValueError: if no token of ``ch_sentence`` has a recording.
  """
  tokens = jieba.lcut(ch_sentence)  # tokenise once, reuse for print and loop
  print(tokens)
  segments = []
  sr = None
  for word in tokens:
    phrase_hit = hakka_100.loc[[word == phrase for phrase in hakka_100.chinese_clean], :].head(1)
    if not phrase_hit.empty:
      y, sr = librosa.load('mp3_uni/hakka_100_{}.mp3'.format(phrase_hit.index[0]))
      # sr is passed by keyword: it is keyword-only in librosa >= 0.10 and
      # accepted by keyword in older releases too.
      y = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
      segments.append(y[abs(y) > 0.0005])
      continue

    word_hit = word_data.loc[[word in glosses for glosses in word_data.華語詞義集], :].head(1)
    if verbose == 1:
      print(word_hit)
    if word_hit.empty:
      if verbose == 1:
        print('no mach', word)
      continue

    row_label = word_hit.index[0]
    # Multi-reading recordings were trimmed into mp3_uni/; the rest are used
    # exactly as downloaded.
    folder = 'mp3_uni' if row_label in multi_sound.index else 'mp3'
    y, sr = librosa.load('{}/{}.mp3'.format(folder, row_label))
    if word_hit.gender.values[0] == 'Male':
      # Male recordings are shifted up 8 semitones.
      y = librosa.effects.pitch_shift(y, sr=sr, n_steps=8)
    segments.append(y[abs(y) > 0.0005])

  if not segments:
    # Without this guard sr would be unset and the write below would fail
    # with an unrelated-looking error.
    raise ValueError('no Hakka recording found for any word in: {!r}'.format(ch_sentence))
  # NOTE: librosa.output was removed in librosa 0.8; this call relies on the
  # librosa==0.7.2 pin noted at the top of the file.
  librosa.output.write_wav(mp3_path, np.concatenate(segments), sr)