# !pip install librosa==0.7.2
# !pip install numba==0.48.0
# !pip install pydub
# !sudo yum update
# !sudo yum install epel-release
# !sudo rpm --import http://li.nux.ro/download/nux/RPM-GPG-KEY-nux.ro
# !sudo rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-5.el7.nux.noarch.rpm
# !sudo yum install ffmpeg ffmpeg-devel -y
import pandas as pd
import numpy as np
import re
import jieba
import jieba.posseg as pseg
import urllib
import urllib.request
import librosa
from pydub import AudioSegment
from pydub.silence import split_on_silence
import itertools


def import_data():
    """Load the Hakka word CSVs, attach speaker gender, clean the Mandarin
    glosses into synonym lists, and flag entries whose audio file contains
    multiple pronunciations.

    Returns:
        (word_data, multi_sound): the full cleaned DataFrame, and the subset
        of rows whose phonetic field ('客語標音') shows alternative readings
        — those rows share one audio file holding several pronunciations.
    """
    drop_id = [14, 4806, 5024]  # rows with known-bad audio files (dropped below)
    word_data = pd.DataFrame()
    for csv_name in ['si3-1', 'si3-2', 'siw']:
        tmp_word_data = pd.read_csv('csv_imtong/{}.csv'.format(csv_name))
        # Categories ('分類') alternate Male/Female speakers in the source data.
        tmp_word_data_class = list(tmp_word_data.分類.unique())
        gender_list = []
        for class_idx in range(len(tmp_word_data_class)):
            gender_list.append('Male' if class_idx % 2 == 0 else 'Female')
        gender_df = pd.DataFrame({'分類': tmp_word_data_class, 'gender': gender_list})
        tmp_word_data = pd.merge(tmp_word_data, gender_df, on='分類')
        # pd.concat replaces the deprecated (removed in pandas 2.0) DataFrame.append.
        word_data = pd.concat([word_data, tmp_word_data])
    word_data = word_data.reset_index(drop=True)
    word_data = word_data.loc[:, ['客家語', '客語標音', '客語音檔', '華語詞義', 'gender']]
    # Drop the rows whose audio files are broken.
    word_data = word_data.drop(drop_id, axis=0).reset_index(drop=True)
    # Build the Mandarin-meaning sets: strip boilerplate phrases, then split
    # on Chinese punctuation into a list of synonyms per entry.
    repleace_list = ['引申為', '...', '或形容', '…', '形容', '?', '例如:', '亦可說',
                     '「', '」', '例如', '猶如華語中的', '相當於華語的']
    ch_word_list = []
    for meaning in word_data.華語詞義:
        cleaned = meaning
        for repleace_word in repleace_list:
            cleaned = cleaned.replace(repleace_word, '')
        tmp_ch_word = re.sub(r'[,、;]', '/', cleaned)
        if tmp_ch_word.find('。') >= 0:
            # Keep only the text before the first full stop.
            tmp_ch_word = re.sub(r'。\S+', '', tmp_ch_word).replace('。', '')
        if tmp_ch_word.find('(') >= 0:
            tmp_ch_word = re.sub(r'\(\S+\)', '', tmp_ch_word)
        ch_word_list.append(tmp_ch_word.split('/'))
    word_data['華語詞義集'] = ch_word_list
    # Flag entries whose pronunciation audio holds more than one reading.
    multi_sound = word_data.loc[[i.find('(') >= 0 or i.find('【') >= 0 or i.find('/') >= 0
                                 for i in word_data.客語標音], :]
    return word_data, multi_sound


# Download the Hakka word recordings.
def download_mp3(word_data, multi_sound):
    """Download every word's pronunciation mp3 into mp3/, then re-export a
    de-duplicated copy into mp3_uni/ for entries whose file holds several
    readings but splits into a single non-silent chunk."""
    print('Run download_mp3')
    for j in range(len(word_data)):
        if j % 500 == 0:
            print(j, '/', len(word_data))
        urllib.request.urlretrieve(word_data.loc[j, :]['客語音檔'],
                                   "mp3/{}.mp3".format(j))
    # Re-export single-chunk multi-reading files into mp3_uni.
    print('Generate unin mp3')
    for i in list(multi_sound.index):
        sound = AudioSegment.from_mp3("mp3/{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            min_silence_len=200,  # must be silent for at least 0.2 s
            silence_thresh=-50,   # quieter than -50 dBFS counts as silence
            keep_silence=100,
        )
        if len(chunks) == 1:
            print(i)
            chunks[0].export("mp3_uni/{}.mp3".format(i), format="mp3")


def import_hakka_100():
    """Load the Hakka-100 sentence CSV, clean each Mandarin sentence into a
    list of phrases, and explode to one row per phrase.

    Returns:
        DataFrame with a 'chinese_clean' column holding one phrase per row.
    """
    tmp_word_100 = pd.read_csv('csv_imtong/hakka100.csv')
    chinese_clean_list = []
    for sentence in tmp_word_100.chinese:
        # Drop the trailing character, remove sentence-final punctuation and
        # turn phrase separators into '/'.
        chinese_clean = (sentence[:-1].replace('。', '').replace('?', '')
                         .replace('!', '/').replace(',', '/').replace('、', '/'))
        if chinese_clean.find('(') >= 0:
            chinese_clean = re.sub(r'\(\S+\)', '', chinese_clean)
        chinese_clean_list += [chinese_clean.split('/')]
    tmp_word_100['chinese_clean'] = chinese_clean_list
    hakka_100 = tmp_word_100.explode('chinese_clean').reset_index(drop=True)
    return hakka_100


def download_hakka_100(hakka_100):
    """Download the Hakka-100 sentence recordings, then split each recording
    on silence and export one mp3 per phrase into mp3_uni/."""
    # Download the Hakka 100 sentences.
    print('Run download_hakka_100')
    for m in range(len(hakka_100)):
        print(m, '/', len(hakka_100))
        # BUG FIX: the original indexed the output file with `j`, which is
        # undefined at this point (it is initialised only below) — every call
        # raised NameError. The file index must follow the row index `m`.
        urllib.request.urlretrieve(hakka_100.loc[m, :]['url '],
                                   "mp3/hakka_100_{}.mp3".format(m))
    # Split each recording into phrase chunks and export them to mp3_uni.
    print('Generate unin hakka_100 mp3')
    j = 0
    for i in list(hakka_100.index):
        sound = AudioSegment.from_mp3("mp3/hakka_100_{}.mp3".format(i))
        chunks = split_on_silence(
            sound,
            min_silence_len=300,  # must be silent for at least 0.3 s
            silence_thresh=-45,   # quieter than -45 dBFS counts as silence
            keep_silence=400,
        )
        # NOTE(review): after explode() 'chinese_clean' is a string, so this
        # compares its character count with the chunk count — looks like this
        # function expects the pre-exploded frame; confirm against caller.
        if len(hakka_100.loc[i, :]['chinese_clean']) == len(chunks):
            for chunk in chunks:
                chunk.export("mp3_uni/hakka_100_{}.mp3".format(j), format="mp3")
                j += 1
        else:
            # Report recordings whose chunk count does not match.
            print('Error', i, chunks)


def import_jieba_userdict(ch_word_list, userDict_path='userDict.txt'):
    """Write the vocabulary to a jieba user-dictionary file and load it so
    segmentation prefers these words.

    Args:
        ch_word_list: iterable of words, one per dictionary line.
        userDict_path: path of the dictionary file to (over)write.

    Returns:
        The configured jieba module.
    """
    # `with` guarantees the file is closed even if a write fails.
    with open(userDict_path, 'w') as f:
        for word in ch_word_list:
            f.write(word + '\n')
    jieba.load_userdict(userDict_path)
    return jieba


def gen_hakka_tts(word_data, multi_sound, hakka_100, ch_sentence,
                  mp3_path='test1.mp3', verbose=0):
    """Segment `ch_sentence` with jieba, look each token up first in the
    Hakka-100 phrases then in the word table, concatenate the matching audio
    (pitch-shifted so voices roughly match) and write a wav to `mp3_path`.

    Raises:
        ValueError: if no token of the sentence matched any audio (the
            original code crashed with NameError on unbound `sr` here).
    """
    Y = []
    sr = None
    print(jieba.lcut(ch_sentence))
    for word in jieba.lcut(ch_sentence):
        # Exact phrase match against the Hakka-100 sentences first.
        tmp_mapping_100 = hakka_100.loc[[word == i for i in hakka_100.chinese_clean], :].head(1)
        if tmp_mapping_100.empty:
            # Fall back to the word table: token contained in a synonym set.
            tmp_mapping = word_data.loc[[word in i for i in word_data.華語詞義集], :].head(1)
            if verbose == 1:
                print(tmp_mapping)
            if tmp_mapping.empty:
                if verbose == 1:
                    print('no match', word)
            else:
                row_idx = tmp_mapping.index[0]
                # Multi-reading entries use the de-duplicated mp3_uni copy.
                if row_idx in list(multi_sound.index):
                    y, sr = librosa.load('mp3_uni/{}.mp3'.format(row_idx))
                else:
                    y, sr = librosa.load('mp3/{}.mp3'.format(row_idx))
                # Shift male voices up so all tokens sound alike.
                if tmp_mapping.gender.values[0] == 'Male':
                    y = librosa.effects.pitch_shift(y, sr, n_steps=8)
                Y += list(y[abs(y) > 0.0005])  # drop near-silent samples
        else:
            y, sr = librosa.load('mp3_uni/hakka_100_{}.mp3'.format(tmp_mapping_100.index[0]))
            y = librosa.effects.pitch_shift(y, sr, n_steps=4)
            Y += list(y[abs(y) > 0.0005])  # drop near-silent samples
    if sr is None:
        # Nothing matched: fail explicitly instead of NameError on `sr`.
        raise ValueError('no audio matched ch_sentence: {}'.format(ch_sentence))
    librosa.output.write_wav(mp3_path, np.array(Y), sr)