@@ -27,6 +27,9 @@ from autosub import DEFAULT_SUBTITLE_FORMAT
 from pytranscriber.control.ctr_main import Ctr_Main
 from pytranscriber.control.ctr_autosub import Ctr_Autosub
 import multiprocessing
+import math
+from itertools import groupby
+from operator import itemgetter
 
 dir_sound = 'mp3_track/'
 dir_photo = 'photo/'
@@ -234,178 +237,91 @@ def call_anchor(fileName,avatar):
 
     fr.close()
     fw.close()
+def sentence_time_ratio(text,maxLen):
+    # Split text into roughly maxLen-sized chunks; return the nominal chunk
+    # length, each chunk's share of the total character count, and the chunks.
    
-def syllable_count(word):
-    word = word.lower()
-    count = 0
-    vowels = "aeiouy"
-    if word[0] in vowels:
-        count += 1
-    for index in range(1, len(word)):
-        if word[index] in vowels and word[index - 1] not in vowels:
-            count += 1
-
-    if word.endswith("e"):
-        count -= 1
-    if count == 0:
-        count += 1
-    return count
-
-def split_sentence(in_str, maxLen):
-    re.findall(r'[\u4e00-\u9fff]+', in_str)
-
-    zh_idx = []
-    eng_idx= []
-    for i in range(len(in_str)):
-        if in_str[i] > u'\u4e00' and in_str[i] < u'\u9fff':
-            zh_idx.append(i)
-        else:
-            eng_idx.append(i)
-
-    space_index = [m.start() for m in re.finditer(' ', in_str)]
-    for idx in space_index:
-        eng_idx.remove(idx)
-    
-    eng_range_list = []
-    for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):
-        eng_range = list(map(itemgetter(1), g))
-        eng_range_list.append(eng_range)
-
-    total_syllable = 0
-    for i in range(len(eng_range_list)):
-        total_syllable += (syllable_count(in_str[eng_range_list[i][0]:eng_range_list[i][-1]+1])+0.5)
-    for i in range(len(zh_idx)):
-        total_syllable+=1
+    total_len = len(text)
+    if total_len > maxLen:
+        left_word = total_len % maxLen
+        times = int(math.ceil(total_len/maxLen))
+        if left_word < 5:
+            times+=1
+        sen_len = int(total_len/times)
    
-    #final chchchchchc[en][en][en]
-    #[en] is a vocabulary dict with  occurence of image
-    zh_eng_idx_list = []
-    i = 0
-    while i < len(in_str):
-        if in_str[i]==' ':
-            i+=1
-        if i in zh_idx:
-            zh_eng_idx_list.append(i)
-            i+=1
-        if i in eng_idx:
-            for ls in eng_range_list:
-                if i in ls:
-                    zh_eng_idx_list.append(ls)
-                    i = ls[-1]+1
-                    break
+        time_ratio = [None]*times
+        sentences = [None]*times
+        print(times,',',total_len,",",sen_len)
+        for t in range(times):
            
-    zh_eng_dict_list = [{'content':'','time_ratio':0}]
-    idx = 0 
-    current_len = 0
-    sen_idx = 0
-    while idx < len(zh_eng_idx_list):
-        str_from_idx = ''
-        sylla_cnt = 1
-        if type(zh_eng_idx_list[idx])==type([]):
-            str_from_idx = in_str[zh_eng_idx_list[idx][0]:zh_eng_idx_list[idx][-1]+1]+' '
-            sylla_cnt = syllable_count(str_from_idx)
-        else:
-            str_from_idx = in_str[zh_eng_idx_list[idx]]
+            # the last chunk runs to the end of the text so trailing characters are not dropped
+            sentences[t] = text[t*sen_len:] if t == times-1 else text[t*sen_len:t*sen_len+sen_len]
+            time_ratio[t] = len(sentences[t])/total_len
+    else:
+
+        time_ratio = [1]
+        sen_len = total_len
+        sentences = [text]
    
-
-        if len(zh_eng_dict_list[sen_idx]['content'])+sylla_cnt>=maxLen:
-            zh_eng_dict_list[sen_idx]['time_ratio'] = current_len/total_syllable
-
-            zh_eng_dict_list.append({'content':'','time_ratio':0})
-            sen_idx+=1
-            current_len = 0
-        else:
-            current_len += sylla_cnt
-            zh_eng_dict_list[sen_idx]['content'] += str_from_idx
-            idx+=1
+    return sen_len, time_ratio, sentences
        
-    total_ratio = 0
-    for obj in zh_eng_dict_list:
-        total_ratio+=obj['time_ratio']
-    zh_eng_dict_list[-1]['time_ratio'] = 1-total_ratio
-    return zh_eng_dict_list
-
-
-
+
+
 def parse_script(file_path,gt_list):
-    with open(file_path, 'r',encoding="utf-8") as f:
+    with open(file_path, 'r') as f:
         raw_lines = [line.strip() for line in f]
     lines = adjustSub_by_text_similarity(gt_list,raw_lines)
-    #make dict
     dict_list = []
-    for idx in range(len(lines)):
+
+    for idx in range(int((len(lines)+1)/4)):
         script={}
-        script['content'] = lines[idx]
-        time_raw = raw_lines[idx * 4 +1 ].split(' --> ')
-        start = time_raw[0].split(':')
-        stop = time_raw[1].split(':')
-        script['start'] = float(start[0])*3600 + float(start[1])*60 + float(start[2].replace(',','.'))
-        script['stop'] = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2].replace(',','.'))
+        script['index'] = idx
+        time_raw = raw_lines[idx * 4 +1 ]
+        script['content'] = lines[idx*4+2]
+        start = time_raw.split(' --> ')[0].split(':')
+        stop = time_raw.split(' --> ')[1].split(':')
+        start[2] = start[2].replace(',','.')
+        stop[2] = stop[2].replace(',','.')
+        start_sec = float(start[0])*3600 + float(start[1])*60 + float(start[2])
+        stop_sec = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2])
+        duration = start_sec-stop_sec
+        script['start'] = start_sec
+        script['stop'] = stop_sec
+        script['duration'] = abs(duration)
         dict_list.append(script)
-    #merge duplicated sentences
-    script_not_dup_list = []
-    for idx in range(len(dict_list)):
-        dup_list = []
-        for idx_inner in range(len(dict_list)):
-            if dict_list[idx_inner]['content']==dict_list[idx]['content']:
-                dup_list.append(idx_inner)
-        for dup_idx in dup_list:
-            if dup_idx == min(dup_list):
-                dict_list[dup_idx]['type'] = 'lead_sentence'
-            else:
-                dict_list[dup_idx]['type'] = 'duplicated'
-        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
-        if dict_list[idx]['type'] == 'lead_sentence':
-            script_not_dup_list.append(dict_list[idx])
-
-    #avoid subtitle overlapping ?   Timeline overlapping not found currently
-    #cut by max length---->  eng seperated problem   {eng_idx}
-    #ENG counts, zh counts, space counts
-
+    '''
+    for dic in dict_list:
+        print(dic)
+    '''
     new_idx = 0
     splitted_dict = []
-    for dic in script_not_dup_list:
-        dic_idx = 0
-        accumulated_duration = 0
-        duration = dic['stop']-dic['start']
-        print(duration)
-        for sub_dic in split_sentence(dic['content'],13):
-            new_dic = {}
-            new_dic['index'] = new_idx
+    for dic in dict_list:
+        sen_len, time_ratio, sentences = sentence_time_ratio(dic['content'],13)
+        for s in range(len(sentences)):
+            new_dict = {}
+            new_dict['index'] = new_idx
+            start = dic['start']
+            for t in range(s):
+                start += (dic['duration']*time_ratio[t])
+            new_dict['start'] = start
+            new_dict['duration'] = dic['duration'] * time_ratio[s]
+            new_dict['content'] = sentences[s]
             new_idx+=1
-            ind_duration = duration * sub_dic['time_ratio']
-            new_dic['start'] = dic['start'] + accumulated_duration
-            accumulated_duration += ind_duration
-
-            new_dic['content'] = sub_dic['content']
-            new_dic['duration'] = ind_duration
-            splitted_dict.append(new_dic)
+            splitted_dict.append(new_dict)
    
-    return splitted_dict
-
-
-
-def adjustSub_by_text_similarity(gts,gens_raw):
-    gens = []
-    for idx in range(int((len(gens_raw)+1)/4)):
-        gens.append(gens_raw[idx*4+2])
+    return dict_list
    
-    combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
-    combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
-    alls = gts + combine2 + combine3
 
+def adjustSub_by_text_similarity(gts,gens):
     adjusted = [None]*len(gens)
-    duplicated_list = []
+    combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
+    combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
+    alls = gts+combine2+combine3
+
     for idx in range(len(gens)):
         match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
-        if match_text[0] in duplicated_list:
-            for mt in match_text:
-                if mt == adjusted[idx-1] or mt not in duplicated_list:
-                    adjusted[idx] = mt
-                    break
-        else:
+        if len(match_text) != 0 and idx:
+            #print(gens[idx]+' after correction: '+match_text[0])
             adjusted[idx] = match_text[0]
-            duplicated_list.append(match_text[0])
+
+
     return adjusted
 
 def trim_punctuation(s):
@@ -415,20 +331,12 @@ def trim_punctuation(s):
     return res
 
 def splitter(s):
-    for sent in re.findall(u'[^!?,。\!\?]+[!? 。\!\?]?', s, flags=re.U):
+    for sent in re.findall(u'[^!?,。\!\?]+[!?。\!\?]?', s, flags=re.U):
         yield sent
 
 def split_by_pun(s):
     res = list(splitter(s))
     return res
-def split_by_word(s):
-    slice_size = 3
-    paragraph_len = len(s)
-    slice_num = int(math.ceil(paragraph_len/slice_size))
-    slice_list = []
-    for n in range(slice_num):
-        slice_list.append(s[n*slice_size:n*slice_size+slice_size])
-    return slice_list
 
 def generate_subtitle_image_from_dict(name_hash, sub_dict):
     for script in sub_dict:
@@ -886,7 +794,10 @@ def anchor_video_eng(name_hash,name,text_content, image_urls,sub_titles,avatar):
 import pyttsx3
 def make_speech(text):
     engine = pyttsx3.init()
-    #voices = engine.getProperty('voices')
+    voices = engine.getProperty('voices')
+    for voice in voices:
+        if voice.name=='Mandarin':
+            # select the Mandarin voice by its id
+            engine.setProperty('voice', voice.id)
     engine.setProperty('voice', 'Mandarin')
     engine.save_to_file(text, '/app/speech.mp3')
     engine.runAndWait()
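
Note: a minimal sanity check of the new sentence_time_ratio, not part of the patch. It assumes the function is importable from this file ("anchor_video" below is a placeholder module name) and uses a made-up 30-character sample; maxLen=13 mirrors the call in parse_script.

    # placeholder import -- "anchor_video" is an assumed module name, point it
    # at wherever this patch actually lands
    from anchor_video import sentence_time_ratio

    text = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234"   # made-up 30-character subtitle line
    sen_len, time_ratio, sentences = sentence_time_ratio(text, 13)

    assert "".join(sentences) == text            # no characters are dropped
    assert abs(sum(time_ratio) - 1.0) < 1e-9     # ratios cover the whole duration
    print(sen_len, sentences, time_ratio)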