|
@@ -27,6 +27,9 @@ from autosub import DEFAULT_SUBTITLE_FORMAT
|
|
|
from pytranscriber.control.ctr_main import Ctr_Main
|
|
|
from pytranscriber.control.ctr_autosub import Ctr_Autosub
|
|
|
import multiprocessing
|
|
|
+import math
|
|
|
+from itertools import groupby
|
|
|
+from operator import itemgetter
|
|
|
|
|
|
dir_sound = 'mp3_track/'
|
|
|
dir_photo = 'photo/'
|
|
@@ -234,178 +237,91 @@ def call_anchor(fileName,avatar):
|
|
|
|
|
|
fr.close()
|
|
|
fw.close()
|
|
|
+def sentence_time_ratio(text,maxLen):
|
|
|
|
|
|
-def syllable_count(word):
|
|
|
- word = word.lower()
|
|
|
- count = 0
|
|
|
- vowels = "aeiouy"
|
|
|
- if word[0] in vowels:
|
|
|
- count += 1
|
|
|
- for index in range(1, len(word)):
|
|
|
- if word[index] in vowels and word[index - 1] not in vowels:
|
|
|
- count += 1
|
|
|
-
|
|
|
- if word.endswith("e"):
|
|
|
- count -= 1
|
|
|
- if count == 0:
|
|
|
- count += 1
|
|
|
- return count
|
|
|
-
|
|
|
-def split_sentence(in_str, maxLen):
|
|
|
- re.findall(r'[\u4e00-\u9fff]+', in_str)
|
|
|
-
|
|
|
- zh_idx = []
|
|
|
- eng_idx= []
|
|
|
- for i in range(len(in_str)):
|
|
|
- if in_str[i] > u'\u4e00' and in_str[i] < u'\u9fff':
|
|
|
- zh_idx.append(i)
|
|
|
- else:
|
|
|
- eng_idx.append(i)
|
|
|
-
|
|
|
- space_index = [m.start() for m in re.finditer(' ', in_str)]
|
|
|
- for idx in space_index:
|
|
|
- eng_idx.remove(idx)
|
|
|
-
|
|
|
- eng_range_list = []
|
|
|
- for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):
|
|
|
- eng_range = list(map(itemgetter(1), g))
|
|
|
- eng_range_list.append(eng_range)
|
|
|
-
|
|
|
- total_syllable = 0
|
|
|
- for i in range(len(eng_range_list)):
|
|
|
- total_syllable += (syllable_count(in_str[eng_range_list[i][0]:eng_range_list[i][-1]+1])+0.5)
|
|
|
- for i in range(len(zh_idx)):
|
|
|
- total_syllable+=1
|
|
|
+ total_len = len(text)
|
|
|
+ if total_len > maxLen:
|
|
|
+ left_word = total_len % maxLen
|
|
|
+ times = int(math.ceil(total_len/maxLen))
|
|
|
+ if left_word < 5:
|
|
|
+ times+=1
|
|
|
+ sen_len = int(total_len/times)
|
|
|
|
|
|
- #final chchchchchc[en][en][en]
|
|
|
- #[en] is a vocabulary dict with occurence of image
|
|
|
- zh_eng_idx_list = []
|
|
|
- i = 0
|
|
|
- while i < len(in_str):
|
|
|
- if in_str[i]==' ':
|
|
|
- i+=1
|
|
|
- if i in zh_idx:
|
|
|
- zh_eng_idx_list.append(i)
|
|
|
- i+=1
|
|
|
- if i in eng_idx:
|
|
|
- for ls in eng_range_list:
|
|
|
- if i in ls:
|
|
|
- zh_eng_idx_list.append(ls)
|
|
|
- i = ls[-1]+1
|
|
|
- break
|
|
|
+ time_ratio = [None]*times
|
|
|
+ sentences = [None]*times
|
|
|
+ print(times,',',total_len,",",sen_len)
|
|
|
+ for t in range(times):
|
|
|
|
|
|
- zh_eng_dict_list = [{'content':'','time_ratio':0}]
|
|
|
- idx = 0
|
|
|
- current_len = 0
|
|
|
- sen_idx = 0
|
|
|
- while idx < len(zh_eng_idx_list):
|
|
|
- str_from_idx = ''
|
|
|
- sylla_cnt = 1
|
|
|
- if type(zh_eng_idx_list[idx])==type([]):
|
|
|
- str_from_idx = in_str[zh_eng_idx_list[idx][0]:zh_eng_idx_list[idx][-1]+1]+' '
|
|
|
- sylla_cnt = syllable_count(str_from_idx)
|
|
|
- else:
|
|
|
- str_from_idx = in_str[zh_eng_idx_list[idx]]
|
|
|
+            sentences[t] = text[t*sen_len:] if t == times-1 else text[t*sen_len:t*sen_len+sen_len]
|
|
|
+ time_ratio[t] = len(sentences[t])/total_len
|
|
|
+ else:
|
|
|
+
|
|
|
+ time_ratio = [1]
|
|
|
+ sen_len = total_len
|
|
|
+ sentences = [text]
|
|
|
|
|
|
-
|
|
|
- if len(zh_eng_dict_list[sen_idx]['content'])+sylla_cnt>=maxLen:
|
|
|
- zh_eng_dict_list[sen_idx]['time_ratio'] = current_len/total_syllable
|
|
|
-
|
|
|
- zh_eng_dict_list.append({'content':'','time_ratio':0})
|
|
|
- sen_idx+=1
|
|
|
- current_len = 0
|
|
|
- else:
|
|
|
- current_len += sylla_cnt
|
|
|
- zh_eng_dict_list[sen_idx]['content'] += str_from_idx
|
|
|
- idx+=1
|
|
|
+ return sen_len, time_ratio, sentences
|
|
|
|
|
|
- total_ratio = 0
|
|
|
- for obj in zh_eng_dict_list:
|
|
|
- total_ratio+=obj['time_ratio']
|
|
|
- zh_eng_dict_list[-1]['time_ratio'] = 1-total_ratio
|
|
|
- return zh_eng_dict_list
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
def parse_script(file_path,gt_list):
|
|
|
- with open(file_path, 'r',encoding="utf-8") as f:
|
|
|
+    with open(file_path, 'r', encoding="utf-8") as f:
|
|
|
raw_lines = [line.strip() for line in f]
|
|
|
lines = adjustSub_by_text_similarity(gt_list,raw_lines)
|
|
|
- #make dict
|
|
|
dict_list = []
|
|
|
- for idx in range(len(lines)):
|
|
|
+
|
|
|
+ for idx in range(int((len(lines)+1)/4)):
|
|
|
script={}
|
|
|
- script['content'] = lines[idx]
|
|
|
- time_raw = raw_lines[idx * 4 +1 ].split(' --> ')
|
|
|
- start = time_raw[0].split(':')
|
|
|
- stop = time_raw[1].split(':')
|
|
|
- script['start'] = float(start[0])*3600 + float(start[1])*60 + float(start[2].replace(',','.'))
|
|
|
- script['stop'] = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2].replace(',','.'))
|
|
|
+ script['index'] = idx
|
|
|
+ time_raw = raw_lines[idx * 4 +1 ]
|
|
|
+ script['content'] = lines[idx*4+2]
|
|
|
+ start = time_raw.split(' --> ')[0].split(':')
|
|
|
+ stop = time_raw.split(' --> ')[1].split(':')
|
|
|
+ start[2] = start[2].replace(',','.')
|
|
|
+ stop[2] = stop[2].replace(',','.')
|
|
|
+ start_sec = float(start[0])*3600 + float(start[1])*60 + float(start[2])
|
|
|
+ stop_sec = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2])
|
|
|
+ duration = start_sec-stop_sec
|
|
|
+ script['start'] = start_sec
|
|
|
+ script['stop'] = stop_sec
|
|
|
+ script['duration'] = abs(duration)
|
|
|
dict_list.append(script)
|
|
|
- #merge duplicated sentences
|
|
|
- script_not_dup_list = []
|
|
|
- for idx in range(len(dict_list)):
|
|
|
- dup_list = []
|
|
|
- for idx_inner in range(len(dict_list)):
|
|
|
- if dict_list[idx_inner]['content']==dict_list[idx]['content']:
|
|
|
- dup_list.append(idx_inner)
|
|
|
- for dup_idx in dup_list:
|
|
|
- if dup_idx == min(dup_list):
|
|
|
- dict_list[dup_idx]['type'] = 'lead_sentence'
|
|
|
- else:
|
|
|
- dict_list[dup_idx]['type'] = 'duplicated'
|
|
|
- dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
|
|
|
- if dict_list[idx]['type'] == 'lead_sentence':
|
|
|
- script_not_dup_list.append(dict_list[idx])
|
|
|
-
|
|
|
- #avoid subtitle overlapping ? Timeline overlapping not found currently
|
|
|
- #cut by max length----> eng seperated problem {eng_idx}
|
|
|
- #ENG counts, zh counts, space counts
|
|
|
-
|
|
|
+ '''
|
|
|
+ for dic in dict_list:
|
|
|
+ print(dic)
|
|
|
+ '''
|
|
|
new_idx = 0
|
|
|
splitted_dict = []
|
|
|
- for dic in script_not_dup_list:
|
|
|
- dic_idx = 0
|
|
|
- accumulated_duration = 0
|
|
|
- duration = dic['stop']-dic['start']
|
|
|
- print(duration)
|
|
|
- for sub_dic in split_sentence(dic['content'],13):
|
|
|
- new_dic = {}
|
|
|
- new_dic['index'] = new_idx
|
|
|
+ for dic in dict_list:
|
|
|
+ sen_len, time_ratio, sentences = sentence_time_ratio(dic['content'],13)
|
|
|
+ for s in range(len(sentences)):
|
|
|
+ new_dict = {}
|
|
|
+ new_dict['index'] = new_idx
|
|
|
+ start = dic['start']
|
|
|
+ for t in range(s):
|
|
|
+ start += (dic['duration']*time_ratio[t])
|
|
|
+ new_dict['start'] = start
|
|
|
+ new_dict['duration'] = dic['duration'] * time_ratio[s]
|
|
|
+ new_dict['content'] = sentences[s]
|
|
|
new_idx+=1
|
|
|
- ind_duration = duration * sub_dic['time_ratio']
|
|
|
- new_dic['start'] = dic['start'] + accumulated_duration
|
|
|
- accumulated_duration += ind_duration
|
|
|
-
|
|
|
- new_dic['content'] = sub_dic['content']
|
|
|
- new_dic['duration'] = ind_duration
|
|
|
- splitted_dict.append(new_dic)
|
|
|
+ splitted_dict.append(new_dict)
|
|
|
|
|
|
- return splitted_dict
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-def adjustSub_by_text_similarity(gts,gens_raw):
|
|
|
- gens = []
|
|
|
- for idx in range(int((len(gens_raw)+1)/4)):
|
|
|
- gens.append(gens_raw[idx*4+2])
|
|
|
+ return dict_list
|
|
|
|
|
|
- combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
|
|
|
- combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
|
|
|
- alls = gts + combine2 + combine3
|
|
|
|
|
|
+def adjustSub_by_text_similarity(gts,gens):
|
|
|
adjusted = [None]*len(gens)
|
|
|
- duplicated_list = []
|
|
|
+ combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
|
|
|
+ combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
|
|
|
+ alls = gts+combine2+combine3
|
|
|
+
|
|
|
for idx in range(len(gens)):
|
|
|
match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
|
|
|
- if match_text[0] in duplicated_list:
|
|
|
- for mt in match_text:
|
|
|
- if mt == adjusted[idx-1] or mt not in duplicated_list:
|
|
|
- adjusted[idx] = mt
|
|
|
- break
|
|
|
- else:
|
|
|
+ if len(match_text) != 0 and idx:
|
|
|
+ #print(gens[idx]+'校正後: '+match_text[0])
|
|
|
adjusted[idx] = match_text[0]
|
|
|
- duplicated_list.append(match_text[0])
|
|
|
+
|
|
|
+
|
|
|
return adjusted
|
|
|
|
|
|
def trim_punctuation(s):
|
|
@@ -415,20 +331,12 @@ def trim_punctuation(s):
|
|
|
return res
|
|
|
|
|
|
def splitter(s):
|
|
|
- for sent in re.findall(u'[^!?,。\!\?]+[!? 。\!\?]?', s, flags=re.U):
|
|
|
+ for sent in re.findall(u'[^!?,。\!\?]+[!?。\!\?]?', s, flags=re.U):
|
|
|
yield sent
|
|
|
|
|
|
def split_by_pun(s):
|
|
|
res = list(splitter(s))
|
|
|
return res
|
|
|
-def split_by_word(s):
|
|
|
- slice_size = 3
|
|
|
- paragraph_len = len(s)
|
|
|
- slice_num = int(math.ceil(paragraph_len/slice_size))
|
|
|
- slice_list = []
|
|
|
- for n in range(slice_num):
|
|
|
- slice_list.append(s[n*slice_size:n*slice_size+slice_size])
|
|
|
- return slice_list
|
|
|
|
|
|
def generate_subtitle_image_from_dict(name_hash, sub_dict):
|
|
|
for script in sub_dict:
|
|
@@ -886,7 +794,10 @@ def anchor_video_eng(name_hash,name,text_content, image_urls,sub_titles,avatar):
|
|
|
import pyttsx3
|
|
|
def make_speech(text):
|
|
|
engine = pyttsx3.init()
|
|
|
- #voices = engine.getProperty('voices')
|
|
|
+ voices = engine.getProperty('voices')
|
|
|
+ for voice in voices:
|
|
|
+ if voice.name=='Mandarin':
|
|
|
+            engine.setProperty('voice', voice.id)
|
|
|
engine.setProperty('voice', 'Mandarin')
|
|
|
engine.save_to_file(text, '/app/speech.mp3')
|
|
|
engine.runAndWait()
|