|
@@ -238,12 +238,13 @@ def call_anchor(fileName,avatar):
|
|
|
def parse_script(file_path):
|
|
|
with open(file_path, 'r') as f:
|
|
|
lines = [line.strip() for line in f]
|
|
|
+ lines = adjustSub_by_text_similarity(gt_list,lines)
|
|
|
dict_list = []
|
|
|
|
|
|
for idx in range(int((len(lines)+1)/4)):
|
|
|
script={}
|
|
|
script['index'] = idx
|
|
|
- time_raw = lines[idx * 4 + 1]
|
|
|
+ time_raw = lines[idx * 4]
|
|
|
script['content'] = lines[idx * 4 + 2]
|
|
|
start = time_raw.split(' --> ')[0].split(':')
|
|
|
stop = time_raw.split(' --> ')[1].split(':')
|
|
@@ -256,7 +257,49 @@ def parse_script(file_path):
|
|
|
script['stop'] = stop_sec
|
|
|
script['duration'] = abs(duration)
|
|
|
dict_list.append(script)
|
|
|
- return dict_list
|
|
|
+ new_idx = 0
|
|
|
+ splitted_dict = []
|
|
|
+ for dic in dict_list:
|
|
|
+ #螢幕寬度只能容納13個字
|
|
|
+ if dic['content']>13:
|
|
|
+ times = math.ceil(len(dic['content'])/13)
|
|
|
+ time_ratio = []
|
|
|
+ for t in range(times):
|
|
|
+ time_ratio[t] = len(dic['content'][t*13:t*13+13])/13 * dic['duration']
|
|
|
+ for t in range(times):
|
|
|
+ if time_ratio[t]>0.22:#about less than 3 words
|
|
|
+ new_dic = {}
|
|
|
+ new_dic['index'] = new_idx
|
|
|
+ new_dic['content'] = dic['content'][t*13:t*13+13]
|
|
|
+ start_plus = 0
|
|
|
+ for t2 in range(t):
|
|
|
+ start_plus += time_ratio[t2]
|
|
|
+ new_dic['start'] = dic['start'] + start_plus
|
|
|
+ new_dic['stop'] = new_dic['start']+time_ratio[t]
|
|
|
+ new_dic['duration'] = time_ratio[t]
|
|
|
+ splitted_dict.append(new_dic)
|
|
|
+ new_idx+=1
|
|
|
+ else:
|
|
|
+ splitted_dict[new_idx-1]['content']+=dic['content']
|
|
|
+ else:
|
|
|
+ dic['index'] = new_idx
|
|
|
+ new_idx+=1
|
|
|
+ splitted_dict.append(dic)
|
|
|
+ return splitted_dict
|
|
|
+
|
|
|
def adjustSub_by_text_similarity(gts, gens, cutoff=0.1):
    """Replace each generated line with its closest ground-truth text.

    Every entry of ``gens`` is compared (via ``difflib.get_close_matches``)
    against the ground-truth strings in ``gts`` and against the
    concatenation of every two and every three consecutive ground-truth
    strings. When a candidate scores at least ``cutoff``, the best one
    overwrites the generated line.

    Blank lines and lines containing ``-->`` (the timecode separator) are
    left untouched: they carry no subtitle text, and with a permissive
    cutoff a timecode could otherwise be overwritten by an unrelated
    ground-truth string.

    Args:
        gts: Sequence of ground-truth strings, in order.
        gens: List of generated strings. It is modified in place.
        cutoff: Minimum similarity ratio in [0, 1] for a candidate to be
            accepted. Defaults to 0.1, the value previously hard-coded.

    Returns:
        The same ``gens`` list object, after in-place correction.
    """
    # Imported locally so the function does not depend on a file-level import.
    import difflib

    gts = list(gts)
    # A generated line may cover two or three consecutive ground-truth
    # entries, so their concatenations are offered as candidates as well.
    pairs = [a + b for a, b in zip(gts, gts[1:])]
    triples = [a + b + c for a, b, c in zip(gts, gts[1:], gts[2:])]
    candidates = gts + pairs + triples

    for idx, line in enumerate(gens):
        # Structural lines (blank separators, timecodes) are not subtitle text.
        if not line.strip() or '-->' in line:
            continue
        # Only the best match is used, so ask for a single result.
        matches = difflib.get_close_matches(line, candidates, n=1, cutoff=cutoff)
        if matches:
            print('{ ' + line + ' }校正後: ' + matches[0])
            gens[idx] = matches[0]
        else:
            print('無校正:' + line)
    return gens
|
|
|
|
|
|
def trim_punctuation(s):
|
|
|
pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+';
|