@@ -234,18 +234,17 @@ def call_anchor(fileName,avatar):
 
     fr.close()
     fw.close()
-
 def parse_script(file_path,gt_list):
     with open(file_path, 'r') as f:
-        lines = [line.strip() for line in f]
-        lines = adjustSub_by_text_similarity(gt_list,lines)
+        raw_lines = [line.strip() for line in f]
+        lines = adjustSub_by_text_similarity(gt_list,raw_lines)
     dict_list = []
-
+
     for idx in range(int((len(lines)+1)/4)):
         script={}
         script['index'] = idx
-        time_raw = lines[idx * 4 +1 ]
-        script['content'] = lines[idx * 4 + 2]
+        time_raw = raw_lines[idx * 4 +1 ]
+        script['content'] = lines[idx*4+2]
         start = time_raw.split(' --> ')[0].split(':')
         stop = time_raw.split(' --> ')[1].split(':')
         start[2] = start[2].replace(',','.')
@@ -259,11 +258,12 @@ def parse_script(file_path,gt_list):
         dict_list.append(script)
     new_idx = 0
     splitted_dict = []
+
     for dic in dict_list:
         #螢幕寬度只能容納13個字
         if len(dic['content'])>13:
             times = math.ceil(len(dic['content'])/13)
-            time_ratio = []
+            time_ratio = [None] *times
             left_words = len(dic['content'])%13
             for t in range(times):
                 if t != (times-1):
@@ -271,7 +271,8 @@ def parse_script(file_path,gt_list):
                 else:
                     time_ratio[t] = left_words/13 * dic['duration'] / times
             for t in range(times):
-                if time_ratio[t]>0.22:#about less than 3 words
+                if True:
+                #if time_ratio[t]>0.22:#about less than 3 words
                     new_dic = {}
                     new_dic['index'] = new_idx
                     new_dic['content'] = dic['content'][t*13:t*13+13]
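The hunks above rework the splitting loop in parse_script: any subtitle longer than 13 characters (the comment 螢幕寬度只能容納13個字 notes that the screen only fits 13 characters per line) is cut into screen-sized chunks that each carry a share of the original duration, and the time_ratio[t]>0.22 check that used to merge very short chunks into the previous subtitle is disabled via if True:. As a rough standalone sketch of that splitting idea (split_subtitle is a hypothetical name, and the duration is divided in proportion to chunk length rather than via the patch's time_ratio arithmetic):

import math

def split_subtitle(content, duration, max_chars=13):
    # Illustrative sketch only: cut an over-long subtitle into screen-sized
    # chunks and give each chunk a duration proportional to its length.
    chunk_count = math.ceil(len(content) / max_chars)
    chunks = [content[i * max_chars:(i + 1) * max_chars] for i in range(chunk_count)]
    return [{'content': c, 'duration': duration * len(c) / len(content)} for c in chunks]

# A 20-character line shown for 4 seconds becomes a 13-character chunk (2.6 s)
# followed by a 7-character chunk (1.4 s).
print(split_subtitle('字' * 20, 4.0))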
@@ -283,27 +284,27 @@ def parse_script(file_path,gt_list):
                     new_dic['duration'] = time_ratio[t]
                     splitted_dict.append(new_dic)
                     new_idx+=1
-                else:
-                    splitted_dict[new_idx-1]['content']+=dic['content']
+                #else:
+                # splitted_dict[new_idx-1]['content']+=dic['content']
         else:
             dic['index'] = new_idx
             new_idx+=1
             splitted_dict.append(dic)
     return splitted_dict
-
 def adjustSub_by_text_similarity(gts,gens):
+    adjusted = [None]*len(gens)
     combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
     combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
     alls = gts+combine2+combine3
 
     for idx in range(len(gens)):
         match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
-        if len(match_text) != 0:
-            print('{ '+gens[idx]+' }校正後: '+match_text[0])
-            gens[idx] = match_text[0]
-        else:
-            print('無校正:'+gens[idx])
-    return gens
+        if len(match_text) != 0 and idx:
+            #print(gens[idx]+'校正後: '+match_text[0])
+            adjusted[idx] = match_text[0]
+
+
+    return adjusted
 
 def trim_punctuation(s):
     pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+';
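The last hunk replaces the in-place correction of gens with a separate adjusted list: each generated line is matched against the ground-truth lines plus their two- and three-line concatenations via difflib.get_close_matches, the 校正後/無校正 ("corrected"/"no correction") prints are removed, and the first line is now skipped (and idx). A minimal self-contained sketch of the matching step (align_to_ground_truth is a hypothetical name; the concatenation and first-line handling are omitted):

import difflib

def align_to_ground_truth(gens, gts, cutoff=0.1):
    # Illustrative sketch only: map each generated subtitle line to its
    # closest ground-truth line; entries stay None below the cutoff.
    adjusted = [None] * len(gens)
    for idx, gen in enumerate(gens):
        matches = difflib.get_close_matches(gen, gts, n=1, cutoff=cutoff)
        if matches:
            adjusted[idx] = matches[0]
    return adjusted

# Expected: ['hello world', 'good morning']
print(align_to_ground_truth(['helo wrld', 'god mornin'],
                            ['hello world', 'good morning']))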