@@ -8,53 +8,63 @@ def sentence_time_ratio(text,maxLen):
     if total_len > maxLen:
         left_word = total_len % maxLen
         times = int(math.ceil(total_len/maxLen))
-        if left_word < 3:
+        if left_word < 5:
             times+=1
         sen_len = int(total_len/times)
         time_ratio = [None]*times
         sentences = [None]*times
+        print(times,',',total_len,",",sen_len)
         for t in range(times):
+
             sentences[t] = text[t*sen_len:t*sen_len+sen_len]
             time_ratio[t] = len(sentences[t])/total_len
     else:
+
         time_ratio = [1]
-        sen_len = total_len
         sentences = [text]
-    return sen_len, time_ratio, sentences
-
-# case 1: a short sentence matched to a very long sentence
-
-
+    return time_ratio, sentences
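This hunk changes both the remainder threshold (3 → 5) and the return contract (`sen_len` is dropped). A minimal self-contained sketch of the patched behaviour, assuming the unshown top of the function computes `total_len = len(text)`:

```python
import math

def sentence_time_ratio(text, maxLen):
    total_len = len(text)               # assumed from the unshown lines above the hunk
    if total_len <= maxLen:
        return [1], [text]
    times = int(math.ceil(total_len / maxLen))
    if total_len % maxLen < 5:          # small remainder: spread it over one extra chunk
        times += 1
    sen_len = int(total_len / times)
    sentences = [text[t * sen_len:(t + 1) * sen_len] for t in range(times)]
    time_ratio = [len(s) / total_len for s in sentences]
    return time_ratio, sentences

ratios, parts = sentence_time_ratio("this is a fairly long subtitle line", 13)
print(parts)   # ['this is a f', 'airly long ', 'subtitle li'] -- trailing "ne" is dropped
print(ratios)  # [0.314..., 0.314..., 0.314...] -- each chunk's share of the cue's duration
```

Note that the fixed-size slicing silently drops the last `total_len - times*sen_len` characters, which may be worth a second look.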

 def parse_script(file_path,gt_list):
     with open(file_path, 'r',encoding="utf-8") as f:
         raw_lines = [line.strip() for line in f]
     lines = adjustSub_by_text_similarity(gt_list,raw_lines)
+    # make dict
     dict_list = []
-
-    for idx in range(int((len(lines)+1)/4)):
+    for idx in range(len(lines)):
         script={}
-        script['index'] = idx
-        time_raw = raw_lines[idx * 4 +1 ]
-        script['content'] = lines[idx*4+2]
-        start = time_raw.split(' --> ')[0].split(':')
-        stop = time_raw.split(' --> ')[1].split(':')
-        start[2] = start[2].replace(',','.')
-        stop[2] = stop[2].replace(',','.')
-        start_sec = float(start[0])*3600 + float(start[1])*60 + float(start[2])
-        stop_sec = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2])
-        duration = start_sec-stop_sec
-        script['start'] = start_sec
-        script['stop'] = stop_sec
-        script['duration'] = abs(duration)
+        script['content'] = lines[idx]
+        time_raw = raw_lines[idx * 4 +1 ].split(' --> ')
+        start = time_raw[0].split(':')
+        stop = time_raw[1].split(':')
+        script['start'] = float(start[0])*3600 + float(start[1])*60 + float(start[2].replace(',','.'))
+        script['stop'] = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2].replace(',','.'))
         dict_list.append(script)
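The rewritten loop inlines the timestamp conversion. For clarity, the same arithmetic as a stand-alone helper (the name `srt_time_to_seconds` is illustrative, not part of the patch):

```python
def srt_time_to_seconds(stamp):
    """Convert an SRT timestamp such as '00:01:02,500' into seconds (62.5)."""
    h, m, s = stamp.split(':')
    return float(h) * 3600 + float(m) * 60 + float(s.replace(',', '.'))

start, stop = "00:00:01,000 --> 00:00:03,250".split(' --> ')
print(srt_time_to_seconds(start), srt_time_to_seconds(stop))  # 1.0 3.25
```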
+    # merge duplicated sentences
+    script_not_dup_list = []
+    for idx in range(len(dict_list)):
+        dup_list = []
+        for idx_inner in range(len(dict_list)):
+            if dict_list[idx_inner]['content']==dict_list[idx]['content']:
+                dup_list.append(idx_inner)
+        for dup_idx in dup_list:
+            if dup_idx == min(dup_list):
+                dict_list[dup_idx]['type'] = 'lead_sentence'
+            else:
+                dict_list[dup_idx]['type'] = 'duplicated'
+        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
+        if dict_list[idx]['type'] == 'lead_sentence':
+            script_not_dup_list.append(dict_list[idx])
+
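The merge pass above is quadratic and re-scans the whole list for every cue. If duplicated cues are always adjacent (which is how repeated auto-captions usually appear), a single linear pass yields the same `script_not_dup_list`; a sketch under that adjacency assumption, with the illustrative name `merge_consecutive`:

```python
def merge_consecutive(cues):
    """Collapse adjacent cues with identical content into one cue,
    keeping the first cue's start and the last duplicate's stop."""
    merged = []
    for cue in cues:
        if merged and merged[-1]['content'] == cue['content']:
            merged[-1]['stop'] = cue['stop']   # stretch the lead cue
        else:
            merged.append(dict(cue))           # a new run starts here
    return merged

cues = [{'content': 'hi', 'start': 0.0, 'stop': 1.0},
        {'content': 'hi', 'start': 1.0, 'stop': 2.0},
        {'content': 'ok', 'start': 2.0, 'stop': 3.0}]
print(merge_consecutive(cues))  # 'hi' becomes a single cue spanning 0.0-2.0
```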
+    # avoid subtitle overlapping? timeline overlap not observed so far
+    # cut by max length ----> English word-separation problem {eng_idx}
+    # ENG counts, zh counts, space counts
     new_idx = 0
     splitted_dict = []
     for dic in dict_list:
-        sen_len, time_ratio, sentences = sentence_time_ratio(dic['content'],13)
+        time_ratio, sentences = sentence_time_ratio(dic['content'],13)
         for s in range(len(sentences)):
             new_dict = {}
             new_dict['index'] = new_idx
@@ -66,23 +76,32 @@ def parse_script(file_path,gt_list):
             new_dict['content'] = sentences[s]
             new_idx+=1
             splitted_dict.append(new_dict)
-
+
     return splitted_dict
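The lines between the two hunks (not shown in the patch) presumably assign each chunk its slice of the cue's time span using `time_ratio`. One way that could look (an assumption, not the patch's actual lines; `split_cue` is an illustrative name):

```python
def split_cue(cue, time_ratio, sentences):
    """Spread cue's [start, stop] interval over its text chunks,
    proportionally to each chunk's share of the characters."""
    pieces, cursor = [], cue['start']
    duration = cue['stop'] - cue['start']
    for ratio, sentence in zip(time_ratio, sentences):
        pieces.append({'content': sentence,
                       'start': cursor,
                       'stop': cursor + duration * ratio})
        cursor = pieces[-1]['stop']
    return pieces

print(split_cue({'start': 0.0, 'stop': 3.0}, [0.5, 0.5], ['hello ', 'world!']))
# [{'content': 'hello ', 'start': 0.0, 'stop': 1.5},
#  {'content': 'world!', 'start': 1.5, 'stop': 3.0}]
```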
-def adjustSub_by_text_similarity(gts,gens):
-    adjusted = [None]*len(gens)
+
+
+
+def adjustSub_by_text_similarity(gts,gens_raw):
+    gens = []
+    for idx in range(int((len(gens_raw)+1)/4)):
+        gens.append(gens_raw[idx*4+2])
+
     combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
     combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
     alls = gts + combine2 + combine3
+    adjusted = [None]*len(gens)
+    duplicated_list = []
     for idx in range(len(gens)):
         match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
-        if len(match_text) != 0 and idx:
+        if match_text[0] in duplicated_list:
+            for mt in match_text:
+                if mt == adjusted[idx-1] or mt not in duplicated_list:
+                    adjusted[idx] = mt
+                    break
+        else:
             adjusted[idx] = match_text[0]
-            # 1. is duplicated 2. is near
-
-            if idx % 2 ==0:
-                #print(gens[idx]+'||||corrected: '+match_text[0])
-                print(match_text[0])
+        duplicated_list.append(match_text[0])
     return adjusted
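For context, a runnable sketch of the matching strategy: build a candidate pool of single ground-truth lines plus 2- and 3-line concatenations (mirroring `combine2`/`combine3`), then snap each generated line to its closest candidate. One defensive change over the patch: `difflib.get_close_matches` can return an empty list, which the new code would index blindly, so the sketch falls back to the raw line.

```python
import difflib

gts = ['the weather is nice today', 'let us go for a walk', 'shall we']
gens = ['the weathr is nice today', 'let us go for a walk shall we']

# candidate pool mirrors the patch: singles, pairs, triples
combine2 = [''.join(p) for p in zip(gts, gts[1:])]
combine3 = [''.join(p) for p in zip(gts, gts[1:], gts[2:])]
alls = gts + combine2 + combine3

for gen in gens:
    matches = difflib.get_close_matches(gen, alls, cutoff=0.1)
    print(matches[0] if matches else gen)  # fall back if nothing clears the cutoff
```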
 
 def trim_punctuation(s):