@@ -383,9 +383,7 @@ def parse_script(file_path,gt_list):
dict_list = []
for idx in range(len(lines)):
script={}
- print(lines[idx])
rep_ls = text_parser.replace_list(lines[idx])
- print(rep_ls)
line_content = lines[idx]
for reptxt in rep_ls:
line_content = line_content.replace(reptxt,'')
@@ -398,32 +396,42 @@ def parse_script(file_path,gt_list):
script['start'] = float(start[0])*3600 + float(start[1])*60 + float(start[2].replace(',','.'))
script['stop'] = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2].replace(',','.'))
dict_list.append(script)
+
#merge duplicated sentences
+ skip_list = []
script_not_dup_list = []
for idx in range(len(dict_list)):
- dup_list = []
- for idx_inner in range(len(dict_list)):
- if dict_list[idx_inner]['content']==dict_list[idx]['content']:
- dup_list.append(idx_inner)
- for dup_idx in dup_list:
- if dup_idx == min(dup_list):
- dict_list[dup_idx]['type'] = 'lead_sentence'
- else:
- dict_list[dup_idx]['type'] = 'duplicated'
- dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
- if dict_list[idx]['type'] == 'lead_sentence':
- script_not_dup_list.append(dict_list[idx])
-
- #avoid subtitle overlapping ? Timeline overlapping not found currently
- #cut by max length----> eng seperated problem {eng_idx}
- #ENG counts, zh counts, space counts
-
+ if idx not in skip_list:
+ dup_list = []
+ found = 0
+ for idx_inner in range(len(dict_list)):
+ if dict_list[idx_inner]['content'] == dict_list[idx]['content'] and idx <= idx_inner:
+ dup_list.append(idx_inner)
+ skip_list.append(idx_inner)
+ found += 1
+ if found != 0 and dict_list[idx_inner]['content']!=dict_list[idx]['content'] and idx <= idx_inner:
+ found = 0
+ break
+
+ for dup_idx in dup_list:
+ if dup_idx == min(dup_list):
+ dict_list[dup_idx]['type'] = 'lead_sentence'
+ else:
+ dict_list[dup_idx]['type'] = 'duplicated'
+ dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
+
+ if dict_list[idx]['type'] == 'lead_sentence':
+ script_not_dup_list.append(dict_list[idx])
+
+
new_idx = 0
splitted_dict = []
for dic in script_not_dup_list:
dic_idx = 0
accumulated_duration = 0
duration = dic['stop']-dic['start']
+ if duration > 5:
+ print('fuck',dic)
for sub_dic in split_sentence(dic['content'],13):
new_dic = {}
new_dic['index'] = new_idx
@@ -434,14 +442,8 @@ def parse_script(file_path,gt_list):
new_dic['start'] = dic['start'] + accumulated_duration
accumulated_duration += ind_duration
new_dic['content'] = sub_dic['content']
- new_dic['duration'] = ind_duration*0.9
- if new_dic['duration'] > 3:
- print('-----------------------------')
- print('origin duration : ', duration)
- print(dic)
- print('-----------------------------')
+ new_dic['duration'] = ind_duration*0.7
splitted_dict.append(new_dic)
-
return splitted_dict
@@ -462,21 +464,23 @@ def adjustSub_by_text_similarity(gts_in,gens_raw):
combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
alls = gts + combine2 + combine3
-
adjusted = [None]*len(gens)
duplicated_list = []
for idx in range(len(gens)):
match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
+
if match_text[0] in duplicated_list:
for mt in match_text:
- if mt == adjusted[idx-1] or mt not in duplicated_list:
+ if mt == adjusted[idx-1]: #or mt not in duplicated_list:
adjusted[idx] = mt
break
else:
adjusted[idx] = match_text[0]
duplicated_list.append(match_text[0])
+
if None == adjusted[idx]:
adjusted[idx] = gens[idx]
+
combine2_tag = [''.join([i,j]) for i,j in zip(gts_in, gts_in[1:])]
combine3_tag = [''.join([i,j,k]) for i,j,k in zip(gts_in, gts_in[1:], gts_in[2:])]
@@ -485,7 +489,6 @@ def adjustSub_by_text_similarity(gts_in,gens_raw):
for idx in range(len(adjusted)):
match_text = difflib.get_close_matches(adjusted[idx], alls_tag, cutoff=0.1)
adjusted[idx] = match_text[0]
-
return adjusted
def trim_punctuation(s):