浏览代码

merge duplicated sentence & optimize sentence adjust

ming 3 年之前
父节点
当前提交
4ceea7f114
共有 1 个文件被更改,包括 51 次插入,32 次删除
  1. 51 32
      subGenerator/split.py

+ 51 - 32
subGenerator/split.py

@@ -8,53 +8,63 @@ def sentence_time_ratio(text,maxLen):
     if total_len > maxLen:
         left_word = total_len % maxLen
         times = int(math.ceil(total_len/maxLen))
-        if left_word < 3:
+        if left_word < 5:
             times+=1
         sen_len = int(total_len/times)
     
         time_ratio = [None]*times
         sentences = [None]*times
+        print(times,',',total_len,",",sen_len)
         for t in range(times):
+            
             sentences[t] = text[t*sen_len:t*sen_len+sen_len]
             time_ratio[t] = len(sentences[t])/total_len
     else:
+        
         time_ratio = [1]
-        sen_len = total_len
         sentences = [text]
-    return sen_len, time_ratio, sentences
-
-#case :  1.短句mactch到很長的句子
-        
-        
     
+    return time_ratio, sentences
 def parse_script(file_path,gt_list):
     with open(file_path, 'r',encoding="utf-8") as f:
         raw_lines = [line.strip() for line in f]
     lines = adjustSub_by_text_similarity(gt_list,raw_lines)
+    #make dict
     dict_list = []
-
-    for idx in range(int((len(lines)+1)/4)):
+    for idx in range(len(lines)):
         script={}
-        script['index'] = idx
-        time_raw = raw_lines[idx * 4 +1 ]
-        script['content'] = lines[idx*4+2]
-        start = time_raw.split(' --> ')[0].split(':')
-        stop = time_raw.split(' --> ')[1].split(':')
-        start[2] = start[2].replace(',','.')
-        stop[2] = stop[2].replace(',','.')
-        start_sec = float(start[0])*3600 + float(start[1])*60 + float(start[2])
-        stop_sec = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2])
-        duration = start_sec-stop_sec
-        script['start'] = start_sec
-        script['stop'] = stop_sec
-        script['duration'] = abs(duration)
+        script['content'] = lines[idx]
+        time_raw = raw_lines[idx * 4 +1 ].split(' --> ')
+        start = time_raw[0].split(':')
+        stop = time_raw[1].split(':')
+        script['start'] = float(start[0])*3600 + float(start[1])*60 + float(start[2].replace(',','.'))
+        script['stop'] = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2].replace(',','.'))
         dict_list.append(script)
+    #merge duplicated sentences
+    script_not_dup_list = []
+    for idx in range(len(dict_list)):
+        dup_list = []
+        for idx_inner in range(len(dict_list)):
+            if dict_list[idx_inner]['content']==dict_list[idx]['content']:
+                dup_list.append(idx_inner)
+        for dup_idx in dup_list:
+            if dup_idx == min(dup_list):
+                dict_list[dup_idx]['type'] = 'lead_sentence'
+            else:
+                dict_list[dup_idx]['type'] = 'duplicated'
+        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
+        if dict_list[idx]['type'] == 'lead_sentence':
+            script_not_dup_list.append(dict_list[idx])
+                
+    #avoid subtitle overlapping ?   Timeline overlapping not found currently
 
     
+    #cut by max length---->  eng seperated problem   {eng_idx}
+    #ENG counts, zh counts, space counts
     new_idx = 0
     splitted_dict = []
     for dic in dict_list:
-        sen_len, time_ratio, sentences = sentence_time_ratio(dic['content'],13)
+        time_ratio, sentences = sentence_time_ratio(dic['content'],13)
         for s in range(len(sentences)):
             new_dict = {}
             new_dict['index'] = new_idx
@@ -66,23 +76,32 @@ def parse_script(file_path,gt_list):
             new_dict['content'] = sentences[s]
             new_idx+=1
             splitted_dict.append(new_dict)
-    
+
     return splitted_dict
-def adjustSub_by_text_similarity(gts,gens):
-    adjusted = [None]*len(gens)
+
+
+
+def adjustSub_by_text_similarity(gts,gens_raw):
+    gens = []
+    for idx in range(int((len(gens_raw)+1)/4)):
+        gens.append(gens_raw[idx*4+2])
+    
     combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
     combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
     alls = gts + combine2 + combine3
 
+    adjusted = [None]*len(gens)
+    duplicated_list = []
     for idx in range(len(gens)):
         match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
-        if len(match_text) != 0 and idx:
+        if match_text[0] in duplicated_list:
+            for mt in match_text:
+                if mt == adjusted[idx-1] or mt not in duplicated_list:
+                    adjusted[idx] = mt
+                    break
+        else:
             adjusted[idx] = match_text[0]
-            #1.is duplicated 2.is near
-
-            if idx % 2 ==0:
-                #print(gens[idx]+'||||校正後: '+match_text[0])
-                print(match_text[0])
+            duplicated_list.append(match_text[0])
     return adjusted
 
 def trim_punctuation(s):