4 rokov pred · 238804785a
--- a/subGenerator/split.py
+++ b/subGenerator/split.py
@@ -0,0 +1,111 @@
 
				+import re
			
 
				+import difflib
			
 
				+import math
			
 
				+
			
 
				+def sentence_time_ratio(text,maxLen):
			
 
				+    
			
 
				+    total_len = len(text)
			
 
				+    if total_len > maxLen:
			
 
				+        left_word = total_len % maxLen
			
 
				+        times = int(math.ceil(total_len/maxLen))
			
 
				+        if left_word < 3:
			
 
				+            times+=1
			
 
				+        sen_len = int(total_len/times)
			
 
				+    
			
 
				+        time_ratio = [None]*times
			
 
				+        sentences = [None]*times
			
 
				+        for t in range(times):
			
 
				+            sentences[t] = text[t*sen_len:t*sen_len+sen_len]
			
 
				+            time_ratio[t] = len(sentences[t])/total_len
			
 
				+    else:
			
 
				+        time_ratio = [1]
			
 
				+        sen_len = total_len
			
 
				+        sentences = [text]
			
 
				+    return sen_len, time_ratio, sentences
			
 
				+
			
 
				+# case: 1. a short sentence gets matched to a very long sentence
			
 
				+        
			
 
				+        
			
 
				+    
			
 
				+def parse_script(file_path,gt_list):
			
 
				+    with open(file_path, 'r',encoding="utf-8") as f:
			
 
				+        raw_lines = [line.strip() for line in f]
			
 
				+    lines = adjustSub_by_text_similarity(gt_list,raw_lines)
			
 
				+    dict_list = []
			
 
				+
			
 
				+    for idx in range(int((len(lines)+1)/4)):
			
 
				+        script={}
			
 
				+        script['index'] = idx
			
 
				+        time_raw = raw_lines[idx * 4 +1 ]
			
 
				+        script['content'] = lines[idx*4+2]
			
 
				+        start = time_raw.split(' --> ')[0].split(':')
			
 
				+        stop = time_raw.split(' --> ')[1].split(':')
			
 
				+        start[2] = start[2].replace(',','.')
			
 
				+        stop[2] = stop[2].replace(',','.')
			
 
				+        start_sec = float(start[0])*3600 + float(start[1])*60 + float(start[2])
			
 
				+        stop_sec = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2])
			
 
				+        duration = start_sec-stop_sec
			
 
				+        script['start'] = start_sec
			
 
				+        script['stop'] = stop_sec
			
 
				+        script['duration'] = abs(duration)
			
 
				+        dict_list.append(script)
			
 
				+
			
 
				+    
			
 
				+    new_idx = 0
			
 
				+    splitted_dict = []
			
 
				+    for dic in dict_list:
			
 
				+        sen_len, time_ratio, sentences = sentence_time_ratio(dic['content'],13)
			
 
				+        for s in range(len(sentences)):
			
 
				+            new_dict = {}
			
 
				+            new_dict['index'] = new_idx
			
 
				+            start = dic['start']
			
 
				+            for t in range(s):
			
 
				+                start += (dic['duration']*time_ratio[t])
			
 
				+            new_dict['start'] = start
			
 
				+            new_dict['duration'] = dic['duration'] * time_ratio[s]
			
 
				+            new_dict['content'] = sentences[s]
			
 
				+            new_idx+=1
			
 
				+            splitted_dict.append(new_dict)
			
 
				+    
			
 
				+    return splitted_dict
			
 
				+def adjustSub_by_text_similarity(gts,gens):
			
 
				+    adjusted = [None]*len(gens)
			
 
				+    combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
			
 
				+    combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
			
 
				+    alls = gts + combine2 + combine3
			
 
				+
			
 
				+    for idx in range(len(gens)):
			
 
				+        match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
			
 
				+        if len(match_text) != 0 and idx:
			
 
				+            adjusted[idx] = match_text[0]
			
 
				+            #1.is duplicated 2.is near
			
 
				+
			
 
				+            if idx % 2 ==0:
			
 
				+                #print(gens[idx]+'||||校正後： '+match_text[0])
			
 
				+                print(match_text[0])
			
 
				+    return adjusted
			
 
				+
			
 
				+def trim_punctuation(s):
			
 
				+    pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+';
			
 
				+    pattern = u'([0-9]+{0}[0-9]+)|{0}'.format(pat_block)
			
 
				+    res = re.sub(pattern, lambda x: x.group(1) if x.group(1) else u" " ,s)
			
 
				+    return res
			
 
				+
			
 
				+def splitter(s):
			
 
				+    for sent in re.findall(u'[^!?，。\!\?]+[!? 。\!\?]?', s, flags=re.U):
			
 
				+        yield sent
			
 
				+
			
 
				+def split_by_pun(s):
			
 
				+    res = list(splitter(s))
			
 
				+    return res
			
 
				+def split_by_word(s):
			
 
				+    slice_size = 3
			
 
				+    paragraph_len = len(s)
			
 
				+    slice_num = int(math.ceil(paragraph_len/slice_size))
			
 
				+    slice_list = []
			
 
				+    for n in range(slice_num):
			
 
				+        slice_list.append(s[n*slice_size:n*slice_size+slice_size])
			
 
				+    return slice_list
			
 
				+
			
 
				+raw_str = '更糟糕的是，與大量關注相伴的並非用戶讚賞，而是 Windows 10 on ARM 的不成熟暴露無遺，以及隨之而來的如潮差評──對用戶使用體驗影響最惡劣的，莫過於 Windows 10 on ARM 僅能透過模擬兼容老舊過時的 32 位元 x86 應用，而對效能與普及度俱佳的 64 位元 x86（即 x64）應用無能為力'
			
 
				+sub_dict = parse_script("out.txt",split_by_pun(raw_str))