| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202 | 
							- import re
 
- import difflib
 
- import math
 
- from itertools import groupby
 
- from operator import itemgetter
 
def syllable_count(word):
    """Roughly estimate the number of syllables in an English word.

    Heuristic: count the starts of vowel groups (a, e, i, o, u, y),
    subtract one for a trailing (usually silent) 'e', and floor non-empty
    words at 1 syllable.

    Returns 0 for an empty string (bug fix: the original indexed word[0]
    unconditionally and raised IndexError on "").
    """
    word = word.lower()
    if not word:
        # Guard: word[0] below would raise IndexError on an empty string.
        return 0
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        # A vowel preceded by a non-vowel starts a new vowel group.
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        # Trailing 'e' is treated as silent.
        count -= 1
    if count == 0:
        count += 1
    return count
 
def split_sentence(in_str, maxLen):
    """Split a mixed Chinese/English string into subtitle-sized pieces.

    Returns a list of dicts ``{'content': str, 'time_ratio': float}``
    where ``time_ratio`` is the piece's estimated share of speaking time
    (the ratios sum to exactly 1).  Chinese characters count as one
    syllable each; each English word counts its syllables plus 0.5.

    maxLen: rough maximum size of a piece.  NOTE(review): the cutoff
    compares ``len(content) + syllable_count`` — it mixes character
    length with syllable count; confirm this is intended.
    """
    # NOTE(review): the result of this findall is discarded — dead code.
    re.findall(r'[\u4e00-\u9fff]+', in_str)
    # Classify every character position: CJK ideograph vs everything else.
    zh_idx = []
    eng_idx= []
    for i in range(len(in_str)):
        # Strictly inside the CJK Unified Ideographs range; the boundary
        # code points U+4E00/U+9FFF themselves fall into eng_idx.
        if in_str[i] > u'\u4e00' and in_str[i] < u'\u9fff':
            zh_idx.append(i)
        else:
            eng_idx.append(i)
    # Spaces only separate English words; drop them from the English positions.
    space_index = [m.start() for m in re.finditer(' ', in_str)]
    for idx in space_index:
        eng_idx.remove(idx)

    # Group consecutive English indices into runs (one run per word/token),
    # using the index-minus-value trick with groupby.
    eng_range_list = []
    for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):
        eng_range = list(map(itemgetter(1), g))
        eng_range_list.append(eng_range)
    # Total syllable budget: each English run weighs syllables + 0.5,
    # each Chinese character weighs 1.
    total_syllable = 0
    for i in range(len(eng_range_list)):
        total_syllable += (syllable_count(in_str[eng_range_list[i][0]:eng_range_list[i][-1]+1])+0.5)
    for i in range(len(zh_idx)):
        total_syllable+=1

    #final chchchchchc[en][en][en]
    #[en] is a vocabulary dict with  occurence of image
    # Build a flat token sequence: a bare int for a single Chinese char,
    # a list of indices for a whole English word.
    zh_eng_idx_list = []
    i = 0
    while i < len(in_str):
        if in_str[i]==' ':
            i+=1
        if i in zh_idx:
            zh_eng_idx_list.append(i)
            i+=1
        if i in eng_idx:
            for ls in eng_range_list:
                if i in ls:
                    zh_eng_idx_list.append(ls)
                    i = ls[-1]+1
                    break

    # Greedily pack tokens into pieces; when a piece would exceed maxLen,
    # close it (recording its share of the syllable budget) and retry the
    # same token on the next piece (idx is intentionally not advanced).
    zh_eng_dict_list = [{'content':'','time_ratio':0}]
    idx = 0
    current_len = 0
    sen_idx = 0
    while idx < len(zh_eng_idx_list):
        str_from_idx = ''
        sylla_cnt = 1
        if type(zh_eng_idx_list[idx])==type([]):
            # English word token: its text plus a trailing space.
            str_from_idx = in_str[zh_eng_idx_list[idx][0]:zh_eng_idx_list[idx][-1]+1]+' '
            sylla_cnt = syllable_count(str_from_idx)
        else:
            # Single Chinese character token (sylla_cnt stays 1).
            str_from_idx = in_str[zh_eng_idx_list[idx]]

        if len(zh_eng_dict_list[sen_idx]['content'])+sylla_cnt>=maxLen:
            zh_eng_dict_list[sen_idx]['time_ratio'] = current_len/total_syllable

            zh_eng_dict_list.append({'content':'','time_ratio':0})
            sen_idx+=1
            current_len = 0
        else:
            current_len += sylla_cnt
            zh_eng_dict_list[sen_idx]['content'] += str_from_idx
            idx+=1

    # The last piece absorbs the remaining ratio so the total is exactly 1.
    total_ratio = 0
    for obj in zh_eng_dict_list:
        total_ratio+=obj['time_ratio']
    zh_eng_dict_list[-1]['time_ratio'] = 1-total_ratio
    return zh_eng_dict_list
 
-    
 
def parse_script(file_path,gt_list):
    """Parse an SRT-style subtitle file and re-split its cues.

    file_path: path to a subtitle file laid out as 4 lines per cue
        (index, "start --> stop" time line, text, blank line).
    gt_list:   ground-truth sentence list; each subtitle text line is
        replaced by its closest fuzzy match (adjustSub_by_text_similarity).

    Returns a list of dicts with 'start', 'content' and 'duration',
    one per re-split piece.
    """
    with open(file_path, 'r',encoding="utf-8") as f:
        raw_lines = [line.strip() for line in f]
    # Correct each recognized text line against the ground truth.
    lines = adjustSub_by_text_similarity(gt_list,raw_lines)
    #make dict
    dict_list = []
    for idx in range(len(lines)):
        script={}
        script['content'] = lines[idx]
        # Time line sits one below the cue index: "HH:MM:SS,mmm --> HH:MM:SS,mmm".
        time_raw = raw_lines[idx * 4 +1 ].split(' --> ')
        start = time_raw[0].split(':')
        stop = time_raw[1].split(':')
        # Convert to seconds; SRT uses ',' as the decimal separator.
        script['start'] = float(start[0])*3600 + float(start[1])*60 + float(start[2].replace(',','.'))
        script['stop'] = float(stop[0])*3600 + float(stop[1])*60 + float(stop[2].replace(',','.'))
        dict_list.append(script)
    #merge duplicated sentences
    # Cues with identical text are collapsed: the earliest one becomes the
    # 'lead_sentence' and its stop time is extended to the last duplicate's.
    script_not_dup_list = []
    for idx in range(len(dict_list)):
        dup_list = []
        for idx_inner in range(len(dict_list)):
            if dict_list[idx_inner]['content']==dict_list[idx]['content']:
                dup_list.append(idx_inner)
        for dup_idx in dup_list:
            if dup_idx == min(dup_list):
                dict_list[dup_idx]['type'] = 'lead_sentence'
            else:
                dict_list[dup_idx]['type'] = 'duplicated'
        # Extend the lead cue's stop to cover the whole duplicate run.
        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
        if dict_list[idx]['type'] == 'lead_sentence':
            script_not_dup_list.append(dict_list[idx])

    #avoid subtitle overlapping ?   Timeline overlapping not found currently
    #cut by max length---->  eng seperated problem   {eng_idx}
    #ENG counts, zh counts, space counts
    # Re-split each merged cue into pieces of at most 13 units and spread
    # the cue's duration across the pieces by their time_ratio.
    new_idx = 0
    splitted_dict = []
    for dic in script_not_dup_list:
        dic_idx = 0
        accumulated_duration = 0
        duration = dic['stop']-dic['start']
        print(duration)  # NOTE(review): debug output left in place
        for sub_dic in split_sentence(dic['content'],13):
            new_dic = {}
            ind_duration = duration * sub_dic['time_ratio']
            new_dic['start'] = dic['start'] + accumulated_duration
            accumulated_duration += ind_duration

            new_dic['content'] = sub_dic['content']
            new_dic['duration'] = ind_duration
            splitted_dict.append(new_dic)
    for obj in splitted_dict:
        print(obj)  # NOTE(review): debug output left in place
    return splitted_dict
 
def adjustSub_by_text_similarity(gts,gens_raw):
    """Map each generated subtitle line to its closest ground-truth text.

    gts:      ground-truth sentence list.
    gens_raw: raw SRT lines (4 per cue); the text is every 4th line
              starting at index 2.

    Also considers concatenations of 2 and 3 consecutive ground-truth
    sentences as candidates.  Tries not to pick a candidate that was
    already used for an earlier line, unless it matches the immediately
    preceding adjusted line (a genuine repeat).

    Returns a list, aligned with the generated text lines.
    """
    gens = []
    for idx in range(int((len(gens_raw)+1)/4)):
        gens.append(gens_raw[idx*4+2])

    # Candidate pool: single sentences plus 2- and 3-sentence concatenations.
    combine2 = [''.join([i,j]) for i,j in zip(gts, gts[1:])]
    combine3 = [''.join([i,j,k]) for i,j,k in zip(gts, gts[1:], gts[2:])]
    alls = gts + combine2 + combine3
    adjusted = [None]*len(gens)
    duplicated_list = []
    for idx in range(len(gens)):
        match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
        if not match_text:
            # Bug fix: the original indexed match_text[0] unconditionally
            # and raised IndexError when nothing cleared the cutoff.
            # Fall back to the raw generated line.
            adjusted[idx] = gens[idx]
            continue
        if match_text[0] in duplicated_list:
            # Best match already taken: prefer the previous line's text
            # (a real repeat) or the first still-unused candidate.
            for mt in match_text:
                if mt == adjusted[idx-1] or mt not in duplicated_list:
                    adjusted[idx] = mt
                    break
        else:
            adjusted[idx] = match_text[0]
            duplicated_list.append(match_text[0])
    return adjusted
 
def trim_punctuation(s):
    """Collapse every run of non-CJK, non-alphanumeric characters into a
    single space — except runs sandwiched between digits (e.g. "1,234"),
    which are preserved verbatim.
    """
    pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+'
    pattern = u'([0-9]+{0}[0-9]+)|{0}'.format(pat_block)

    def _keep_or_space(match):
        # Group 1 captured a digit-separator-digit run: keep it as-is.
        grouped = match.group(1)
        return grouped if grouped else u" "

    return re.compile(pattern).sub(_keep_or_space, s)
 
def splitter(s):
    """Yield sentence-like chunks of *s*, cut at CJK/ASCII sentence
    punctuation (the trailing punctuation mark stays on its chunk)."""
    yield from re.findall(u'[^!?,。\!\?]+[!? 。\!\?]?', s, flags=re.U)
 
def split_by_pun(s):
    """Return the sentence chunks of *s* (see splitter) as a list."""
    return list(splitter(s))
 
def split_by_word(s, slice_size=3):
    """Chop *s* into consecutive chunks of at most *slice_size* characters.

    Generalized: the chunk size, previously hard-coded to 3, is now a
    keyword parameter defaulting to 3 (backward compatible).  The manual
    ceil-and-index loop is replaced by stepped-range slicing, which also
    handles the empty string naturally (returns []).
    """
    return [s[n:n + slice_size] for n in range(0, len(s), slice_size)]
 
# Demo input: a traditional-Chinese sentence with embedded English terms.
raw_str = '更糟糕的是,與大量關注相伴的並非用戶讚賞,而是 Windows 10 on ARM 的不成熟暴露無遺,以及隨之而來的如潮差評──對用戶使用體驗影響最惡劣的,莫過於 Windows 10 on ARM 僅能透過模擬兼容老舊過時的 32 位元 x86 應用,而對效能與普及度俱佳的 64 位元 x86(即 x64)應用無能為力'

if __name__ == "__main__":
    # Bug fix: guard the script entry so merely importing this module no
    # longer opens "out.txt" and prints debug output as a side effect.
    sub_dict = parse_script("out.txt",split_by_pun(raw_str))
 
 
  |