import re
import difflib
import math
from itertools import groupby
from operator import itemgetter


def syllable_count(word):
    # Naive English syllable estimate: count vowel groups, drop a trailing "e",
    # and return at least 1.
    word = word.lower()
    if not word:
        return 1  # guard against empty input
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count


def split_sentence(in_str, maxLen):
    # Classify every character as Chinese (CJK Unified Ideographs) or English/other.
    zh_idx = []
    eng_idx = []
    for i in range(len(in_str)):
        if u'\u4e00' <= in_str[i] <= u'\u9fff':  # inclusive bounds, so U+4E00 itself counts
            zh_idx.append(i)
        else:
            eng_idx.append(i)
    # Spaces are word separators, not English characters.
    space_index = [m.start() for m in re.finditer(' ', in_str)]
    for idx in space_index:
        eng_idx.remove(idx)
    # Group consecutive English indices into per-word index ranges.
    eng_range_list = []
    for k, g in groupby(enumerate(eng_idx), lambda ix: ix[0] - ix[1]):
        eng_range = list(map(itemgetter(1), g))
        eng_range_list.append(eng_range)
    # Estimate total spoken length: syllables (+0.5 per word boundary) for each
    # English word, plus one unit per Chinese character.
    total_syllable = 0
    for i in range(len(eng_range_list)):
        total_syllable += syllable_count(in_str[eng_range_list[i][0]:eng_range_list[i][-1] + 1]) + 0.5
    total_syllable += len(zh_idx)
    # Build the token sequence: individual Chinese characters interleaved with
    # grouped English words, e.g. chchch[en][en]; each [en] entry is the index
    # range of one English word.
    zh_eng_idx_list = []
    i = 0
    while i < len(in_str):
        if in_str[i] == ' ':
            i += 1
        if i in zh_idx:
            zh_eng_idx_list.append(i)
            i += 1
        if i in eng_idx:
            for ls in eng_range_list:
                if i in ls:
                    zh_eng_idx_list.append(ls)
                    i = ls[-1] + 1
                    break
    # Pack tokens into chunks of at most maxLen characters; each chunk records
    # its share of the sentence's total spoken duration.
    zh_eng_dict_list = [{'content': '', 'time_ratio': 0}]
    idx = 0
    current_len = 0
    sen_idx = 0
    while idx < len(zh_eng_idx_list):
        if isinstance(zh_eng_idx_list[idx], list):
            str_from_idx = in_str[zh_eng_idx_list[idx][0]:zh_eng_idx_list[idx][-1] + 1] + ' '
            sylla_cnt = syllable_count(str_from_idx)
        else:
            str_from_idx = in_str[zh_eng_idx_list[idx]]
            sylla_cnt = 1
        # Start a new chunk when this token would overfill the current one; the
        # non-empty check keeps a single oversized token from looping forever.
        if len(zh_eng_dict_list[sen_idx]['content']) + sylla_cnt >= maxLen and zh_eng_dict_list[sen_idx]['content']:
            zh_eng_dict_list[sen_idx]['time_ratio'] = current_len / total_syllable
            zh_eng_dict_list.append({'content': '', 'time_ratio': 0})
            sen_idx += 1
            current_len = 0
        else:
            current_len += sylla_cnt
            zh_eng_dict_list[sen_idx]['content'] += str_from_idx
            idx += 1
    # The last chunk takes whatever ratio is left so the ratios sum to 1.
    total_ratio = 0
    for obj in zh_eng_dict_list:
        total_ratio += obj['time_ratio']
    zh_eng_dict_list[-1]['time_ratio'] = 1 - total_ratio
    return zh_eng_dict_list


def parse_script(file_path, gt_list):
    with open(file_path, 'r', encoding="utf-8") as f:
        raw_lines = [line.strip() for line in f]
    lines = adjustSub_by_text_similarity(gt_list, raw_lines)
    # Build one dict per subtitle block: content plus start/stop in seconds.
    dict_list = []
    for idx in range(len(lines)):
        script = {}
        script['content'] = lines[idx]
        time_raw = raw_lines[idx * 4 + 1].split(' --> ')
        start = time_raw[0].split(':')
        stop = time_raw[1].split(':')
        script['start'] = float(start[0]) * 3600 + float(start[1]) * 60 + float(start[2].replace(',', '.'))
        script['stop'] = float(stop[0]) * 3600 + float(stop[1]) * 60 + float(stop[2].replace(',', '.'))
        dict_list.append(script)
    # Merge duplicated sentences: the first occurrence becomes the lead sentence
    # and absorbs the stop time of the last duplicate.
    script_not_dup_list = []
    for idx in range(len(dict_list)):
        dup_list = []
        for idx_inner in range(len(dict_list)):
            if dict_list[idx_inner]['content'] == dict_list[idx]['content']:
                dup_list.append(idx_inner)
        for dup_idx in dup_list:
            if dup_idx == min(dup_list):
                dict_list[dup_idx]['type'] = 'lead_sentence'
            else:
                dict_list[dup_idx]['type'] = 'duplicated'
        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
        if dict_list[idx]['type'] == 'lead_sentence':
            script_not_dup_list.append(dict_list[idx])
    # Avoid subtitle overlap? No timeline overlap has been observed so far.
    # Cut each sentence by max length --> the English word-separation problem is
    # handled via eng_idx (English, Chinese and space counts).
    splitted_dict = []
    for dic in script_not_dup_list:
        accumulated_duration = 0
        duration = dic['stop'] - dic['start']
        print(duration)
        for sub_dic in split_sentence(dic['content'], 13):
            new_dic = {}
            ind_duration = duration * sub_dic['time_ratio']
            new_dic['start'] = dic['start'] + accumulated_duration
            accumulated_duration += ind_duration
            new_dic['content'] = sub_dic['content']
            new_dic['duration'] = ind_duration
            splitted_dict.append(new_dic)
    for obj in splitted_dict:
        print(obj)
    return splitted_dict


def adjustSub_by_text_similarity(gts, gens_raw):
    # Every SRT block is four lines (index, timing, text, blank); keep the text line.
    gens = []
    for idx in range(int((len(gens_raw) + 1) / 4)):
        gens.append(gens_raw[idx * 4 + 2])
    # Also match against 2- and 3-sentence concatenations of the ground truth,
    # since a generated line may span several reference sentences.
    combine2 = [''.join([i, j]) for i, j in zip(gts, gts[1:])]
    combine3 = [''.join([i, j, k]) for i, j, k in zip(gts, gts[1:], gts[2:])]
    alls = gts + combine2 + combine3
    adjusted = [None] * len(gens)
    duplicated_list = []
    for idx in range(len(gens)):
        match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
        if not match_text:
            adjusted[idx] = gens[idx]  # no close match: keep the generated line as-is
            continue
        if match_text[0] in duplicated_list:
            # Prefer repeating the previous match, or take the first unused one.
            for mt in match_text:
                if mt == adjusted[idx - 1] or mt not in duplicated_list:
                    adjusted[idx] = mt
                    break
        else:
            adjusted[idx] = match_text[0]
            duplicated_list.append(match_text[0])
    return adjusted


def trim_punctuation(s):
    # Keep CJK, digits and ASCII letters; collapse everything else to a space,
    # unless the punctuation sits between two numbers (e.g. "1,000").
    pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+'
    pattern = u'([0-9]+{0}[0-9]+)|{0}'.format(pat_block)
    res = re.sub(pattern, lambda x: x.group(1) if x.group(1) else u" ", s)
    return res


def splitter(s):
    # Yield sentence-like chunks terminated by CJK/halfwidth sentence punctuation.
    for sent in re.findall(u'[^!?,。!?]+[!? 。!?]?', s, flags=re.U):
        yield sent


def split_by_pun(s):
    res = list(splitter(s))
    return res


def split_by_word(s):
    # Fixed-width slicing into 3-character chunks.
    slice_size = 3
    paragraph_len = len(s)
    slice_num = int(math.ceil(paragraph_len / slice_size))
    slice_list = []
    for n in range(slice_num):
        slice_list.append(s[n * slice_size:n * slice_size + slice_size])
    return slice_list


raw_str = '更糟糕的是,與大量關注相伴的並非用戶讚賞,而是 Windows 10 on ARM 的不成熟暴露無遺,以及隨之而來的如潮差評──對用戶使用體驗影響最惡劣的,莫過於 Windows 10 on ARM 僅能透過模擬兼容老舊過時的 32 位元 x86 應用,而對效能與普及度俱佳的 64 位元 x86(即 x64)應用無能為力'
sub_dict = parse_script("out.txt", split_by_pun(raw_str))
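
# --- Illustrative sketch, not part of the original pipeline ---
# A minimal demonstration of the two core helpers on a made-up mixed
# Chinese/English sentence; the input string below is hypothetical. Note that
# this only runs after the module-level parse_script call above succeeds,
# which requires a real "out.txt" SRT file on disk.
if __name__ == '__main__':
    # Vowel-group heuristic minus the trailing-"e" adjustment: prints 2.
    print(syllable_count('subtitle'))
    # split_sentence packs the sentence into chunks of at most 13 characters,
    # each tagged with its estimated share of the spoken duration (ratios sum to 1).
    for chunk in split_sentence(u'我愛 Windows 10 on ARM 的模擬器', 13):
        print(chunk)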