import re
import difflib
import math
from itertools import groupby
from operator import itemgetter


def syllable_count(word):
    """Approximate the number of syllables in an English word."""
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    # A vowel preceded by a consonant starts a new vowel group / syllable.
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    # A trailing 'e' is usually silent.
    if word.endswith("e"):
        count -= 1
    # Every word has at least one syllable.
    if count == 0:
        count = 1
    return count


def split_sentence(in_str, maxLen):
    """Split a mixed Chinese/English sentence into chunks of at most maxLen
    display units and estimate each chunk's share of the speaking time."""
    # Classify every character position: CJK ideograph vs. everything else.
    zh_idx = []
    eng_idx = []
    for i in range(len(in_str)):
        if u'\u4e00' <= in_str[i] <= u'\u9fff':
            zh_idx.append(i)
        else:
            eng_idx.append(i)
    # Spaces only separate English words; drop them from the English indices.
    space_index = [m.start() for m in re.finditer(' ', in_str)]
    for idx in space_index:
        eng_idx.remove(idx)
    # Group consecutive English indices into one index range per word.
    eng_range_list = []
    for k, g in groupby(enumerate(eng_idx), lambda ix: ix[0] - ix[1]):
        eng_range = list(map(itemgetter(1), g))
        eng_range_list.append(eng_range)
    # Speaking-time budget: syllables plus 0.5 per English word boundary,
    # and one unit per Chinese character.
    total_syllable = 0
    for eng_range in eng_range_list:
        total_syllable += syllable_count(in_str[eng_range[0]:eng_range[-1] + 1]) + 0.5
    total_syllable += len(zh_idx)
    # Flatten the sentence into tokens: a single index per Chinese character
    # and a whole index range per English word, e.g. ch ch ch [en] [en].
    zh_eng_idx_list = []
    i = 0
    while i < len(in_str):
        if in_str[i] == ' ':
            i += 1
            continue
        if i in zh_idx:
            zh_eng_idx_list.append(i)
            i += 1
        elif i in eng_idx:
            for ls in eng_range_list:
                if i in ls:
                    zh_eng_idx_list.append(ls)
                    i = ls[-1] + 1
                    break
        else:
            i += 1
    # Greedily pack tokens into chunks of at most maxLen units, recording
    # each finished chunk's share of the total budget.
    zh_eng_dict_list = [{'content': '', 'time_ratio': 0}]
    idx = 0
    current_len = 0
    sen_idx = 0
    while idx < len(zh_eng_idx_list):
        if isinstance(zh_eng_idx_list[idx], list):
            # English word: slice its index range and keep a trailing space.
            str_from_idx = in_str[zh_eng_idx_list[idx][0]:zh_eng_idx_list[idx][-1] + 1] + ' '
            sylla_cnt = syllable_count(str_from_idx)
        else:
            # Single Chinese character.
            str_from_idx = in_str[zh_eng_idx_list[idx]]
            sylla_cnt = 1
        # The non-empty-content guard prevents an infinite loop when a single
        # token is longer than maxLen by itself.
        if zh_eng_dict_list[sen_idx]['content'] and \
                len(zh_eng_dict_list[sen_idx]['content']) + sylla_cnt >= maxLen:
            # Chunk full: close it out; the current token is retried next pass.
            zh_eng_dict_list[sen_idx]['time_ratio'] = current_len / total_syllable
            zh_eng_dict_list.append({'content': '', 'time_ratio': 0})
            sen_idx += 1
            current_len = 0
        else:
            current_len += sylla_cnt
            zh_eng_dict_list[sen_idx]['content'] += str_from_idx
            idx += 1
    # The last chunk absorbs the remainder so the ratios sum to exactly 1.
    total_ratio = 0
    for obj in zh_eng_dict_list:
        total_ratio += obj['time_ratio']
    zh_eng_dict_list[-1]['time_ratio'] = 1 - total_ratio
    return zh_eng_dict_list


def parse_script(file_path, gt_list):
    """Parse an SRT-style file, realign its text lines against the ground
    truth in gt_list, merge duplicates, and split long sentences."""
    with open(file_path, 'r', encoding="utf-8") as f:
        raw_lines = [line.strip() for line in f]
    lines = adjustSub_by_text_similarity(gt_list, raw_lines)
    # Build one dict per subtitle block.
    dict_list = []
    for idx in range(len(lines)):
        script = {}
        script['content'] = lines[idx]
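        # An SRT block is four raw lines (index, timing, text, blank), so
        # raw_lines[idx * 4 + 1] is the timing line
        # "HH:MM:SS,mmm --> HH:MM:SS,mmm".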
        time_raw = raw_lines[idx * 4 + 1].split(' --> ')
        start = time_raw[0].split(':')
        stop = time_raw[1].split(':')
        # Convert "HH:MM:SS,mmm" to seconds; the comma is a decimal comma.
        script['start'] = float(start[0]) * 3600 + float(start[1]) * 60 + float(start[2].replace(',', '.'))
        script['stop'] = float(stop[0]) * 3600 + float(stop[1]) * 60 + float(stop[2].replace(',', '.'))
        dict_list.append(script)
    # Merge blocks that repeat the same sentence: the first occurrence becomes
    # the lead sentence and absorbs the time span of its duplicates.
    script_not_dup_list = []
    for idx in range(len(dict_list)):
        dup_list = []
        for idx_inner in range(len(dict_list)):
            if dict_list[idx_inner]['content'] == dict_list[idx]['content']:
                dup_list.append(idx_inner)
        for dup_idx in dup_list:
            if dup_idx == min(dup_list):
                dict_list[dup_idx]['type'] = 'lead_sentence'
            else:
                dict_list[dup_idx]['type'] = 'duplicated'
        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
        if dict_list[idx]['type'] == 'lead_sentence':
            script_not_dup_list.append(dict_list[idx])
    # Subtitle overlap is not handled; overlapping timelines have not been
    # observed so far. Long sentences are cut to a maximum display length
    # without splitting English words (English, Chinese and spaces all count).
    splitted_dict = []
    for dic in script_not_dup_list:
        accumulated_duration = 0
        duration = dic['stop'] - dic['start']
        print(duration)  # debug: block duration in seconds
        # Distribute the block's duration across the chunks by time_ratio.
        for sub_dic in split_sentence(dic['content'], 13):
            new_dic = {}
            ind_duration = duration * sub_dic['time_ratio']
            new_dic['start'] = dic['start'] + accumulated_duration
            accumulated_duration += ind_duration
            new_dic['content'] = sub_dic['content']
            new_dic['duration'] = ind_duration
            splitted_dict.append(new_dic)
    # Debug: dump the final per-chunk dicts.
    for obj in splitted_dict:
        print(obj)
    return splitted_dict


def adjustSub_by_text_similarity(gts, gens_raw):
    """Replace each generated subtitle line with its closest match among the
    ground-truth sentences and their 2- and 3-sentence concatenations."""
    # Every 4th raw line (offset 2) is the text line of an SRT block; the +1
    # tolerates a missing trailing blank line in the last block.
    gens = []
    for idx in range(int((len(gens_raw) + 1) / 4)):
        gens.append(gens_raw[idx * 4 + 2])
    # Candidate pool: single sentences plus 2- and 3-sentence concatenations,
    # since one generated line may span several ground-truth fragments.
    combine2 = [''.join([i, j]) for i, j in zip(gts, gts[1:])]
    combine3 = [''.join([i, j, k]) for i, j, k in zip(gts, gts[1:], gts[2:])]
    alls = gts + combine2 + combine3
    adjusted = [None] * len(gens)
    duplicated_list = []
    for idx in range(len(gens)):
        match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
        if not match_text:
            # Nothing cleared the cutoff; keep the raw generated line.
            adjusted[idx] = gens[idx]
            continue
        if match_text[0] in duplicated_list:
            # Best match already used: keep it only if it continues the
            # previous line, otherwise fall back to the next unused candidate.
            for mt in match_text:
                if mt == adjusted[idx - 1] or mt not in duplicated_list:
                    adjusted[idx] = mt
                    break
        else:
            adjusted[idx] = match_text[0]
            duplicated_list.append(match_text[0])
    return adjusted


def trim_punctuation(s):
    """Collapse runs of non-CJK, non-alphanumeric characters to a single
    space, but keep punctuation that sits between digits (e.g. "3.14")."""
    pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+'
    pattern = u'([0-9]+{0}[0-9]+)|{0}'.format(pat_block)
    res = re.sub(pattern, lambda x: x.group(1) if x.group(1) else u" ", s)
    return res


def splitter(s):
    # Yield sentence fragments, splitting on Chinese/Western sentence
    # punctuation and keeping the trailing mark with its fragment.
    for sent in re.findall(u'[^!?,。\!\?]+[!?,。\!\?]?', s, flags=re.U):
        yield sent
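# Illustrative example (not from the original script; the full-width comma in
# the second character class is assumed, restored from the garbled source):
#   list(splitter(u'你好,世界。Hello!'))  ->  ['你好,', '世界。', 'Hello!']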
def split_by_pun(s):
    """Split s into a list of punctuation-delimited fragments."""
    return list(splitter(s))


def split_by_word(s):
    """Slice s into fixed-size 3-character chunks (unused fallback)."""
    slice_size = 3
    slice_num = int(math.ceil(len(s) / slice_size))
    slice_list = []
    for n in range(slice_num):
        slice_list.append(s[n * slice_size:n * slice_size + slice_size])
    return slice_list


# Ground-truth sample (Traditional Chinese): "Worse still, all that attention
# brought not user praise but a thorough exposure of Windows 10 on ARM's
# immaturity and a flood of bad reviews. Worst for the user experience,
# Windows 10 on ARM could only run legacy 32-bit x86 apps through emulation,
# and could do nothing with the faster and far more common 64-bit x86 (x64)
# apps."
raw_str = '更糟糕的是,與大量關注相伴的並非用戶讚賞,而是 Windows 10 on ARM 的不成熟暴露無遺,以及隨之而來的如潮差評──對用戶使用體驗影響最惡劣的,莫過於 Windows 10 on ARM 僅能透過模擬兼容老舊過時的 32 位元 x86 應用,而對效能與普及度俱佳的 64 位元 x86(即 x64)應用無能為力'

sub_dict = parse_script("out.txt", split_by_pun(raw_str))
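
# A minimal sketch (not part of the original script) of serializing the chunks
# returned by parse_script back into SRT blocks. format_srt_time is a
# hypothetical helper introduced here, not something defined above; it relies
# only on the 'start', 'duration' and 'content' keys that parse_script sets.
def format_srt_time(seconds):
    # SRT timestamps are "HH:MM:SS,mmm" with a comma before the milliseconds.
    total_ms = int(round(seconds * 1000))
    ms = total_ms % 1000
    s = total_ms // 1000
    return '%02d:%02d:%02d,%03d' % (s // 3600, (s % 3600) // 60, s % 60, ms)

for i, obj in enumerate(sub_dict, start=1):
    print(i)
    print(format_srt_time(obj['start']) + ' --> ' +
          format_srt_time(obj['start'] + obj['duration']))
    print(obj['content'] + '\n')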