split.py

import re
import difflib
import math
from itertools import groupby
from operator import itemgetter

def syllable_count(word):
    # Rough heuristic: count vowel groups, drop a silent trailing 'e',
    # and never return fewer than one syllable.
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count
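
# Illustrative checks (not part of the original file; values follow the
# heuristic above, not a dictionary):
#   syllable_count('hello')  -> 2
#   syllable_count('the')    -> 1  (the trailing-'e' rule would give 0,
#                                   but the floor keeps every word >= 1)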

def split_sentence(in_str, maxLen):
    # Index every character: CJK ideographs are handled one character at
    # a time, while runs of Latin characters are grouped into words.
    zh_idx = []
    eng_idx = []
    for i in range(len(in_str)):
        if u'\u4e00' <= in_str[i] <= u'\u9fff':
            zh_idx.append(i)
        else:
            eng_idx.append(i)
    space_index = [m.start() for m in re.finditer(' ', in_str)]
    for idx in space_index:
        eng_idx.remove(idx)
    # Group consecutive English indices into per-word index ranges.
    eng_range_list = []
    for k, g in groupby(enumerate(eng_idx), lambda ix: ix[0] - ix[1]):
        eng_range = list(map(itemgetter(1), g))
        eng_range_list.append(eng_range)
    # Estimate the total spoken length: each English word contributes its
    # syllable count plus 0.5, each Chinese character contributes 1.
    total_syllable = 0
    for rng in eng_range_list:
        total_syllable += syllable_count(in_str[rng[0]:rng[-1] + 1]) + 0.5
    total_syllable += len(zh_idx)
    # Build a token list: single indices for Chinese characters, index
    # ranges for whole English words.
    zh_eng_idx_list = []
    i = 0
    while i < len(in_str):
        if in_str[i] == ' ':
            i += 1
        if i in zh_idx:
            zh_eng_idx_list.append(i)
            i += 1
        if i in eng_idx:
            for ls in eng_range_list:
                if i in ls:
                    zh_eng_idx_list.append(ls)
                    i = ls[-1] + 1
                    break
    # Greedily pack tokens into chunks of at most maxLen units and record
    # each chunk's share of the estimated spoken duration.
    zh_eng_dict_list = [{'content': '', 'time_ratio': 0}]
    idx = 0
    current_len = 0
    sen_idx = 0
    while idx < len(zh_eng_idx_list):
        str_from_idx = ''
        sylla_cnt = 1
        if isinstance(zh_eng_idx_list[idx], list):
            str_from_idx = in_str[zh_eng_idx_list[idx][0]:zh_eng_idx_list[idx][-1] + 1] + ' '
            sylla_cnt = syllable_count(str_from_idx)
        else:
            str_from_idx = in_str[zh_eng_idx_list[idx]]
        if zh_eng_dict_list[sen_idx]['content'] and \
                len(zh_eng_dict_list[sen_idx]['content']) + sylla_cnt >= maxLen:
            zh_eng_dict_list[sen_idx]['time_ratio'] = current_len / total_syllable
            zh_eng_dict_list.append({'content': '', 'time_ratio': 0})
            sen_idx += 1
            current_len = 0
            # Do not advance idx: the token that overflowed this chunk
            # becomes the first token of the next one.
        else:
            current_len += sylla_cnt
            zh_eng_dict_list[sen_idx]['content'] += str_from_idx
            idx += 1
    # The last chunk absorbs whatever ratio is left so the total is 1.
    total_ratio = 0
    for obj in zh_eng_dict_list:
        total_ratio += obj['time_ratio']
    zh_eng_dict_list[-1]['time_ratio'] = 1 - total_ratio
    return zh_eng_dict_list
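
# Usage sketch (illustrative input; 13 matches the display limit used in
# parse_script below). Each returned dict carries a chunk of text plus
# its estimated share of the spoken duration:
#   for part in split_sentence(u'微軟的 Surface 系列很受歡迎', 13):
#       print(part['content'], part['time_ratio'])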

def parse_script(file_path, gt_list):
    # Read an SRT-style transcript and snap each recognized line to the
    # closest ground-truth sentence.
    with open(file_path, 'r', encoding="utf-8") as f:
        raw_lines = [line.strip() for line in f]
    lines = adjustSub_by_text_similarity(gt_list, raw_lines)
    # Build one dict per cue with content and start/stop in seconds.
    dict_list = []
    for idx in range(len(lines)):
        script = {}
        script['content'] = lines[idx]
        time_raw = raw_lines[idx * 4 + 1].split(' --> ')
        start = time_raw[0].split(':')
        stop = time_raw[1].split(':')
        script['start'] = float(start[0]) * 3600 + float(start[1]) * 60 + float(start[2].replace(',', '.'))
        script['stop'] = float(stop[0]) * 3600 + float(stop[1]) * 60 + float(stop[2].replace(',', '.'))
        dict_list.append(script)
    # Merge cues that carry the same sentence: the first occurrence keeps
    # the text and is stretched to the last duplicate's stop time.
    script_not_dup_list = []
    for idx in range(len(dict_list)):
        dup_list = []
        for idx_inner in range(len(dict_list)):
            if dict_list[idx_inner]['content'] == dict_list[idx]['content']:
                dup_list.append(idx_inner)
        for dup_idx in dup_list:
            if dup_idx == min(dup_list):
                dict_list[dup_idx]['type'] = 'lead_sentence'
            else:
                dict_list[dup_idx]['type'] = 'duplicated'
        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
        if dict_list[idx]['type'] == 'lead_sentence':
            script_not_dup_list.append(dict_list[idx])
    # Overlapping timelines are not handled here; none observed so far.
    # Cut each merged cue by maximum display length, distributing the
    # cue's duration across the pieces by their time_ratio.
    splitted_dict = []
    for dic in script_not_dup_list:
        accumulated_duration = 0
        duration = dic['stop'] - dic['start']
        for sub_dic in split_sentence(dic['content'], 13):
            new_dic = {}
            ind_duration = duration * sub_dic['time_ratio']
            new_dic['start'] = dic['start'] + accumulated_duration
            accumulated_duration += ind_duration
            new_dic['content'] = sub_dic['content']
            new_dic['duration'] = ind_duration
            splitted_dict.append(new_dic)
    for obj in splitted_dict:
        print(obj)
    return splitted_dict
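
# parse_script assumes an SRT-style file with four lines per cue, e.g.:
#   1
#   00:00:01,000 --> 00:00:03,500
#   recognized text for this cue
#   (blank line)
# which is why the time line sits at raw_lines[idx * 4 + 1] and the text
# at raw_lines[idx * 4 + 2].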

def adjustSub_by_text_similarity(gts, gens_raw):
    # Every 4th line of the SRT block (index, time, text, blank) is the
    # recognized text.
    gens = []
    for idx in range((len(gens_raw) + 1) // 4):
        gens.append(gens_raw[idx * 4 + 2])
    # Candidate pool: single ground-truth sentences plus 2- and 3-sentence
    # concatenations, since the recognizer may merge neighbouring sentences.
    combine2 = [''.join([i, j]) for i, j in zip(gts, gts[1:])]
    combine3 = [''.join([i, j, k]) for i, j, k in zip(gts, gts[1:], gts[2:])]
    alls = gts + combine2 + combine3
    adjusted = [None] * len(gens)
    duplicated_list = []
    for idx in range(len(gens)):
        match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
        if not match_text:
            # No candidate is close enough; keep the recognized text as-is.
            adjusted[idx] = gens[idx]
            continue
        if match_text[0] in duplicated_list:
            # Best match already used: prefer repeating the previous line,
            # otherwise take the best not-yet-used candidate.
            for mt in match_text:
                if mt == adjusted[idx - 1] or mt not in duplicated_list:
                    adjusted[idx] = mt
                    break
        else:
            adjusted[idx] = match_text[0]
        duplicated_list.append(match_text[0])
    return adjusted
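
# The matching itself is plain difflib, ranked best-first; with the very
# low cutoff used above almost every candidate passes, so only the
# ranking matters. Illustrative call (values hypothetical):
#   difflib.get_close_matches('helo world', ['hello world', 'goodbye'], cutoff=0.1)
#   -> ['hello world', 'goodbye']   # match_text[0] is the best candidate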

def trim_punctuation(s):
    # Replace every run of non-CJK, non-alphanumeric characters with a
    # space, but keep punctuation sandwiched between digits (e.g. '3,5').
    pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+'
    pattern = u'([0-9]+{0}[0-9]+)|{0}'.format(pat_block)
    res = re.sub(pattern, lambda x: x.group(1) if x.group(1) else u" ", s)
    return res
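
# Illustrative behaviour (not from the original file):
#   trim_punctuation(u'Windows 10, on ARM!') -> u'Windows 10 on ARM '
#   trim_punctuation(u'3,5')                 -> u'3,5'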

def splitter(s):
    # Split on sentence-ending punctuation, keeping the trailing mark with
    # each piece. NOTE: the full-width marks !?, are an assumption here;
    # the original character class appears to have lost them to encoding
    # damage (it contained a bare space where a full-width mark would sit).
    for sent in re.findall(u'[^!?,。!?,]+[!?,。!?,]?', s, flags=re.U):
        yield sent


def split_by_pun(s):
    res = list(splitter(s))
    return res
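
# Illustrative split (with the full-width marks assumed above):
#   split_by_pun(u'你好!今天如何?') -> [u'你好!', u'今天如何?']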

def split_by_word(s):
    # Fixed-size fallback: chop the string into 3-character slices.
    slice_size = 3
    paragraph_len = len(s)
    slice_num = int(math.ceil(paragraph_len / slice_size))
    slice_list = []
    for n in range(slice_num):
        slice_list.append(s[n * slice_size:n * slice_size + slice_size])
    return slice_list
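
# Illustrative slicing:
#   split_by_word(u'一二三四五六七') -> [u'一二三', u'四五六', u'七']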

if __name__ == '__main__':
    raw_str = '更糟糕的是,與大量關注相伴的並非用戶讚賞,而是 Windows 10 on ARM 的不成熟暴露無遺,以及隨之而來的如潮差評──對用戶使用體驗影響最惡劣的,莫過於 Windows 10 on ARM 僅能透過模擬兼容老舊過時的 32 位元 x86 應用,而對效能與普及度俱佳的 64 位元 x86(即 x64)應用無能為力'
    sub_dict = parse_script("out.txt", split_by_pun(raw_str))