# choozmo / AI_Anchor

import re
from itertools import groupby
from operator import itemgetter
# Sample input mixing Chinese characters with space-separated English words;
# it is passed to weighting() by the call at the bottom of this file.
ipath= "中文中文在這Windows on ARM不好用Eng at last"

def syllable_count(word):
    """Estimate the number of syllables in an English word.

    The heuristic counts every run of consecutive vowels (``a e i o u y``,
    case-insensitive) as one syllable, subtracts one when the word ends
    with ``e``, and never reports fewer than one syllable for a non-empty
    word.

    Args:
        word: The text to measure. Characters that are not one of the six
            vowels (digits, punctuation, spaces, ...) act as consonants.

    Returns:
        The estimated syllable count: 0 for an empty string, otherwise an
        integer >= 1.
    """
    # An empty string has no syllables; indexing word[0] would raise.
    if not word:
        return 0

    word = word.lower()
    vowels = "aeiouy"

    count = 0
    prev_is_vowel = False
    for ch in word:
        is_vowel = ch in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not prev_is_vowel:
            count += 1
        prev_is_vowel = is_vowel

    # A trailing "e" is treated as silent.
    if word.endswith("e"):
        count -= 1

    # Every non-empty word is given at least one syllable.
    return max(count, 1)

def weighting(in_str, maxLen):
    """Split mixed Chinese/English text into segments with time ratios.

    The input is tokenised into single Chinese characters (code points
    U+4E00..U+9FFF inclusive) and "English" runs (maximal runs of any other
    non-space characters). Tokens are packed greedily into segments: a
    token starts a new segment when ``len(segment content) + token weight``
    would reach ``maxLen``. A Chinese character weighs 1; an English run
    weighs its ``syllable_count``. English runs are written into the
    content followed by a single space.

    Args:
        in_str: Text to segment. Only the space character ``' '`` is
            treated as a separator; it is dropped from the output.
        maxLen: Threshold compared against
            ``len(content) + weight`` of the next token. A token that does
            not fit even into an empty segment is still placed there on its
            own, so the function always terminates.

    Returns:
        A list of dicts ``{'content': str, 'time_ratio': number}`` in input
        order. Every segment except the last gets
        ``segment weight / total weight``; the last one gets
        ``1 - sum(other ratios)``. For empty input the result is a single
        segment with empty content and ratio 1.
    """

    def char_kind(ch):
        # Classify one character as separator, Chinese, or "English".
        if ch == ' ':
            return 'space'
        # Inclusive bounds so that U+4E00 itself counts as Chinese.
        if '\u4e00' <= ch <= '\u9fff':
            return 'zh'
        return 'en'

    # Tokenise in one pass: (text, is_english) pairs in input order.
    tokens = []
    for kind, chars in groupby(in_str, key=char_kind):
        if kind == 'space':
            continue
        if kind == 'zh':
            # Each Chinese character is its own token.
            tokens.extend((ch, False) for ch in chars)
        else:
            tokens.append((''.join(chars), True))

    # Total weight: one per Chinese character, and the syllable count plus
    # an extra 0.5 per English run.
    total_syllable = 0
    for text, is_english in tokens:
        if is_english:
            total_syllable += syllable_count(text) + 0.5
        else:
            total_syllable += 1

    segments = [{'content': '', 'time_ratio': 0}]
    current_len = 0
    for text, is_english in tokens:
        if is_english:
            piece = text + ' '
            # NOTE(review): the per-segment weight is measured on the run
            # *with* its trailing space (so a final "e" is not discounted)
            # and without the +0.5 used in the total above. The two scales
            # therefore differ; kept as-is to preserve existing ratios.
            weight = syllable_count(piece)
        else:
            piece = text
            weight = 1

        current = segments[-1]
        # Close the current segment only if it already holds something;
        # otherwise an oversized token could never be placed and the loop
        # would spin forever appending empty segments.
        if current['content'] and len(current['content']) + weight >= maxLen:
            current['time_ratio'] = current_len / total_syllable
            current = {'content': '', 'time_ratio': 0}
            segments.append(current)
            current_len = 0

        current_len += weight
        current['content'] += piece

    # The last segment absorbs whatever share the earlier ones left over.
    segments[-1]['time_ratio'] = 1 - sum(seg['time_ratio'] for seg in segments)
    return segments
   
                    
            
            
    
# Run: segment the sample sentence with maxLen=13.
# The returned list is not stored or printed here.
weighting(ipath, 13)