import re
from itertools import groupby
from operator import itemgetter

ipath = "中文中文在這Windows on ARM不好用Eng at last"

# One character of the CJK range U+4E00..U+9FFF, both ends inclusive.
_CJK_CHAR = re.compile(r'[\u4e00-\u9fff]')

# Extra weight added to every English word on top of its syllable count.
_WORD_EXTRA_WEIGHT = 0.5


def syllable_count(word: str) -> int:
    """Estimate the number of syllables in an English word.

    A syllable is counted for every vowel (``aeiouy``) that starts a vowel
    group; a trailing ``e`` is treated as silent.  Any non-empty word counts
    as at least one syllable.

    Args:
        word: The word to analyse (case-insensitive).

    Returns:
        The estimated syllable count, or 0 for an empty string.
    """
    if not word:
        # The original indexed word[0] unconditionally and raised IndexError.
        return 0
    word = word.lower()
    vowels = "aeiouy"
    count = 1 if word[0] in vowels else 0
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count = 1
    return count


def _char_kind(ch):
    """Classify one character as 'space', 'zh' (CJK ideograph) or 'en'."""
    if ch == ' ':
        return 'space'
    if _CJK_CHAR.match(ch):
        return 'zh'
    return 'en'


def _tokenize(in_str):
    """Split text into ``(content, syllables, weight)`` tokens.

    Every CJK character is its own token with one syllable and weight 1.
    Every maximal run of other non-space characters is one word token whose
    content carries a trailing space, and whose weight is its syllable count
    plus ``_WORD_EXTRA_WEIGHT``.  Spaces only separate words.
    """
    tokens = []
    for kind, run in groupby(in_str, key=_char_kind):
        if kind == 'zh':
            tokens.extend((ch, 1, 1) for ch in run)
        elif kind == 'en':
            word = ''.join(run)
            # Count on the bare word: a trailing space would hide a silent "e".
            syllables = syllable_count(word)
            tokens.append(
                (word + ' ', syllables, syllables + _WORD_EXTRA_WEIGHT)
            )
    return tokens


def weighting(in_str: str, maxLen: int) -> list:
    """Split mixed Chinese/English text into segments with time ratios.

    Tokens (single CJK characters and space-separated words) are appended to
    the current segment until the fit test fails, at which point a new
    segment is started.  Each segment's ``time_ratio`` is its share of the
    total token weight, so the ratios of all segments sum to 1.

    Args:
        in_str: Text to split.
        maxLen: Length limit used by the fit test for one segment.

    Returns:
        A list of ``{'content': str, 'time_ratio': number}`` dicts in text
        order.  Text without tokens yields a single empty segment with
        ratio 1.
    """
    tokens = _tokenize(in_str)
    total_weight = sum(weight for _, _, weight in tokens)

    segments = [{'content': '', 'time_ratio': 0}]
    segment_weight = 0
    for content, syllables, weight in tokens:
        current = segments[-1]
        # NOTE: the fit test adds the token's syllable count (not its
        # character length) to the content length, as the original did.
        # An empty segment always accepts a token, so no empty segment is
        # emitted when maxLen is very small.
        if current['content'] and len(current['content']) + syllables >= maxLen:
            current['time_ratio'] = segment_weight / total_weight
            current = {'content': '', 'time_ratio': 0}
            segments.append(current)
            segment_weight = 0
        current['content'] += content
        # The token that opens a segment counts towards that segment.
        segment_weight += weight

    # The last segment takes the remainder so the ratios sum to exactly 1.
    closed_ratio = sum(map(itemgetter('time_ratio'), segments[:-1]))
    segments[-1]['time_ratio'] = 1 - closed_ratio
    return segments


if __name__ == "__main__":
    # Demo run; guarded so importing this module has no side effects.
    weighting(ipath, 13)