| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 | 
							
- import re
 
- from itertools import groupby
 
- from operator import itemgetter
 
# Sample mixed Chinese/English sentence used as the input of the demo call to
# weighting() at the bottom of this file.
ipath = "中文中文在這Windows on ARM不好用Eng at last"
 
def syllable_count(word):
    """Estimate the number of syllables in *word* with a vowel-group heuristic.

    Each maximal run of consecutive vowels (``a e i o u y``, case-insensitive)
    counts as one syllable. A word ending in ``e`` has one syllable
    subtracted, and any non-empty word is reported as having at least one
    syllable.

    Args:
        word: The text to analyse.

    Returns:
        The estimated syllable count; ``0`` for an empty string.
    """
    word = word.lower()
    # Guard: the heuristic below assumes at least one character.
    if not word:
        return 0

    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # A trailing "e" is treated as silent.
    if word.endswith("e"):
        count -= 1

    # Every non-empty word has at least one syllable.
    return max(count, 1)
 
# A token is either a single CJK ideograph (U+4E00..U+9FFF, both ends
# included) or a maximal run of characters that are neither CJK ideographs
# nor the space character (treated as one "English" word).
_TOKEN_RE = re.compile(r'(?P<zh>[\u4e00-\u9fff])|(?P<en>[^\u4e00-\u9fff ]+)')


def weighting(in_str, maxLen):
    """Split mixed Chinese/English text into segments with time ratios.

    The text is tokenised into single CJK ideographs and space-separated
    non-CJK words. Tokens are appended to the current segment until the
    character length of the segment's content plus the incoming token's
    syllable weight reaches ``maxLen``; the segment is then closed and a new
    one is started with that token.

    Args:
        in_str: The text to segment.
        maxLen: Threshold compared against ``len(content) + token weight``.

    Returns:
        A list of dicts ``{'content': str, 'time_ratio': number}`` in text
        order. Each non-CJK word is stored followed by a single space. Every
        closed segment's ratio is its accumulated weight divided by the total
        weight of the text; the last segment receives ``1`` minus the sum of
        the others. Empty or all-space input yields one empty segment with
        ratio ``1``.
    """
    # Tokenise once, recording (text to emit, per-segment weight) and the
    # overall weight used as the denominator of the ratios.
    tokens = []
    total_syllable = 0
    for match in _TOKEN_RE.finditer(in_str):
        piece = match.group()
        if match.lastgroup == 'zh':
            # One ideograph counts as one syllable.
            tokens.append((piece, 1))
            total_syllable += 1
        else:
            # NOTE(review): the total adds 0.5 per word and counts syllables
            # without the trailing space, whereas the per-segment weight
            # counts syllables of the word *with* its trailing space (so a
            # final silent "e" is not discounted) and adds no 0.5. The last
            # segment absorbs the difference through ``1 - sum`` and can even
            # go negative for long inputs. Kept as-is so existing ratios do
            # not change; confirm the intended weighting.
            total_syllable += syllable_count(piece) + 0.5
            spaced = piece + ' '
            tokens.append((spaced, syllable_count(spaced)))

    segments = [{'content': '', 'time_ratio': 0}]
    current_len = 0
    for text, weight in tokens:
        current = segments[-1]
        # Close the segment only when it already holds something: a token
        # whose weight alone reaches maxLen must still be placed somewhere,
        # otherwise empty segments would be appended forever.
        if current['content'] and len(current['content']) + weight >= maxLen:
            current['time_ratio'] = current_len / total_syllable
            current = {'content': '', 'time_ratio': 0}
            segments.append(current)
            current_len = 0
        current['content'] += text
        current_len += weight

    # The open (last) segment still has ratio 0 here, so this sums the closed
    # segments; the last one takes the remainder.
    total_ratio = sum(segment['time_ratio'] for segment in segments)
    segments[-1]['time_ratio'] = 1 - total_ratio
    return segments
 
-    
 
-                     
 
-             
 
-             
 
-     
 
# Demo: segment the sample sentence and show the result. Guarded so that
# importing this module does not trigger the computation.
if __name__ == "__main__":
    print(weighting(ipath, 13))
 
 
  |