123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
- import re
- from itertools import groupby
- from operator import itemgetter
- ipath= "中文中文在這Windows on ARM不好用Eng at last"
def syllable_count(word):
    """Estimate the number of syllables in an English word.

    Heuristic: every run of consecutive vowels (``aeiouy``) counts as one
    syllable, a trailing "e" is discounted by one, and a non-empty word is
    never reported as having fewer than one syllable.

    Args:
        word: Text to measure. Case is ignored.

    Returns:
        The estimated syllable count: ``0`` for an empty string, otherwise
        an integer of at least ``1``.
    """
    # Guard: there is no first character to inspect in an empty string.
    if not word:
        return 0

    word = word.lower()
    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a vowel run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # A trailing "e" is discounted from the total.
    if word.endswith("e"):
        count -= 1

    # Vowel-less input (or a lone "e") still counts as one syllable.
    return max(count, 1)
def _is_cjk(char):
    """Return True if *char* lies in the CJK Unified Ideographs block."""
    # Inclusive bounds: U+4E00 itself is the common character "一".
    return '\u4e00' <= char <= '\u9fff'


def _tokenize(in_str):
    """Split *in_str* into ``(text, is_cjk)`` tokens in reading order.

    Every CJK character is its own token. Every maximal run of characters
    that are neither CJK nor a space is one token. Spaces only separate
    tokens and are dropped.
    """
    tokens = []
    pos = 0
    end = len(in_str)
    while pos < end:
        char = in_str[pos]
        if char == ' ':
            pos += 1
        elif _is_cjk(char):
            tokens.append((char, True))
            pos += 1
        else:
            start = pos
            while (pos < end and in_str[pos] != ' '
                   and not _is_cjk(in_str[pos])):
                pos += 1
            tokens.append((in_str[start:pos], False))
    return tokens


def weighting(in_str, maxLen):
    """Cut a mixed Chinese/English string into segments with time ratios.

    The string is tokenized into single CJK characters (one syllable each)
    and space-delimited non-CJK words (syllables estimated by
    ``syllable_count``). Tokens are appended to the current segment until
    ``len(segment content) + token syllables`` would reach ``maxLen``; then
    the segment is closed and a new one is started with that token.

    Args:
        in_str: Text to segment.
        maxLen: Threshold compared against the character length of the
            current segment's content plus the syllable count of the next
            token.

    Returns:
        A list of dicts ``{'content': str, 'time_ratio': number}``. Words
        are stored with one trailing space. A closed segment's
        ``time_ratio`` is its syllable count divided by the total weight of
        the whole string; the last segment receives ``1`` minus the sum of
        the others. An empty input yields a single empty segment with
        ``time_ratio`` 1.
    """
    # Build (text, syllables) units and the total weight in one pass.
    units = []
    total_syllable = 0
    for text, is_cjk in _tokenize(in_str):
        if is_cjk:
            units.append((text, 1))
            total_syllable += 1
        else:
            # Count on the bare word: a trailing space would hide a final
            # "e" from syllable_count and inflate the segment's share.
            syllables = syllable_count(text)
            units.append((text + ' ', syllables))
            # Each word adds an extra 0.5 to the total only, not to the
            # per-segment count, so that residual ends up in the last
            # segment's ratio.
            total_syllable += syllables + 0.5

    segments = [{'content': '', 'time_ratio': 0}]
    current_len = 0
    for text, syllables in units:
        current = segments[-1]
        # Close the segment only if it already holds something. A token
        # too large for an empty segment is still placed, so the loop
        # always consumes its token.
        if current['content'] and len(current['content']) + syllables >= maxLen:
            current['time_ratio'] = current_len / total_syllable
            current = {'content': '', 'time_ratio': 0}
            segments.append(current)
            current_len = 0
        current['content'] += text
        current_len += syllables

    # The last segment still has ratio 0 here, so it takes the remainder.
    segments[-1]['time_ratio'] = 1 - sum(seg['time_ratio'] for seg in segments)
    return segments
-
-
-
-
-
# Demo run on the sample sentence. The guard keeps a plain import of this
# file from executing it. The returned segment list is not used.
if __name__ == "__main__":
    weighting(ipath, 13)
|