|
@@ -4,22 +4,79 @@ import re
|
|
|
from itertools import groupby
|
|
|
from operator import itemgetter
|
|
|
# Sample input: Chinese ideographs mixed with space-separated English words.
ipath = "中文中文在這Windows on ARM不好用Eng at last"
|
|
|
-re.findall(r'[\u4e00-\u9fff]+', ipath)
|
|
|
|
|
|
-zh_idx = []
|
|
|
-eng_idx= []
|
|
|
-for i in range(len(ipath)):
|
|
|
- if ipath[i] > u'\u4e00' and ipath[i] < u'\u9fff':
|
|
|
- zh_idx.append(i)
|
|
|
- else:
|
|
|
- eng_idx.append(i)
|
|
|
def syllable_count(word):
    """Estimate the number of syllables in an English word.

    Heuristic: every maximal run of vowels (``a e i o u y``, compared
    case-insensitively) counts as one syllable, a trailing "e" removes one,
    and any non-empty word is reported as having at least one syllable.

    Args:
        word: The word to analyse.

    Returns:
        The estimated syllable count: ``0`` for an empty string, otherwise
        an integer >= 1.
    """
    # Guard the empty string explicitly; there is no first character to
    # inspect and an empty word has no syllables.
    if not word:
        return 0

    word = word.lower()
    vowels = "aeiouy"

    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a consecutive vowel run starts a syllable.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    # A word-final "e" is treated as silent.
    if word.endswith("e"):
        count -= 1

    # Words such as "the" would otherwise drop to zero.
    return max(count, 1)
|
|
|
|
|
|
|
|
|
-for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):
|
|
|
- eng_range = list(map(itemgetter(1), g))
|
|
|
- ipath2 = ipath[0 : eng_range[0] : ] + ipath[eng_range[-1]+1 : :]
|
|
|
- print(eng_range)
|
|
|
def weighting(in_str):
    """Tokenise a mixed Chinese/English string and print the token list.

    The string is scanned left to right and split into tokens:

    * each CJK ideograph in U+4E00..U+9FFF (both ends inclusive) becomes one
      token, represented by its integer index in ``in_str``;
    * each maximal run of consecutive characters that are neither such an
      ideograph nor a literal space becomes one token, represented by the
      list of its character indices.

    Literal space characters only separate runs and never appear in the
    output. A syllable total is also computed (one per ideograph plus
    ``syllable_count`` of every run) but nothing consumes it yet.

    Args:
        in_str: The text to analyse; may be empty.

    Returns:
        None. The ordered token list is written to stdout via ``print``.
    """
    tokens = []       # ordered result: int per ideograph, list[int] per run
    current_run = []  # indices of the non-CJK run currently being collected
    zh_count = 0

    for i, char in enumerate(in_str):
        # Inclusive bounds, so the first and last code points of the block
        # (e.g. U+4E00) are classified as Chinese too.
        is_zh = '\u4e00' <= char <= '\u9fff'

        if not is_zh and char != ' ':
            current_run.append(i)
            continue

        # An ideograph or a space terminates any run in progress.
        if current_run:
            tokens.append(current_run)
            current_run = []
        if is_zh:
            tokens.append(i)
            zh_count += 1

    if current_run:
        tokens.append(current_run)

    # One syllable per ideograph plus the estimate for every non-CJK run.
    # NOTE(review): this total is not used by anything below yet.
    total_syllable = zh_count + sum(
        syllable_count(in_str[run[0]:run[-1] + 1])
        for run in tokens
        if isinstance(run, list)
    )

    print(tokens)

    # TODO: turn each token into a record shaped like
    # {'content': '', 'ratio': 0, 'image': False}; only the index-based
    # token list is produced so far.
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
# Demo: run the tokeniser on the sample string assigned to ``ipath`` above.
weighting(ipath)
|