4 년 전 · ba82361818
--- a/subGenerator/ProcessSub.py
+++ b/subGenerator/ProcessSub.py
@@ -75,6 +75,9 @@ gg = parse_script('out.txt')
 
				 for g in gg:
			
 
				     print(g) 
			
 
				             
			
 
				+
			
 
				+
			
 
				+            
			
 
				     
			
 
				 gts = 'Hello Kitty 於2018年,加入YouTube開始活動,在自我介紹的影片裡,Kitty表示一直憧憬著,想在YouTube跟大家見面,一開頻道就吸引許多粉絲訂閱,目前有28萬訂閱者,接下來這位花生君,於2017年加入YouTube開始活動,他的外型太過特別,花生頭、紅色圍巾與紙尿布,被觀眾評價為,第一眼看上很噁心,但看著看著還挺可愛,目前有12萬訂閱者'.split(',')
			
 
				 gens = ['Hello Kitty瑜2018年加入YouTube開始活動','再次我介紹的影片裡','Kitty表示一直憧憬著長在YouTube跟大家見面','一開頻道就吸引許多粉絲訂閱付錢有28萬訂閱者','接下來這位花生君瑜2017年加入YouTube開始湖','活動','他的外型太過特別花生桃紅色圍巾魚紙尿布','被觀眾評價為第一眼看上很噁心但看著看著還挺','秦可愛','目前有12萬訂閱者']
			
--- a/subGenerator/sentence_length_weighting.py
+++ b/subGenerator/sentence_length_weighting.py
@@ -4,22 +4,79 @@ import re
 
				 from itertools import groupby
			
 
				 from operator import itemgetter
			
 
				 # Sample mixed Chinese/English string; it is passed to weighting() at the bottom of this file.
				 ipath= "中文中文在這Windows on ARM不好用Eng at last"
			
 
				-re.findall(r'[\u4e00-\u9fff]+', ipath)
			
 
				 
			
 
				-zh_idx = []
			
 
				-eng_idx= []
			
 
				-for i in range(len(ipath)):
			
 
				-    if ipath[i] > u'\u4e00' and ipath[i] < u'\u9fff':
			
 
				-        zh_idx.append(i)
			
 
				-    else:
			
 
				-        eng_idx.append(i)
			
 
				def syllable_count(word):
				    """Estimate how many syllables an English word has.
				
				    Heuristic: every maximal run of vowels (``aeiouy``) counts as one
				    syllable, a trailing "e" is treated as silent and discounted, and any
				    non-empty word is reported as having at least one syllable.
				
				    Args:
				        word: The word to measure; case is ignored.
				
				    Returns:
				        The estimated syllable count: 0 for an empty string, otherwise >= 1.
				    """
				    word = word.lower()
				    if not word:
				        # Nothing to pronounce; also avoids indexing into an empty string.
				        return 0
				    vowels = "aeiouy"
				    count = 0
				    previous_was_vowel = False
				    for char in word:
				        is_vowel = char in vowels
				        # Only the first vowel of a consecutive run starts a new syllable.
				        if is_vowel and not previous_was_vowel:
				            count += 1
				        previous_was_vowel = is_vowel
				    if word.endswith("e"):
				        # A final "e" is assumed to be silent.
				        count -= 1
				    # Words with no counted vowel run (e.g. "the", "bcd") still get one syllable.
				    return max(count, 1)
			
 
				 
			
 
				 
			
 
				-for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):
			
 
				-    eng_range = list(map(itemgetter(1), g))
			
 
				-    ipath2 = ipath[0 : eng_range[0] : ] + ipath[eng_range[-1]+1  : :]
			
 
				-    print(eng_range)
			
 
				def weighting(in_str):
				    """Print the Chinese-character / English-word index layout of a string.
				
				    The string is scanned once, left to right:
				
				    * every CJK unified ideograph (U+4E00..U+9FFF, both ends included)
				      contributes its own index as a bare int;
				    * every maximal run of other characters that contains no literal space
				      contributes one list holding the indices of that run;
				    * literal spaces are dropped and only act as run separators.
				
				    The resulting list is printed; nothing is returned.
				
				    Args:
				        in_str: The mixed Chinese/English text to analyse.
				
				    Returns:
				        None.
				    """
				    zh_eng_idx_list = []
				    current_run = None  # index list of the non-CJK run being extended, if any
				    for idx, char in enumerate(in_str):
				        if char == ' ':
				            # A space ends the current run and is itself not recorded.
				            current_run = None
				        elif u'\u4e00' <= char <= u'\u9fff':
				            # Inclusive bounds so U+4E00 itself is treated as Chinese.
				            zh_eng_idx_list.append(idx)
				            current_run = None
				        else:
				            if current_run is None:
				                # First character of a new run: register the list once,
				                # then keep appending to that same list object.
				                current_run = []
				                zh_eng_idx_list.append(current_run)
				            current_run.append(idx)
				    # TODO(review): the syllable-based weighting is not implemented yet.
				    # The planned final shape appears to be "ch ch ch ... [en][en][en]",
				    # where each [en] entry is a vocabulary dict carrying an image flag
				    # (template: {'content': '', 'ratio': 0, 'image': False}) - confirm.
				    # When the ratio is added, count one syllable per Chinese character
				    # plus syllable_count() of each English run.
				    print(zh_eng_idx_list)
			
 
				# Top-level call: runs weighting() on the sample string ipath whenever this file is executed or imported.
				+weighting(ipath)