瀏覽代碼

tune parse subtitle

ming 3 年之前
父節點
當前提交
ba82361818
共有 2 個文件被更改,包括 74 次插入14 次删除
  1. 3 0
      subGenerator/ProcessSub.py
  2. 71 14
      subGenerator/sentence_length_weighting.py

+ 3 - 0
subGenerator/ProcessSub.py

@@ -75,6 +75,9 @@ gg = parse_script('out.txt')
 for g in gg:
     print(g) 
             
+
+
+            
     
 gts = 'Hello Kitty 於2018年,加入YouTube開始活動,在自我介紹的影片裡,Kitty表示一直憧憬著,想在YouTube跟大家見面,一開頻道就吸引許多粉絲訂閱,目前有28萬訂閱者,接下來這位花生君,於2017年加入YouTube開始活動,他的外型太過特別,花生頭、紅色圍巾與紙尿布,被觀眾評價為,第一眼看上很噁心,但看著看著還挺可愛,目前有12萬訂閱者'.split(',')
 gens = ['Hello Kitty瑜2018年加入YouTube開始活動','再次我介紹的影片裡','Kitty表示一直憧憬著長在YouTube跟大家見面','一開頻道就吸引許多粉絲訂閱付錢有28萬訂閱者','接下來這位花生君瑜2017年加入YouTube開始湖','活動','他的外型太過特別花生桃紅色圍巾魚紙尿布','被觀眾評價為第一眼看上很噁心但看著看著還挺','秦可愛','目前有12萬訂閱者']

+ 71 - 14
subGenerator/sentence_length_weighting.py

@@ -4,22 +4,79 @@ import re
 from itertools import groupby
 from operator import itemgetter
 ipath= "中文中文在這Windows on ARM不好用Eng at last"
-re.findall(r'[\u4e00-\u9fff]+', ipath)
 
-zh_idx = []
-eng_idx= []
-for i in range(len(ipath)):
-    if ipath[i] > u'\u4e00' and ipath[i] < u'\u9fff':
-        zh_idx.append(i)
-    else:
-        eng_idx.append(i)
+def syllable_count(word):
+    """Estimate the number of syllables in an English word.
+
+    Heuristic: every vowel (a, e, i, o, u, y) that starts the word or
+    follows a non-vowel counts as one syllable; a word ending in "e"
+    loses one (silent "e"); any non-empty word has at least one syllable.
+
+    Args:
+        word: The word to measure; case is ignored.
+
+    Returns:
+        The estimated syllable count, or 0 for an empty string.
+    """
+    # Guard: indexing word[0] below would raise IndexError on "".
+    if not word:
+        return 0
+
+    word = word.lower()
+    vowels = "aeiouy"
+
+    count = 1 if word[0] in vowels else 0
+    # Count the start of each vowel group after the first character.
+    for previous, current in zip(word, word[1:]):
+        if current in vowels and previous not in vowels:
+            count += 1
+
+    # A trailing "e" is treated as silent.
+    if word.endswith("e"):
+        count -= 1
+
+    # Never report fewer than one syllable for a non-empty word.
+    return max(count, 1)
 
 
-for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):
-    eng_range = list(map(itemgetter(1), g))
-    ipath2 = ipath[0 : eng_range[0] : ] + ipath[eng_range[-1]+1  : :]
-    print(eng_range)
+def weighting(in_str):
+    """Segment a mixed Chinese/non-Chinese string and print the segments.
+
+    Each character in the CJK Unified Ideographs range U+4E00..U+9FFF
+    (both ends inclusive) is a segment of its own. Each maximal run of
+    other non-space characters is one segment. Spaces only separate runs.
+
+    The printed list follows string order; a Chinese character appears as
+    its int index, a run appears as the list of its consecutive indices.
+
+    Args:
+        in_str: The text to segment.
+
+    Returns:
+        None; the segment list is printed.
+    """
 
+    zh_idx = []
+    eng_idx = []
+    for i, ch in enumerate(in_str):
+        if ch == ' ':
+            # Spaces belong to neither group; they only split runs.
+            continue
+        # Inclusive bounds, so that U+4E00 itself counts as Chinese.
+        if u'\u4e00' <= ch <= u'\u9fff':
+            zh_idx.append(i)
+        else:
+            eng_idx.append(i)
 
-def split_sentence():
-    
+    # Consecutive indices share the same (position - index) difference,
+    # so groupby on that difference yields one group per contiguous run.
+    eng_range_list = []
+    for k, g in groupby(enumerate(eng_idx), lambda ix: ix[0] - ix[1]):
+        eng_range_list.append(list(map(itemgetter(1), g)))
+
+    # One syllable per Chinese character, plus the heuristic estimate for
+    # each non-Chinese run.
+    # NOTE(review): total_syllable is not consumed yet; presumably it will
+    # feed the per-segment 'ratio' value -- TODO confirm.
+    total_syllable = len(zh_idx)
+    for eng_range in eng_range_list:
+        total_syllable += syllable_count(
+            in_str[eng_range[0]:eng_range[-1] + 1])
+
+    # Merge both groups back into string order. Lookups use a set and a
+    # dict keyed by run start so each position is resolved in O(1).
+    zh_positions = set(zh_idx)
+    run_by_start = {run[0]: run for run in eng_range_list}
+    zh_eng_idx_list = []
+    i = 0
+    while i < len(in_str):
+        if i in zh_positions:
+            zh_eng_idx_list.append(i)
+            i += 1
+        elif i in run_by_start:
+            run = run_by_start[i]
+            zh_eng_idx_list.append(run)
+            i = run[-1] + 1
+        else:
+            # A space: skip it.
+            i += 1
+    print(zh_eng_idx_list)
+
+    # TODO: build one dict per segment with the keys 'content', 'ratio'
+    # and 'image'; that step is not implemented yet.
+
+
+weighting(ipath)