فهرست منبع

working on sentence weighting

ming 3 سال پیش
والد
کامیت
c9342653be
2 فایلهای تغییر یافته به همراه 22 افزوده شده و 1 حذف شده
  1. 19 0
      subGenerator/sentence_length_weighting.py
  2. 3 1
      subGenerator/split.py

+ 19 - 0
subGenerator/sentence_length_weighting.py

@@ -0,0 +1,19 @@
+import re
+from itertools import groupby
+from operator import itemgetter
+ipath= "中文中文在這Windows on ARM不好用Eng at last"  # sample string mixing CJK and Latin text
+re.findall(r'[\u4e00-\u9fff]+', ipath)  # matches runs in U+4E00..U+9FFF (inclusive); the result is discarded

+zh_idx = []  # indices of characters strictly between U+4E00 and U+9FFF
+eng_idx= []  # indices of every other character (spaces included)
+for i in range(len(ipath)):
+    if ipath[i] > u'\u4e00' and ipath[i] < u'\u9fff':  # strict bounds: U+4E00 and U+9FFF themselves go to the else branch
+        zh_idx.append(i)
+    else:
+        eng_idx.append(i)


+for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):  # position minus index is constant within a run of consecutive indices
+    eng_range = list(map(itemgetter(1), g))  # the indices belonging to this consecutive run
+    ipath2 = ipath[0 : eng_range[0] : ] + ipath[eng_range[-1]+1  : :]  # ipath with this run cut out; not read again in this hunk
+    print(eng_range)

+ 3 - 1
subGenerator/split.py

@@ -20,11 +20,13 @@ def sentence_time_ratio(text,maxLen):
             sentences[t] = text[t*sen_len:t*sen_len+sen_len]
             time_ratio[t] = len(sentences[t])/total_len
     else:
-        
         time_ratio = [1]
         sentences = [text]
     
     return time_ratio, sentences
+#1 sentence in, split array out
+
+
 def parse_script(file_path,gt_list):
     with open(file_path, 'r',encoding="utf-8") as f:
         raw_lines = [line.strip() for line in f]