Browse Source

splitter added

ming 3 years ago
parent
commit
d2b1601cad
1 changed files with 30 additions and 0 deletions
  1. 30 0
      process_pun.py

+ 30 - 0
process_pun.py

@@ -0,0 +1,30 @@
+import re
+
# Sample input for the demo at the bottom of the file: CJK text containing
# commas, enumeration commas, full stops, a space and a vertical bar.
s = (
    u"三宅一秀空間創藝|天使總監郁琇琇 室內設計師,擅長鄉村風、古典風、美式風、奢華風、混搭風以及北歐風,"
    u"多年被評價為幸福空間觀眾最愛室內設計師之列。"
    u"屢獲亞洲、韓國、英國、義大利等多國設計大獎。"
)
+
def trim_punctuation(s):
    """Remove punctuation and other symbol runs from *s*.

    A character is kept when it is a CJK unified ideograph
    (U+4E00-U+9FFF), an ASCII digit or an ASCII letter. Every run of any
    other characters is deleted, except when the run sits directly between
    two ASCII digits (``3.14``, ``1,000``, ``2020-01-02``); such a run is
    preserved verbatim so numbers, versions and dates stay readable.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text.
    """
    pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+'
    digits = u'0123456789'

    def keep_or_drop(match):
        # Decide from the neighbouring characters instead of consuming the
        # digits in the pattern itself: consuming them made the second
        # separator of "1.2.3" invisible to the digit-separator-digit rule.
        start, end = match.span()
        between_digits = (
            start > 0
            and end < len(s)
            and s[start - 1] in digits
            and s[end] in digits
        )
        return match.group(0) if between_digits else u''

    return re.sub(pat_block, keep_or_drop, s)
+
# One piece is a run of non-delimiter characters plus, optionally, the
# sentence terminator that ends it. Terminators are ! ? . and the ideographic
# full stop; commas only separate pieces and are never part of a match.
# The full-width forms (U+FF01 '!', U+FF1F '?', U+FF0C ',') are listed next to
# their ASCII counterparts because CJK text normally uses them.
_SENTENCE_RE = re.compile(
    u'[^!?。.,\uff01\uff1f\uff0c]+[!?。.\uff01\uff1f]?',
    flags=re.U,
)


def splitter(s):
    """Yield the punctuation-delimited pieces of *s*, in order.

    Each yielded piece keeps its trailing sentence terminator when it has
    one; commas act as separators and are dropped. Runs consisting only of
    delimiters yield nothing.

    Args:
        s: The text to split.

    Yields:
        The successive pieces of *s*.
    """
    for match in _SENTENCE_RE.finditer(s):
        yield match.group(0)
def split_by_pun(s):
    """Return the pieces produced by ``splitter(s)`` collected into a list."""
    return [piece for piece in splitter(s)]
+
# Demo: print the sample text split into pieces, then print each piece with
# its punctuation stripped, marking the long ones with a trailing '*'.
maxLen = 10  # stripped pieces longer than this many characters get the '*'
s_list = split_by_pun(s)
for sen in s_list:
    print(sen)
print('-------------------------------')
trim_list = [trim_punctuation(sen) for sen in s_list]
for sen in trim_list:
    # Compare against maxLen rather than a second hard-coded 10 so the
    # threshold is configured in exactly one place.
    if len(sen) > maxLen:
        sen += '*'
    print(sen)
+