|
@@ -0,0 +1,30 @@
|
|
|
+import re
|
|
|
+
|
|
|
# Sample Traditional-Chinese text used by the demo code at the bottom of the
# file.  Adjacent literals are concatenated at compile time, so the value is a
# single string.
s = (
    u"三宅一秀空間創藝|天使總監郁琇琇 室內設計師,"
    u"擅長鄉村風、古典風、美式風、奢華風、混搭風以及北歐風,"
    u"多年被評價為幸福空間觀眾最愛室內設計師之列。"
    u"屢獲亞洲、韓國、英國、義大利等多國設計大獎。"
)
|
|
|
+
|
|
|
# A "separator run" is one or more characters that are not a CJK ideograph in
# U+4E00-U+9FFF, an ASCII digit, or an ASCII letter.
_SEPARATOR_RUN = u'[^\u4e00-\u9fff0-9a-zA-Z]+'

# Alternative 1 captures a separator run that sits directly between two ASCII
# digits (so it can be kept); alternative 2 matches every other separator run.
# Lookarounds are used instead of consuming the digits so that chained numbers
# such as "1.2.3" or "1,000,000" keep *all* of their inner separators.
_TRIM_PATTERN = re.compile(
    u'(?<=[0-9])({0})(?=[0-9])|{0}'.format(_SEPARATOR_RUN)
)


def trim_punctuation(s):
    """Remove punctuation, whitespace and other separator characters from *s*.

    Everything that is not a CJK ideograph (U+4E00-U+9FFF), an ASCII digit or
    an ASCII letter is deleted, except for separator runs that are immediately
    preceded and followed by an ASCII digit (e.g. the dot in ``3.14``), which
    are preserved unchanged.

    Args:
        s: The text to clean.

    Returns:
        A new string with the separator characters removed.
    """
    # group(1) is only set for the digit-enclosed alternative; otherwise the
    # matched run is replaced by the empty string.
    return _TRIM_PATTERN.sub(lambda m: m.group(1) or u'', s)
|
|
|
+
|
|
|
# One sentence = a run of characters that are none of  ! ? , 。 .  optionally
# followed by a single terminator out of  ! ? 。 .
# The comma only separates: it is excluded from the run and is not an accepted
# terminator, so it never appears in the output.
# Inside a character class these characters are literal, so no backslash
# escapes are needed (the previous non-raw literal relied on invalid escape
# sequences such as "\." which newer Python versions warn about).
_SENTENCE_PATTERN = re.compile(u'[^!?,。.]+[!?。.]?', flags=re.UNICODE)


def splitter(s):
    """Yield the sentence-like pieces of *s*, split on punctuation.

    A piece keeps its trailing ``!``, ``?``, ``。`` or ``.`` when one is
    present; commas act as separators and are dropped.  Runs consisting only
    of separator characters produce no output.

    NOTE: only the characters listed in the pattern are recognised; full-width
    forms of ``!``, ``?`` and ``,`` are treated as ordinary text.

    Args:
        s: The text to split.

    Yields:
        Each piece of text in order of appearance.
    """
    for match in _SENTENCE_PATTERN.finditer(s):
        yield match.group()
|
|
|
def split_by_pun(s):
    """Return every piece produced by ``splitter(s)`` as a list."""
    return list(splitter(s))
|
|
|
+
|
|
|
# Demo driver: split the sample text into sentences, print them, then print
# the punctuation-stripped version of each sentence.

# Trimmed sentences longer than this many characters are flagged with '*'.
maxLen = 10

s_list = split_by_pun(s)
for sen in s_list:
    print(sen)
print('-------------------------------')

trim_list = [trim_punctuation(piece) for piece in s_list]
for sen in trim_list:
    # Mark over-long sentences; the threshold comes from maxLen instead of a
    # duplicated literal.
    if len(sen) > maxLen:
        sen += '*'
    # NOTE(review): every trimmed sentence is printed, flagged or not — the
    # original indentation was flattened, confirm this matches the intent.
    print(sen)
|
|
|
+
|