process_pun.py 1001 B

123456789101112131415161718192021222324252627282930
  1. import re
  2. s = u"三宅一秀空間創藝|天使總監郁琇琇 室內設計師,擅長鄉村風、古典風、美式風、奢華風、混搭風以及北歐風,多年被評價為幸福空間觀眾最愛室內設計師之列。屢獲亞洲、韓國、英國、義大利等多國設計大獎。"
  3. def trim_punctuation(s):
  4. pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+';
  5. pattern = u'([0-9]+{0}[0-9]+)|{0}'.format(pat_block)
  6. res = re.sub(pattern, lambda x: x.group(1) if x.group(1) else u"" ,s)
  7. return res
  8. def splitter(s):
  9. for sent in re.findall(u'[^!?,。\.\!\?]+[!?。\.\!\?]?', s, flags=re.U):
  10. yield sent
  11. def split_by_pun(s):
  12. res = list(splitter(s))
  13. return res
  14. maxLen = 10
  15. s_list = split_by_pun(s)
  16. for sen in s_list:
  17. print(sen)
  18. print('-------------------------------')
  19. trim_list = [None]*len(s_list)
  20. for idx in range(len(s_list)):
  21. trim_list[idx] = trim_punctuation(s_list[idx])
  22. for sen in trim_list:
  23. if len(sen) >10:
  24. sen+='*'
  25. print(sen)