# sentence_length_weighting.py

  1. import re
  2. from itertools import groupby
  3. from operator import itemgetter
  4. ipath= "中文中文在這Windows on ARM不好用Eng at last"
  5. re.findall(r'[\u4e00-\u9fff]+', ipath)
  6. zh_idx = []
  7. eng_idx= []
  8. for i in range(len(ipath)):
  9. if ipath[i] > u'\u4e00' and ipath[i] < u'\u9fff':
  10. zh_idx.append(i)
  11. else:
  12. eng_idx.append(i)
  13. for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):
  14. eng_range = list(map(itemgetter(1), g))
  15. ipath2 = ipath[0 : eng_range[0] : ] + ipath[eng_range[-1]+1 : :]
  16. print(eng_range)
  17. def split_sentence():