sentence_length_weighting.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. import re
  2. from itertools import groupby
  3. from operator import itemgetter
  4. ipath= "中文中文在這Windows on ARM不好用Eng at last"
  5. def syllable_count(word):
  6. word = word.lower()
  7. count = 0
  8. vowels = "aeiouy"
  9. if word[0] in vowels:
  10. count += 1
  11. for index in range(1, len(word)):
  12. if word[index] in vowels and word[index - 1] not in vowels:
  13. count += 1
  14. if word.endswith("e"):
  15. count -= 1
  16. if count == 0:
  17. count += 1
  18. return count
  19. def weighting(in_str, maxLen):
  20. re.findall(r'[\u4e00-\u9fff]+', in_str)
  21. zh_idx = []
  22. eng_idx= []
  23. for i in range(len(in_str)):
  24. if in_str[i] > u'\u4e00' and in_str[i] < u'\u9fff':
  25. zh_idx.append(i)
  26. else:
  27. eng_idx.append(i)
  28. space_index = [m.start() for m in re.finditer(' ', in_str)]
  29. for idx in space_index:
  30. eng_idx.remove(idx)
  31. eng_range_list = []
  32. for k, g in groupby(enumerate(eng_idx), lambda ix : ix[0] - ix[1]):
  33. eng_range = list(map(itemgetter(1), g))
  34. eng_range_list.append(eng_range)
  35. total_syllable = 0
  36. for i in range(len(eng_range_list)):
  37. total_syllable += (syllable_count(in_str[eng_range_list[i][0]:eng_range_list[i][-1]+1])+0.5)
  38. for i in range(len(zh_idx)):
  39. total_syllable+=1
  40. #final chchchchchc[en][en][en]
  41. #[en] is a vocabulary dict with occurence of image
  42. zh_eng_idx_list = []
  43. i = 0
  44. while i < len(in_str):
  45. if in_str[i]==' ':
  46. i+=1
  47. if i in zh_idx:
  48. zh_eng_idx_list.append(i)
  49. i+=1
  50. if i in eng_idx:
  51. for ls in eng_range_list:
  52. if i in ls:
  53. zh_eng_idx_list.append(ls)
  54. i = ls[-1]+1
  55. break
  56. zh_eng_dict_list = [{'content':'','time_ratio':0}]
  57. idx = 0
  58. current_len = 0
  59. sen_idx = 0
  60. while idx < len(zh_eng_idx_list):
  61. str_from_idx = ''
  62. sylla_cnt = 1
  63. if type(zh_eng_idx_list[idx])==type([]):
  64. str_from_idx = in_str[zh_eng_idx_list[idx][0]:zh_eng_idx_list[idx][-1]+1]+' '
  65. sylla_cnt = syllable_count(str_from_idx)
  66. else:
  67. str_from_idx = in_str[zh_eng_idx_list[idx]]
  68. if len(zh_eng_dict_list[sen_idx]['content'])+sylla_cnt>=maxLen:
  69. zh_eng_dict_list[sen_idx]['time_ratio'] = current_len/total_syllable
  70. zh_eng_dict_list.append({'content':'','time_ratio':0})
  71. sen_idx+=1
  72. current_len = 0
  73. else:
  74. current_len += sylla_cnt
  75. zh_eng_dict_list[sen_idx]['content'] += str_from_idx
  76. idx+=1
  77. total_ratio = 0
  78. for obj in zh_eng_dict_list:
  79. total_ratio+=obj['time_ratio']
  80. zh_eng_dict_list[-1]['time_ratio'] = 1-total_ratio
  81. return zh_eng_dict_list
  82. #run
  83. weighting(ipath, 13)