split.py

import re
import difflib
import math
from itertools import groupby
from operator import itemgetter

def syllable_count(word):
    # Rough heuristic: count vowel groups, drop a silent trailing 'e',
    # and never return fewer than one syllable.
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count
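
# Illustrative checks (not part of the original file; values follow the
# heuristic above, not a dictionary):
#   syllable_count('hello')  -> 2
#   syllable_count('the')    -> 1  (the trailing-'e' rule would give 0,
#                                   but the floor keeps every word >= 1)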

def split_sentence(in_str, maxLen):
    # Index every character: CJK ideographs are handled one character at
    # a time, while runs of Latin characters are grouped into words.
    zh_idx = []
    eng_idx = []
    for i in range(len(in_str)):
        if u'\u4e00' <= in_str[i] <= u'\u9fff':
            zh_idx.append(i)
        else:
            eng_idx.append(i)
    space_index = [m.start() for m in re.finditer(' ', in_str)]
    for idx in space_index:
        eng_idx.remove(idx)
    # Group consecutive English indices into per-word index ranges.
    eng_range_list = []
    for k, g in groupby(enumerate(eng_idx), lambda ix: ix[0] - ix[1]):
        eng_range = list(map(itemgetter(1), g))
        eng_range_list.append(eng_range)
    # Estimate the total spoken length: each English word contributes its
    # syllable count plus 0.5, each Chinese character contributes 1.
    total_syllable = 0
    for rng in eng_range_list:
        total_syllable += syllable_count(in_str[rng[0]:rng[-1] + 1]) + 0.5
    total_syllable += len(zh_idx)
    # Build a token list: single indices for Chinese characters, index
    # ranges for whole English words.
    zh_eng_idx_list = []
    i = 0
    while i < len(in_str):
        if in_str[i] == ' ':
            i += 1
        if i in zh_idx:
            zh_eng_idx_list.append(i)
            i += 1
        if i in eng_idx:
            for ls in eng_range_list:
                if i in ls:
                    zh_eng_idx_list.append(ls)
                    i = ls[-1] + 1
                    break
    # Greedily pack tokens into chunks of at most maxLen units and record
    # each chunk's share of the estimated spoken duration.
    zh_eng_dict_list = [{'content': '', 'time_ratio': 0}]
    idx = 0
    current_len = 0
    sen_idx = 0
    while idx < len(zh_eng_idx_list):
        str_from_idx = ''
        sylla_cnt = 1
        if isinstance(zh_eng_idx_list[idx], list):
            str_from_idx = in_str[zh_eng_idx_list[idx][0]:zh_eng_idx_list[idx][-1] + 1] + ' '
            sylla_cnt = syllable_count(str_from_idx)
        else:
            str_from_idx = in_str[zh_eng_idx_list[idx]]
        if zh_eng_dict_list[sen_idx]['content'] and \
                len(zh_eng_dict_list[sen_idx]['content']) + sylla_cnt >= maxLen:
            zh_eng_dict_list[sen_idx]['time_ratio'] = current_len / total_syllable
            zh_eng_dict_list.append({'content': '', 'time_ratio': 0})
            sen_idx += 1
            current_len = 0
            # Do not advance idx: the token that overflowed this chunk
            # becomes the first token of the next one.
        else:
            current_len += sylla_cnt
            zh_eng_dict_list[sen_idx]['content'] += str_from_idx
            idx += 1
    # The last chunk absorbs whatever ratio is left so the total is 1.
    total_ratio = 0
    for obj in zh_eng_dict_list:
        total_ratio += obj['time_ratio']
    zh_eng_dict_list[-1]['time_ratio'] = 1 - total_ratio
    return zh_eng_dict_list
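
# Usage sketch (illustrative input; 13 matches the display limit used in
# parse_script below). Each returned dict carries a chunk of text plus
# its estimated share of the spoken duration:
#   for part in split_sentence(u'微軟的 Surface 系列很受歡迎', 13):
#       print(part['content'], part['time_ratio'])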

def parse_script(file_path, gt_list):
    # Read an SRT-style transcript and snap each recognized line to the
    # closest ground-truth sentence.
    with open(file_path, 'r', encoding="utf-8") as f:
        raw_lines = [line.strip() for line in f]
    lines = adjustSub_by_text_similarity(gt_list, raw_lines)
    # Build one dict per cue with content and start/stop in seconds.
    dict_list = []
    for idx in range(len(lines)):
        script = {}
        script['content'] = lines[idx]
        time_raw = raw_lines[idx * 4 + 1].split(' --> ')
        start = time_raw[0].split(':')
        stop = time_raw[1].split(':')
        script['start'] = float(start[0]) * 3600 + float(start[1]) * 60 + float(start[2].replace(',', '.'))
        script['stop'] = float(stop[0]) * 3600 + float(stop[1]) * 60 + float(stop[2].replace(',', '.'))
        dict_list.append(script)
    # Merge cues that carry the same sentence: the first occurrence keeps
    # the text and is stretched to the last duplicate's stop time.
    script_not_dup_list = []
    for idx in range(len(dict_list)):
        dup_list = []
        for idx_inner in range(len(dict_list)):
            if dict_list[idx_inner]['content'] == dict_list[idx]['content']:
                dup_list.append(idx_inner)
        for dup_idx in dup_list:
            if dup_idx == min(dup_list):
                dict_list[dup_idx]['type'] = 'lead_sentence'
            else:
                dict_list[dup_idx]['type'] = 'duplicated'
        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
        if dict_list[idx]['type'] == 'lead_sentence':
            script_not_dup_list.append(dict_list[idx])
    # Overlapping timelines are not handled here; none observed so far.
    # Cut each merged cue by maximum display length, distributing the
    # cue's duration across the pieces by their time_ratio.
    splitted_dict = []
    for dic in script_not_dup_list:
        accumulated_duration = 0
        duration = dic['stop'] - dic['start']
        for sub_dic in split_sentence(dic['content'], 13):
            new_dic = {}
            ind_duration = duration * sub_dic['time_ratio']
            new_dic['start'] = dic['start'] + accumulated_duration
            accumulated_duration += ind_duration
            new_dic['content'] = sub_dic['content']
            new_dic['duration'] = ind_duration
            splitted_dict.append(new_dic)
    for obj in splitted_dict:
        print(obj)
    return splitted_dict
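
# parse_script assumes an SRT-style file with four lines per cue, e.g.:
#   1
#   00:00:01,000 --> 00:00:03,500
#   recognized text for this cue
#   (blank line)
# which is why the time line sits at raw_lines[idx * 4 + 1] and the text
# at raw_lines[idx * 4 + 2].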

def adjustSub_by_text_similarity(gts, gens_raw):
    # Every 4th line of the SRT block (index, time, text, blank) is the
    # recognized text.
    gens = []
    for idx in range((len(gens_raw) + 1) // 4):
        gens.append(gens_raw[idx * 4 + 2])
    # Candidate pool: single ground-truth sentences plus 2- and 3-sentence
    # concatenations, since the recognizer may merge neighbouring sentences.
    combine2 = [''.join([i, j]) for i, j in zip(gts, gts[1:])]
    combine3 = [''.join([i, j, k]) for i, j, k in zip(gts, gts[1:], gts[2:])]
    alls = gts + combine2 + combine3
    adjusted = [None] * len(gens)
    duplicated_list = []
    for idx in range(len(gens)):
        match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
        if not match_text:
            # No candidate is close enough; keep the recognized text as-is.
            adjusted[idx] = gens[idx]
            continue
        if match_text[0] in duplicated_list:
            # Best match already used: prefer repeating the previous line,
            # otherwise take the best not-yet-used candidate.
            for mt in match_text:
                if mt == adjusted[idx - 1] or mt not in duplicated_list:
                    adjusted[idx] = mt
                    break
        else:
            adjusted[idx] = match_text[0]
        duplicated_list.append(match_text[0])
    return adjusted
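
# The matching itself is plain difflib, ranked best-first; with the very
# low cutoff used above almost every candidate passes, so only the
# ranking matters. Illustrative call (values hypothetical):
#   difflib.get_close_matches('helo world', ['hello world', 'goodbye'], cutoff=0.1)
#   -> ['hello world', 'goodbye']   # match_text[0] is the best candidate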

def trim_punctuation(s):
    # Replace every run of non-CJK, non-alphanumeric characters with a
    # space, but keep punctuation sandwiched between digits (e.g. '3,5').
    pat_block = u'[^\u4e00-\u9fff0-9a-zA-Z]+'
    pattern = u'([0-9]+{0}[0-9]+)|{0}'.format(pat_block)
    res = re.sub(pattern, lambda x: x.group(1) if x.group(1) else u" ", s)
    return res
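
# Illustrative behaviour (not from the original file):
#   trim_punctuation(u'Windows 10, on ARM!') -> u'Windows 10 on ARM '
#   trim_punctuation(u'3,5')                 -> u'3,5'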

def splitter(s):
    # Split on sentence-ending punctuation, keeping the trailing mark with
    # each piece. NOTE: the full-width marks !?, are an assumption here;
    # the original character class appears to have lost them to encoding
    # damage (it contained a bare space where a full-width mark would sit).
    for sent in re.findall(u'[^!?,。!?,]+[!?,。!?,]?', s, flags=re.U):
        yield sent


def split_by_pun(s):
    res = list(splitter(s))
    return res
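
# Illustrative split (with the full-width marks assumed above):
#   split_by_pun(u'你好!今天如何?') -> [u'你好!', u'今天如何?']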

def split_by_word(s):
    # Fixed-size fallback: chop the string into 3-character slices.
    slice_size = 3
    paragraph_len = len(s)
    slice_num = int(math.ceil(paragraph_len / slice_size))
    slice_list = []
    for n in range(slice_num):
        slice_list.append(s[n * slice_size:n * slice_size + slice_size])
    return slice_list
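
# Illustrative slicing:
#   split_by_word(u'一二三四五六七') -> [u'一二三', u'四五六', u'七']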

if __name__ == '__main__':
    raw_str = '更糟糕的是,與大量關注相伴的並非用戶讚賞,而是 Windows 10 on ARM 的不成熟暴露無遺,以及隨之而來的如潮差評──對用戶使用體驗影響最惡劣的,莫過於 Windows 10 on ARM 僅能透過模擬兼容老舊過時的 32 位元 x86 應用,而對效能與普及度俱佳的 64 位元 x86(即 x64)應用無能為力'
    sub_dict = parse_script("out.txt", split_by_pun(raw_str))