split.py

import re
import difflib
import math


# One sentence in, a list of length-capped chunks out.
def sentence_time_ratio(text, maxLen):
    """Split text into roughly even chunks of around maxLen characters.

    Returns each chunk's share of the total length (used later to apportion
    a subtitle's duration across the chunks) together with the chunks.
    """
    total_len = len(text)
    if total_len > maxLen:
        left_word = total_len % maxLen
        times = int(math.ceil(total_len / maxLen))
        # If the leftover chunk would be very short, split once more so the
        # pieces stay roughly even.
        if left_word < 5:
            times += 1
        sen_len = int(total_len / times)
        time_ratio = [None] * times
        sentences = [None] * times
        print(times, ',', total_len, ',', sen_len)  # debug trace
        for t in range(times):
            if t == times - 1:
                # The final chunk absorbs any remainder so that no
                # characters are silently dropped.
                sentences[t] = text[t * sen_len:]
            else:
                sentences[t] = text[t * sen_len:(t + 1) * sen_len]
            time_ratio[t] = len(sentences[t]) / total_len
    else:
        time_ratio = [1]
        sentences = [text]
    return time_ratio, sentences
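
# A minimal usage sketch (made-up input; 13 matches the per-line budget that
# parse_script passes below): a 30-character string comes back as four chunks
# of 7, 7, 7 and 9 characters, and the ratios sum to 1.0. Uncomment to try:
# demo_ratios, demo_chunks = sentence_time_ratio('a' * 30, 13)
# assert [len(c) for c in demo_chunks] == [7, 7, 7, 9]
# assert abs(sum(demo_ratios) - 1.0) < 1e-9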


def parse_script(file_path, gt_list):
    """Parse an SRT-style script file and re-split its sentences by length."""
    with open(file_path, 'r', encoding="utf-8") as f:
        raw_lines = [line.strip() for line in f]
    lines = adjustSub_by_text_similarity(gt_list, raw_lines)
    # Build one dict per subtitle block: content plus start/stop in seconds.
    dict_list = []
    for idx in range(len(lines)):
        script = {}
        script['content'] = lines[idx]
        time_raw = raw_lines[idx * 4 + 1].split(' --> ')
        start = time_raw[0].split(':')
        stop = time_raw[1].split(':')
        script['start'] = float(start[0]) * 3600 + float(start[1]) * 60 + float(start[2].replace(',', '.'))
        script['stop'] = float(stop[0]) * 3600 + float(stop[1]) * 60 + float(stop[2].replace(',', '.'))
        # Duration is needed below when apportioning time across chunks.
        script['duration'] = script['stop'] - script['start']
        dict_list.append(script)
    # Merge duplicated sentences: the first occurrence becomes the lead
    # sentence and absorbs the stop time of the last duplicate.
    script_not_dup_list = []
    for idx in range(len(dict_list)):
        dup_list = []
        for idx_inner in range(len(dict_list)):
            if dict_list[idx_inner]['content'] == dict_list[idx]['content']:
                dup_list.append(idx_inner)
        for dup_idx in dup_list:
            if dup_idx == min(dup_list):
                dict_list[dup_idx]['type'] = 'lead_sentence'
            else:
                dict_list[dup_idx]['type'] = 'duplicated'
        dict_list[dup_list[0]]['stop'] = dict_list[dup_list[-1]]['stop']
        dict_list[dup_list[0]]['duration'] = (
            dict_list[dup_list[0]]['stop'] - dict_list[dup_list[0]]['start'])
        if dict_list[idx]['type'] == 'lead_sentence':
            script_not_dup_list.append(dict_list[idx])
    # Avoid subtitle overlap? No timeline overlap observed so far.
    # Cut by max length ----> English word-separation problem {eng_idx}:
    # English, Chinese and space counts differ.
    new_idx = 0
    splitted_dict = []
    for dic in script_not_dup_list:  # iterate the de-duplicated list
        time_ratio, sentences = sentence_time_ratio(dic['content'], 13)
        for s in range(len(sentences)):
            new_dict = {}
            new_dict['index'] = new_idx
            # Each chunk starts where the previous chunks' durations end.
            start = dic['start']
            for t in range(s):
                start += dic['duration'] * time_ratio[t]
            new_dict['start'] = start
            new_dict['duration'] = dic['duration'] * time_ratio[s]
            new_dict['content'] = sentences[s]
            new_idx += 1
            splitted_dict.append(new_dict)
    return splitted_dict
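
# The parser above assumes the standard 4-line SRT block layout (index line,
# time range, text, blank separator), which is why raw_lines[idx * 4 + 1] is
# the time line and raw_lines[idx * 4 + 2] is the text line. For example:
#
#   1
#   00:00:01,000 --> 00:00:03,500
#   some subtitle text
#   (blank line)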


def adjustSub_by_text_similarity(gts, gens_raw):
    """Replace each generated subtitle line with its closest ground-truth
    sentence, or a concatenation of two or three consecutive sentences."""
    # Every 4th raw line, starting at offset 2, is the text of an SRT block.
    gens = []
    for idx in range(int((len(gens_raw) + 1) / 4)):
        gens.append(gens_raw[idx * 4 + 2])
    # Candidate pool: single sentences plus 2- and 3-sentence concatenations,
    # since one subtitle line may span several ground-truth sentences.
    combine2 = [''.join([i, j]) for i, j in zip(gts, gts[1:])]
    combine3 = [''.join([i, j, k]) for i, j, k in zip(gts, gts[1:], gts[2:])]
    alls = gts + combine2 + combine3
    adjusted = [None] * len(gens)
    duplicated_list = []
    for idx in range(len(gens)):
        match_text = difflib.get_close_matches(gens[idx], alls, cutoff=0.1)
        if not match_text:  # nothing close enough: keep the generated line
            adjusted[idx] = gens[idx]
        elif match_text[0] in duplicated_list:
            # Best match is already taken: prefer the previous line's match
            # (a genuine repeat) or the next unused candidate.
            for mt in match_text:
                if mt == adjusted[idx - 1] or mt not in duplicated_list:
                    adjusted[idx] = mt
                    break
            else:
                adjusted[idx] = match_text[0]  # all candidates taken: fall back
        else:
            adjusted[idx] = match_text[0]
            duplicated_list.append(match_text[0])
    return adjusted
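
# difflib.get_close_matches returns candidates ordered best match first; a
# quick sketch with made-up strings (default cutoff is 0.6):
# difflib.get_close_matches('windos 10', ['Windows 10', 'macOS'])
# -> ['Windows 10']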


def trim_punctuation(s):
    """Collapse runs of punctuation/symbols into a single space, except when
    they sit between digits (so '1,000' or '3.14' stay intact)."""
    pat_block = '[^\u4e00-\u9fff0-9a-zA-Z]+'  # anything but CJK, digits, ASCII letters
    pattern = '([0-9]+{0}[0-9]+)|{0}'.format(pat_block)
    res = re.sub(pattern, lambda x: x.group(1) if x.group(1) else ' ', s)
    return res
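
# Sketch of the behaviour on a made-up string:
# trim_punctuation('Windows 10 on ARM!! (x64) -- 1,000')
# -> 'Windows 10 on ARM x64 1,000'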


def splitter(s):
    """Yield fragments of s split on sentence punctuation, keeping a trailing
    !, ? or 。 with its fragment (commas split but are dropped)."""
    # Assumption: full-width ，！？ are added alongside the ASCII forms so that
    # Chinese text such as raw_str below also splits on its punctuation.
    for sent in re.findall(r'[^!?,，。！？]+[!?。！？ ]?', s, flags=re.U):
        yield sent


def split_by_pun(s):
    res = list(splitter(s))
    return res
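
# Sketch: split_by_pun('天氣很好，出去走走吧！好啊')
# -> ['天氣很好', '出去走走吧！', '好啊']  (the comma is consumed, ！ is kept)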


def split_by_word(s):
    """Split s into fixed-size 3-character slices; the tail may be shorter."""
    slice_size = 3
    paragraph_len = len(s)
    slice_num = int(math.ceil(paragraph_len / slice_size))
    slice_list = []
    for n in range(slice_num):
        slice_list.append(s[n * slice_size:(n + 1) * slice_size])
    return slice_list
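
# Sketch: split_by_word('abcdefgh') -> ['abc', 'def', 'gh']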


# Sample ground-truth transcript (Traditional Chinese). Rough translation:
# "Worse still, the flood of attention brought not user praise but laid bare
# the immaturity of Windows 10 on ARM, along with a wave of bad reviews. Worst
# for the user experience: Windows 10 on ARM could only run old 32-bit x86
# apps through emulation, and could do nothing with 64-bit x86 (x64) apps,
# which excel in both performance and adoption."
raw_str = '更糟糕的是,與大量關注相伴的並非用戶讚賞,而是 Windows 10 on ARM 的不成熟暴露無遺,以及隨之而來的如潮差評──對用戶使用體驗影響最惡劣的,莫過於 Windows 10 on ARM 僅能透過模擬兼容老舊過時的 32 位元 x86 應用,而對效能與普及度俱佳的 64 位元 x86(即 x64)應用無能為力'
sub_dict = parse_script("out.txt", split_by_pun(raw_str))