|
@@ -29,6 +29,8 @@ from pytranscriber.control.ctr_autosub import Ctr_Autosub
|
|
import multiprocessing
|
|
import multiprocessing
|
|
from itertools import groupby
|
|
from itertools import groupby
|
|
from operator import itemgetter
|
|
from operator import itemgetter
|
|
|
|
+from util.parser import parser
|
|
|
|
+
|
|
dir_sound = 'mp3_track/'
|
|
dir_sound = 'mp3_track/'
|
|
dir_photo = 'photo/'
|
|
dir_photo = 'photo/'
|
|
dir_text = 'text_file/'
|
|
dir_text = 'text_file/'
|
|
@@ -186,9 +188,12 @@ def file_prepare_long(name, name_hash,text_content,image_urls,multiLang,lang='zh
|
|
img_num+=1
|
|
img_num+=1
|
|
|
|
|
|
#make mp3
|
|
#make mp3
|
|
|
|
+ text_parser = parser()
|
|
txt_idx = 0
|
|
txt_idx = 0
|
|
for txt in text_content:
|
|
for txt in text_content:
|
|
- txt = txt.replace
|
|
|
|
|
|
+ rep_list = text_parser.replace_list(k)
|
|
|
|
+ for reptxt in rep_list:
|
|
|
|
+ txt = txt.replace(reptxt,'')
|
|
if lang!='zh' or multiLang==1:
|
|
if lang!='zh' or multiLang==1:
|
|
if lang!='zh':
|
|
if lang!='zh':
|
|
tts = gTTS(txt)
|
|
tts = gTTS(txt)
|
|
@@ -373,11 +378,18 @@ def parse_script(file_path,gt_list):
|
|
with open(file_path, 'r',encoding="utf-8") as f:
|
|
with open(file_path, 'r',encoding="utf-8") as f:
|
|
raw_lines = [line.strip() for line in f]
|
|
raw_lines = [line.strip() for line in f]
|
|
lines = adjustSub_by_text_similarity(gt_list,raw_lines)
|
|
lines = adjustSub_by_text_similarity(gt_list,raw_lines)
|
|
|
|
+ text_parser = parser()
|
|
#make dict
|
|
#make dict
|
|
dict_list = []
|
|
dict_list = []
|
|
for idx in range(len(lines)):
|
|
for idx in range(len(lines)):
|
|
script={}
|
|
script={}
|
|
- script['content'] = lines[idx]
|
|
|
|
|
|
+ rep_ls = text_parser.replace_list(lines[idx])
|
|
|
|
+ line_content = lines[idx]
|
|
|
|
+ for reptxt in rep_ls:
|
|
|
|
+ line_content = line_content.replace(reptxt,'')
|
|
|
|
+ if len(rep_ls)!=0:
|
|
|
|
+ script['image_idx'] = int(rep_ls[0].replace('{','').replace('}',''))
|
|
|
|
+ script['content'] = line_content
|
|
time_raw = raw_lines[idx * 4 +1 ].split(' --> ')
|
|
time_raw = raw_lines[idx * 4 +1 ].split(' --> ')
|
|
start = time_raw[0].split(':')
|
|
start = time_raw[0].split(':')
|
|
stop = time_raw[1].split(':')
|
|
stop = time_raw[1].split(':')
|
|
@@ -410,10 +422,11 @@ def parse_script(file_path,gt_list):
|
|
dic_idx = 0
|
|
dic_idx = 0
|
|
accumulated_duration = 0
|
|
accumulated_duration = 0
|
|
duration = dic['stop']-dic['start']
|
|
duration = dic['stop']-dic['start']
|
|
- print(duration)
|
|
|
|
for sub_dic in split_sentence(dic['content'],13):
|
|
for sub_dic in split_sentence(dic['content'],13):
|
|
new_dic = {}
|
|
new_dic = {}
|
|
new_dic['index'] = new_idx
|
|
new_dic['index'] = new_idx
|
|
|
|
+ if 'image_idx' in dic:
|
|
|
|
+ new_dic['image_obj'] = {'start':dic['start'],'idx':dic['image_idx']}
|
|
new_idx+=1
|
|
new_idx+=1
|
|
ind_duration = duration * sub_dic['time_ratio']
|
|
ind_duration = duration * sub_dic['time_ratio']
|
|
new_dic['start'] = dic['start'] + accumulated_duration
|
|
new_dic['start'] = dic['start'] + accumulated_duration
|
|
@@ -427,7 +440,15 @@ def parse_script(file_path,gt_list):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-def adjustSub_by_text_similarity(gts,gens_raw):
|
|
|
|
|
|
+def adjustSub_by_text_similarity(gts_in,gens_raw):
|
|
|
|
+ #call by value only
|
|
|
|
+ gts = gts_in[:]
|
|
|
|
+ text_parser = parser()
|
|
|
|
+ for i in range(len(gts)):
|
|
|
|
+ rep_ls = text_parser.replace_list(gts[i])
|
|
|
|
+ for reptxt in rep_ls:
|
|
|
|
+ gts[i] = gts[i].replace(reptxt)
|
|
|
|
+ print(gts)
|
|
gens = []
|
|
gens = []
|
|
for idx in range(int((len(gens_raw)+1)/4)):
|
|
for idx in range(int((len(gens_raw)+1)/4)):
|
|
gens.append(gens_raw[idx*4+2])
|
|
gens.append(gens_raw[idx*4+2])
|
|
@@ -448,6 +469,12 @@ def adjustSub_by_text_similarity(gts,gens_raw):
|
|
else:
|
|
else:
|
|
adjusted[idx] = match_text[0]
|
|
adjusted[idx] = match_text[0]
|
|
duplicated_list.append(match_text[0])
|
|
duplicated_list.append(match_text[0])
|
|
|
|
+ combine2_tag = [''.join([i,j]) for i,j in zip(gts_in, gts_in[1:])]
|
|
|
|
+ combine3_tag = [''.join([i,j,k]) for i,j,k in zip(gts_in, gts_in[1:], gts_in[2:])]
|
|
|
|
+ alls_tag = gts_in + combine2_tag + combine3_tag
|
|
|
|
+ for idx in range(len(adjusted)):
|
|
|
|
+ match_text = difflib.get_close_matches(adjusted[idx], alls_tag, cutoff=0.1)
|
|
|
|
+ adjusted[idx] = match_text
|
|
return adjusted
|
|
return adjusted
|
|
|
|
|
|
def trim_punctuation(s):
|
|
def trim_punctuation(s):
|