# Helpers for correcting generated subtitle lines against a reference script.
import difflib
import os
import tempfile
from difflib import SequenceMatcher

from gtts import gTTS
from mutagen.mp3 import MP3
 
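# max_len = 3  (commented out; adjustSub_by_audio_similarity reads max_len as the
# largest number of consecutive ground-truth lines merged into one candidate)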
 
def similar(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b``.

    The value is ``SequenceMatcher.ratio()``: 1.0 for identical sequences,
    0.0 when they have nothing in common.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
 
def _synthesized_duration(text, lang):
    """Synthesize ``text`` with gTTS and return the MP3 length reported by mutagen."""
    # A unique temp file avoids collisions between concurrent runs and keeps
    # the current working directory clean.
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        gTTS(text=text, lang=lang).save(path)
        return MP3(path).info.length
    finally:
        # Always clean up, even when synthesis or MP3 parsing fails.
        os.remove(path)


def audio_compare_from_text(gt, gen, lang='zh'):
    """Return the spoken lengths of two texts as synthesized by gTTS.

    Each text is rendered to a temporary MP3 file, measured, and the file is
    removed again.

    Args:
        gt: Ground-truth text.
        gen: Generated text.
        lang: gTTS language code used for both texts (default ``'zh'``).

    Returns:
        A ``(gt_len, gen_len)`` tuple of ``MP3(...).info.length`` values.

    Raises:
        Whatever gTTS or mutagen raise on synthesis or parsing failure; the
        temporary file is still removed in that case.
    """
    # Keep the original synthesis order: generated text first, then ground truth.
    gen_len = _synthesized_duration(gen, lang)
    gt_len = _synthesized_duration(gt, lang)
    return gt_len, gen_len
 
def _first_audio_match(gt_array, gen_text, max_len):
    """Return the first ground-truth window that matches ``gen_text``, or None."""
    for gt_idx in range(len(gt_array)):
        gt_text = ''
        # Grow the window one line at a time; slicing clamps it at the end of
        # the list, so the last lines are candidates too.
        for piece in gt_array[gt_idx:gt_idx + max_len]:
            gt_text += piece
            # Cheap text check first: skip the TTS round trip when the texts
            # are not similar enough to match anyway.
            if similar(gt_text, gen_text) <= 0.5:
                continue
            gt_len, gen_len = audio_compare_from_text(gt_text, gen_text)
            if abs(gen_len - gt_len) < 0.3:
                return gt_text
    return None


def adjustSub_by_audio_similarity(gt_array, generated_array, max_len=3):
    """Replace generated lines with matching ground-truth text.

    For every generated line, windows of 1..``max_len`` consecutive
    ground-truth lines are concatenated and compared with it. A window matches
    when the text similarity is above 0.5 and the synthesized audio lengths
    differ by less than 0.3. The first matching window replaces the generated
    line and the search for that line stops. A ``break`` in nested loops only
    leaves the innermost loop, which is why the search lives in a helper that
    returns.

    Args:
        gt_array: List of ground-truth text lines.
        generated_array: List of generated lines; modified in place.
        max_len: Maximum number of consecutive ground-truth lines merged into
            one candidate (default 3).

    Returns:
        ``generated_array``, the same list object, after replacement.
    """
    for gen_idx, gen_text in enumerate(generated_array):
        match = _first_audio_match(gt_array, gen_text, max_len)
        if match is not None:
            generated_array[gen_idx] = match
    return generated_array
 
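# Disabled usage example, kept as a bare string literal so it never executes.
# NOTE(review): `adjustSub_by_gt_array` is not defined in the visible part of this
# file -- presumably an earlier name of one of the adjustSub_* functions; confirm
# before re-enabling.
'''
gts = 'Hello Kitty 於2018年,加入YouTube開始活動,在自我介紹的影片裡,Kitty表示一直憧憬著,想在YouTube跟大家見面,一開頻道就吸引許多粉絲訂閱,目前有28萬訂閱者,接下來這位花生君,於2017年加入YouTube開始活動,他的外型太過特別,花生頭、紅色圍巾與紙尿布,被觀眾評價為,第一眼看上很噁心,但看著看著還挺可愛,目前有12萬訂閱者'.split(',')
gens = ['Hello Kitty瑜2018年加入YouTube開始活動','再次我介紹的影片裡','Kitty表示一直憧憬著長在YouTube跟大家見面','一開頻道就吸引許多粉絲訂閱付錢有28萬訂閱者','接下來這位花生君瑜2017年加入YouTube開始湖','活動','他的外型太過特別花生桃紅色圍巾魚紙尿布','被觀眾評價為第一眼看上很噁心但看著看著還挺','秦可愛','目前有12萬訂閱者']
adjs = adjustSub_by_gt_array(gts,gens)
for s in adjs:
    print(s)
'''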
 
def _srt_timestamp_to_seconds(stamp):
    """Convert an ``HH:MM:SS,mmm`` timestamp string into seconds (float)."""
    parts = stamp.split(':')
    # The seconds field uses a decimal comma; float() needs a dot.
    return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2].replace(',', '.'))


def parse_script(file_path, encoding=None):
    """Parse a subtitle script laid out in fixed groups of four lines.

    Each group is read as: index line, time line (``start --> stop``), one
    content line, and a separator line. The separator may be missing after
    the last group. Lines are stripped of surrounding whitespace. Content
    that spans more than one line is not supported by this fixed layout.

    Args:
        file_path: Path of the script file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default; pass e.g. ``'utf-8'`` for files
            whose encoding differs from the locale.

    Returns:
        A list of dicts with keys ``index`` (str), ``content`` (str),
        ``start``, ``stop`` and ``duration`` (floats, in seconds).

    Raises:
        IndexError: If a time line lacks ``' --> '`` or a timestamp has fewer
            than three ``:``-separated fields.
        ValueError: If a timestamp field is not numeric.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        lines = [line.strip() for line in f]

    dict_list = []
    # +1 tolerates a missing blank separator after the final group.
    for idx in range((len(lines) + 1) // 4):
        base = idx * 4
        index_text = lines[base]
        time_raw = lines[base + 1]
        content = lines[base + 2]

        bounds = time_raw.split(' --> ')
        start_sec = _srt_timestamp_to_seconds(bounds[0])
        stop_sec = _srt_timestamp_to_seconds(bounds[1])

        dict_list.append({
            'index': index_text,
            'content': content,
            'start': start_sec,
            'stop': stop_sec,
            # abs() keeps the duration non-negative whatever the order.
            'duration': abs(start_sec - stop_sec),
        })
    return dict_list
 
# Script-level run: parse 'out.txt' and print every parsed entry on its own line.
gg = parse_script('out.txt')
for g in gg:
    print(g)
 
-             
 
-             
 
-     
 
# Sample reference lines (the original comma-separated script, already split).
gts = [
    'Hello Kitty 於2018年',
    '加入YouTube開始活動',
    '在自我介紹的影片裡',
    'Kitty表示一直憧憬著',
    '想在YouTube跟大家見面',
    '一開頻道就吸引許多粉絲訂閱',
    '目前有28萬訂閱者',
    '接下來這位花生君',
    '於2017年加入YouTube開始活動',
    '他的外型太過特別',
    '花生頭、紅色圍巾與紙尿布',
    '被觀眾評價為',
    '第一眼看上很噁心',
    '但看著看著還挺可愛',
    '目前有12萬訂閱者',
]

# Sample generated subtitle lines to be corrected against ``gts``.
gens = [
    'Hello Kitty瑜2018年加入YouTube開始活動',
    '再次我介紹的影片裡',
    'Kitty表示一直憧憬著長在YouTube跟大家見面',
    '一開頻道就吸引許多粉絲訂閱付錢有28萬訂閱者',
    '接下來這位花生君瑜2017年加入YouTube開始湖',
    '活動',
    '他的外型太過特別花生桃紅色圍巾魚紙尿布',
    '被觀眾評價為第一眼看上很噁心但看著看著還挺',
    '秦可愛',
    '目前有12萬訂閱者',
]
 
def adjustSub_by_text_similarity(gts, gens, cutoff=0.1):
    """Replace each generated line with its closest ground-truth candidate.

    Candidates are every ground-truth line plus every concatenation of two
    and of three consecutive ground-truth lines. Each generated line is
    matched with ``difflib.get_close_matches``. On a match the line is
    overwritten in ``gens`` and the correction is printed; otherwise a
    "no correction" message is printed and the line is left unchanged.

    Args:
        gts: Sequence of ground-truth text lines.
        gens: List of generated lines; modified in place.
        cutoff: Minimum similarity ratio accepted by ``get_close_matches``
            (default 0.1).

    Returns:
        ``gens``, the same list object, after correction.
    """
    reference = list(gts)
    pairs = [a + b for a, b in zip(reference, reference[1:])]
    triples = [a + b + c for a, b, c in zip(reference, reference[1:], reference[2:])]
    candidates = reference + pairs + triples

    for idx, generated in enumerate(gens):
        # Only the best match is used, so ask for a single result.
        best = difflib.get_close_matches(generated, candidates, n=1, cutoff=cutoff)
        if best:
            print('{ ' + generated + ' }校正後: ' + best[0])
            gens[idx] = best[0]
        else:
            print('無校正:' + generated)
    return gens
 
# Script-level run: correct the sample generated lines against the sample
# reference lines; each decision is printed and `gens` is rewritten in place.
adjustSub_by_text_similarity(gts, gens)
 
 
  |