|
@@ -2,13 +2,12 @@ import os
|
|
|
import argparse
|
|
|
from openai import OpenAI
|
|
|
from dotenv import load_dotenv
|
|
|
-from pypinyin import pinyin, Style
|
|
|
import jieba
|
|
|
from datetime import datetime
|
|
|
-import logging
|
|
|
+from pypinyin import pinyin, Style
|
|
|
|
|
|
load_dotenv('environment.env')
|
|
|
-client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
|
|
+client = OpenAI()
|
|
|
|
|
|
system_prompt = """你是一位專業的轉錄校對助理,專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。
|
|
|
你的任務是:
|
|
@@ -21,32 +20,21 @@ system_prompt = """你是一位專業的轉錄校對助理,專門處理有關
|
|
|
|
|
|
請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""
|
|
|
|
|
|
-def setup_logger():
|
|
|
- logging.basicConfig(level=logging.DEBUG,
|
|
|
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
- return logging.getLogger(__name__)
|
|
|
-
|
|
|
-logger = setup_logger()
|
|
|
-
|
|
|
def transcribe(audio_file):
    """Send an audio file to the Whisper transcription endpoint.

    Args:
        audio_file: Forwarded unchanged as the ``file`` argument of
            ``client.audio.transcriptions.create``; presumably a file
            object opened in binary mode by the caller.

    Returns:
        The value returned by the API call (requested with
        ``response_format="text"``), or ``None`` if any exception was
        raised. In the failure case the error is printed, not re-raised.
    """
    request = {
        "file": audio_file,
        "model": "whisper-1",
        "response_format": "text",
    }
    try:
        return client.audio.transcriptions.create(**request)
    except Exception as e:
        # Best-effort: report the problem and let the caller see None.
        print(f"轉錄時發生錯誤:{str(e)}")
        return None
|
|
|
-
|
|
|
+
|
|
|
def save_output(file_name, raw_transcript, corrected_transcript):
|
|
|
output_dir = "output"
|
|
|
- if not os.path.exists(output_dir):
|
|
|
- os.makedirs(output_dir)
|
|
|
+ os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
output_file = os.path.join(output_dir, "transcription_results.txt")
|
|
|
|
|
@@ -59,7 +47,6 @@ def save_output(file_name, raw_transcript, corrected_transcript):
|
|
|
f.write("修正後的轉錄:\n")
|
|
|
f.write(f"{corrected_transcript}\n")
|
|
|
|
|
|
-
|
|
|
def process_audio_file(file_path):
|
|
|
try:
|
|
|
with open(file_path, "rb") as audio_file:
|
|
@@ -79,19 +66,16 @@ def process_audio_file(file_path):
|
|
|
print("\n修正後的轉錄:")
|
|
|
print(corrected_transcript)
|
|
|
|
|
|
- # 保存輸出結果
|
|
|
save_output(os.path.basename(file_path), raw_transcript, corrected_transcript)
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"處理文件 {os.path.basename(file_path)} 時發生錯誤:{str(e)}")
|
|
|
- return transcript
|
|
|
-
|
|
|
|
|
|
def process_folder(folder_path):
|
|
|
processed_files = 0
|
|
|
|
|
|
for filename in os.listdir(folder_path):
|
|
|
- if filename.endswith((".mp3", ".wav", ".m4a")):
|
|
|
+ if filename.lower().endswith((".mp3", ".wav", ".m4a")):
|
|
|
file_path = os.path.join(folder_path, filename)
|
|
|
process_audio_file(file_path)
|
|
|
processed_files += 1
|
|
@@ -99,38 +83,50 @@ def process_folder(folder_path):
|
|
|
print("\n=== 總結 ===")
|
|
|
print(f"處理的文件數:{processed_files}")
|
|
|
|
|
|
def chinese_soundex(pinyin_str):
    """Reduce a tone-numbered pinyin string to a 4-character phonetic key.

    The string is scanned left to right. Every initial or final found in
    the lookup table contributes one class symbol, a digit is remembered
    as the tone (the last digit seen wins), and any other character is
    skipped. After the first symbol, the remaining symbols are
    de-duplicated and sorted, so their order and repetition do not affect
    the key.

    Args:
        pinyin_str: Pinyin with tone digits, e.g. ``"tan4quan2"``.

    Returns:
        A 4-character string: three class symbols (right-padded with
        ``'0'``) followed by the tone digit (``'0'`` if no digit was seen).
        The tone is therefore always the last character, and the first
        three characters never depend on it.
    """
    soundex_map = {
        'b': '1', 'p': '1', 'm': '1', 'f': '1',
        'd': '2', 't': '2', 'n': '2', 'l': '2',
        'g': '3', 'k': '3', 'h': '3',
        'j': '4', 'q': '4', 'x': '4',
        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
        'z': '6', 'c': '6', 's': '6',
        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
        'ong': '9', 'un': '9', 'uan': '9',
        'i': 'A', 'u': 'A', 'v': 'A',  # 'v' is used for 'ü' in some systems
        'e': 'B', 'o': 'B',
    }
    longest_key = max(map(len, soundex_map))

    code = ''
    tone = '0'
    i = 0
    while i < len(pinyin_str):
        # Try the longest key first. Otherwise the three-letter finals
        # ('ang', 'eng', 'ing', ...) can never match: 'ang' would be read
        # as 'an' + 'g' and would not merge with 'an' as the table intends.
        for width in range(longest_key, 0, -1):
            chunk = pinyin_str[i:i + width]
            if chunk in soundex_map:
                code += soundex_map[chunk]
                i += len(chunk)
                break
        else:
            if pinyin_str[i].isdigit():
                tone = pinyin_str[i]
            i += 1

    code = code[:1] + ''.join(sorted(set(code[1:])))
    # Pad the class symbols *before* appending the tone so the tone sits at
    # a fixed position; padding afterwards let it drift into the first
    # three characters whenever fewer than three symbols were produced.
    return code[:3].ljust(3, '0') + tone
|
|
|
|
|
|
def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Report whether two words have the same phonetic key.

    Each word is converted to tone-numbered pinyin (``Style.TONE3`` with
    the neutral tone written as 5), the syllables are concatenated, and
    the result is encoded with ``chinese_soundex``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        tone_sensitive: When true, the keys are compared as produced, so
            the tone recorded in the key must match too. When false, tone
            digits are stripped from the pinyin before encoding, so tone
            cannot influence the key at all.

    Returns:
        ``True`` if both words produce the same key, ``False`` otherwise.
    """
    def _sound_key(word):
        syllables = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        spelled = ''.join(readings[0] for readings in syllables)
        if not tone_sensitive:
            # Slicing the finished key does not reliably drop the tone
            # (its position depends on how many symbols were produced),
            # so remove the tone digits before the key is built.
            spelled = ''.join(ch for ch in spelled if not ch.isdigit())
        return chinese_soundex(spelled)

    return _sound_key(word1) == _sound_key(word2)
|
|
|
|
|
|
error_correction = {
|
|
|
"看拳": "碳權",
|
|
@@ -139,7 +135,6 @@ error_correction = {
|
|
|
"看": "碳"
|
|
|
}
|
|
|
|
|
|
-
|
|
|
def fuzzy_correct_chinese(text, correct_terms):
|
|
|
words = jieba.cut(text)
|
|
|
corrected_words = []
|
|
@@ -148,13 +143,13 @@ def fuzzy_correct_chinese(text, correct_terms):
|
|
|
corrected_words.append(error_correction[word])
|
|
|
else:
|
|
|
for term in correct_terms:
|
|
|
- if compare_chinese_words(word, term):
|
|
|
+ if compare_chinese_words(word, term, tone_sensitive=True):
|
|
|
+ print(f"corrected: {word} -> {term}")
|
|
|
corrected_words.append(term)
|
|
|
break
|
|
|
else:
|
|
|
corrected_words.append(word)
|
|
|
- return ' '.join(corrected_words)
|
|
|
-
|
|
|
+ return ''.join(corrected_words)
|
|
|
|
|
|
def post_process_transcript(transcript, temperature=0):
|
|
|
correct_terms = ["碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費",
|
|
@@ -171,48 +166,31 @@ def post_process_transcript(transcript, temperature=0):
|
|
|
]
|
|
|
|
|
|
response = client.chat.completions.create(
|
|
|
- model="gpt-4o",
|
|
|
+ model="gpt-4",
|
|
|
temperature=temperature,
|
|
|
messages=messages
|
|
|
)
|
|
|
|
|
|
- # return response.choices[0].message.content
|
|
|
- return transcript
|
|
|
+ return response.choices[0].message.content
|
|
|
|
|
|
-# 處理單個音頻,使用transcript端點發送音頻給API
|
|
|
-if __name__ == "__main__":
|
|
|
- import argparse
|
|
|
def main():
    """Command-line entry point: process one audio file or a whole folder.

    ``--file`` takes precedence when given; otherwise ``--folder``
    (default ``data``) is used. A path that does not exist is reported
    with a printed message rather than an exception.
    """
    cli = argparse.ArgumentParser(description="處理音頻文件使用 Whisper")
    cli.add_argument("--file", help="要處理的單個音頻文件的路徑")
    cli.add_argument("--folder", default="data", help="包含音頻文件的文件夾路徑(默認:data)")
    args = cli.parse_args()

    # Single-file mode.
    if args.file:
        if not os.path.isfile(args.file):
            print(f"錯誤:文件 '{args.file}' 不存在。")
            return
        process_audio_file(args.file)
        return

    # Folder mode.
    if args.folder:
        if not os.path.isdir(args.folder):
            print(f"錯誤:文件夾 '{args.folder}' 不存在。")
            return
        process_folder(args.folder)
        return

    # Neither option holds a usable value (e.g. --folder given as empty).
    print("錯誤:請指定一個文件(--file)或文件夾(--folder)來處理。")


if __name__ == "__main__":
    main()
|