|
@@ -1,199 +0,0 @@
|
|
|
-import os
|
|
|
-import argparse
|
|
|
-from openai import OpenAI
|
|
|
-from dotenv import load_dotenv
|
|
|
-import jieba
|
|
|
-from datetime import datetime
|
|
|
-from pypinyin import pinyin, Style
|
|
|
-from langchain_community.chat_models import ChatOllama
|
|
|
-from langchain.schema import HumanMessage, SystemMessage
|
|
|
-
|
|
|
# Load variables from the project-local env file before any client is built,
# so that clients constructed below can see them.
load_dotenv('environment.env')

# OpenAI client used by transcribe() for the Whisper request.
# NOTE(review): presumably reads its API key from the environment loaded
# above -- confirm.
client = OpenAI()

# Local Ollama chat model used to proofread transcripts; temperature=0 is
# passed through to the model wrapper.
local_llm = "llama3-groq-tool-use:latest"
llm = ChatOllama(temperature=0, model=local_llm)
|
|
|
-
|
|
|
# System message for the proofreading model, assembled line by line.
# The empty entry produces the blank line that separates the numbered rules
# from the closing instruction.
_SYSTEM_PROMPT_LINES = (
    "你是一位專業的轉錄校對助理,專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。",
    "你的任務是:",
    "1. 確保以下專業術語的準確性:溫室氣體、碳排放、碳管理、碳盤查、碳權交易、碳足跡、淨零排放、碳權。",
    "2. 在必要時添加適當的標點符號,如句號、逗號",
    "3. 使用台灣的繁體中文,確保語言表達符合台灣的用語習慣。",
    "4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。",
    "5. 不要回答問題、解釋概念或添加任何不在原文中的信息。",
    "6. 如果原文是一個問句,保持它的問句形式,不要提供答案。",
    "",
    "請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。",
)
system_prompt = "\n".join(_SYSTEM_PROMPT_LINES)
|
|
|
-
|
|
|
def transcribe(audio_file):
    """Send an open audio file to the Whisper transcription endpoint.

    Args:
        audio_file: Binary file object forwarded as the ``file`` argument.

    Returns:
        The value returned by the API for ``response_format="text"``, or
        ``None`` if the request raised (the error is printed, not re-raised).
    """
    request_args = {
        "file": audio_file,
        "model": "whisper-1",
        "response_format": "text",
    }
    try:
        return client.audio.transcriptions.create(**request_args)
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this file.
        print(f"轉錄時發生錯誤:{str(e)}")
        return None
|
|
|
-
|
|
|
def save_output(file_name, raw_transcript, corrected_transcript):
    """Append one transcription record to ``output/transcription_results.txt``.

    The ``output`` directory is created if it does not exist. Each record
    holds a separator line, the file name, a timestamp, the raw transcript
    and the corrected transcript.

    Args:
        file_name: Name of the processed audio file, written into the record.
        raw_transcript: Transcript text before correction.
        corrected_transcript: Transcript text after correction.
    """
    target_dir = "output"
    os.makedirs(target_dir, exist_ok=True)
    results_path = os.path.join(target_dir, "transcription_results.txt")

    # Append mode: records from earlier runs are kept.
    with open(results_path, "a", encoding="utf-8") as out:
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        out.writelines([
            f"\n{'='*50}\n",
            f"文件名: {file_name}\n",
            f"處理時間: {stamp}\n\n",
            "原始轉錄:\n",
            f"{raw_transcript}\n\n",
            "修正後的轉錄:\n",
            f"{corrected_transcript}\n",
        ])
|
|
|
-
|
|
|
def process_audio_file(file_path):
    """Transcribe one audio file, correct the text, print and save both.

    A warning is printed when the file is larger than 25 MB, but processing
    is still attempted. If transcription returns ``None`` nothing is saved.
    Any exception raised along the way is printed and swallowed.

    Args:
        file_path: Path of the audio file to process.
    """
    try:
        with open(file_path, "rb") as audio_file:
            size_mb = os.path.getsize(file_path) / (1024 * 1024)  # bytes -> MB
            if size_mb > 25:
                print(f"警告:文件 {os.path.basename(file_path)} 大小為 {size_mb:.2f} MB,超過了 25 MB 的限制。可能無法處理。")

            print(f"\n處理文件:{os.path.basename(file_path)}")
            raw_text = transcribe(audio_file)
            if raw_text is not None:
                print("\n原始轉錄:")
                print(raw_text)

                fixed_text = post_process_transcript(raw_text)
                print("\n修正後的轉錄:")
                print(fixed_text)

                save_output(os.path.basename(file_path), raw_text, fixed_text)

    except Exception as e:
        # Keep batch runs going: report the failure for this file only.
        print(f"處理文件 {os.path.basename(file_path)} 時發生錯誤:{str(e)}")
|
|
|
-
|
|
|
def process_folder(folder_path):
    """Process every ``.mp3``/``.wav``/``.m4a`` file directly inside a folder.

    The extension check is case-insensitive. After all files have been
    handed to ``process_audio_file`` a summary with the file count is printed.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
    """
    audio_suffixes = (".mp3", ".wav", ".m4a")
    audio_names = [
        entry for entry in os.listdir(folder_path)
        if entry.lower().endswith(audio_suffixes)
    ]

    for entry in audio_names:
        process_audio_file(os.path.join(folder_path, entry))

    print("\n=== 總結 ===")
    print(f"處理的文件數:{len(audio_names)}")
|
|
|
-
|
|
|
# Phonetic classes for pinyin fragments. Initials and finals that sound alike
# share a symbol. 'uang' is listed next to 'uan' so that longest-match lookup
# does not split it into 'uan' + a stray 'g'.
_SOUNDEX_MAP = {
    'b': '1', 'p': '1', 'm': '1', 'f': '1',
    'd': '2', 't': '2', 'n': '2', 'l': '2',
    'g': '3', 'k': '3', 'h': '3',
    'j': '4', 'q': '4', 'x': '4',
    'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
    'z': '6', 'c': '6', 's': '6',
    'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
    'ong': '9', 'un': '9', 'uan': '9', 'uang': '9',
    'i': 'A', 'u': 'A', 'v': 'A',  # 'v' is used for 'ü' in some systems
    'e': 'B', 'o': 'B',
}

# Key lengths present in the table, longest first, for greedy matching.
_SOUNDEX_KEY_WIDTHS = tuple(sorted({len(key) for key in _SOUNDEX_MAP}, reverse=True))


def chinese_soundex(pinyin_str):
    """Encode a pinyin string as a four-character phonetic code.

    The result is always laid out as three code symbols followed by one tone
    character. The code part is the symbol of the first matched fragment
    followed by the sorted, de-duplicated symbols of the remaining fragments,
    truncated to three and right-padded with ``'0'``. The tone is the last
    digit found in the input, or ``'0'`` if there is none. Characters that
    are neither digits nor part of a known fragment are skipped.

    Fragments are matched longest-first, so multi-letter finals such as
    ``ang`` or ``uan`` map to their own table entry instead of being split
    into a shorter fragment plus leftover consonants. Because the tone always
    sits at index 3, ``code[:3]`` is a tone-free prefix.

    Args:
        pinyin_str: Pinyin text, optionally containing tone digits.

    Returns:
        A string of length 4.
    """
    symbols = []
    tone = '0'
    pos = 0
    end = len(pinyin_str)

    while pos < end:
        ch = pinyin_str[pos]
        if ch.isdigit():
            # No table key contains a digit, so digits only ever carry tone.
            tone = ch
            pos += 1
            continue

        for width in _SOUNDEX_KEY_WIDTHS:
            fragment = pinyin_str[pos:pos + width]
            if fragment in _SOUNDEX_MAP:
                symbols.append(_SOUNDEX_MAP[fragment])
                pos += len(fragment)
                break
        else:
            pos += 1  # unknown character: ignore it

    code = ''.join(symbols)
    code = code[:1] + ''.join(sorted(set(code[1:])))
    # Pad the code before appending the tone so the tone position is fixed.
    return code[:3].ljust(3, '0') + tone
|
|
|
-
|
|
|
def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Report whether two words have the same phonetic (soundex) code.

    Each word is converted to pinyin (first reading of every character) and
    encoded with ``chinese_soundex``. In tone-sensitive mode the pinyin
    carries tone digits (``Style.TONE3`` with the neutral tone written as 5),
    so the tone is part of the comparison. In tone-insensitive mode toneless
    pinyin (``Style.NORMAL``) is used, so both codes carry the same default
    tone and only the phonetic part can differ. Slicing the code cannot do
    this reliably, because a short code may still include the tone digit
    within its first three characters.

    Args:
        word1: First word.
        word2: Second word.
        tone_sensitive: When true, the tone must match as well.

    Returns:
        ``True`` if both words encode to the same code, else ``False``.
    """
    if tone_sensitive:
        pinyin_options = {"style": Style.TONE3, "neutral_tone_with_five": True}
    else:
        pinyin_options = {"style": Style.NORMAL}

    def _encode(word):
        syllables = [readings[0] for readings in pinyin(word, **pinyin_options)]
        # The apostrophe is not a soundex fragment, so it is skipped by the
        # encoder; it only keeps fragments from matching across syllables.
        return chinese_soundex("'".join(syllables))

    return _encode(word1) == _encode(word2)
|
|
|
-
|
|
|
# Exact-match replacements applied to segmented words before any phonetic
# matching is attempted (see fuzzy_correct_chinese).
error_correction = {
    "看拳": "碳權",      # -> carbon credit
    "看盤插": "碳盤查",  # -> carbon inventory
    "盤插": "盤查",      # -> inventory
    "看": "碳",          # -> carbon
}
|
|
|
-
|
|
|
def fuzzy_correct_chinese(text, correct_terms):
    """Replace mis-transcribed words in ``text`` with known correct terms.

    The text is segmented with ``jieba.cut`` and each word is handled in
    this order:

    1. If it is a key of ``error_correction`` it is replaced by the mapped value.
    2. If it already is one of ``correct_terms`` it is kept unchanged, so a
       correct word is never swapped for a different term that merely sounds
       the same.
    3. Otherwise it is replaced by the first term in ``correct_terms`` that
       ``compare_chinese_words`` reports as a tone-sensitive match; each such
       replacement is printed.
    4. If nothing matches, the word is kept.

    Args:
        text: Text to correct.
        correct_terms: Sequence of correct terms, in priority order.

    Returns:
        The corrected words joined back into one string.
    """
    known_terms = set(correct_terms)
    corrected_words = []

    for word in jieba.cut(text):
        if word in error_correction:
            corrected_words.append(error_correction[word])
            continue

        if word in known_terms:
            corrected_words.append(word)
            continue

        replacement = next(
            (
                term for term in correct_terms
                if compare_chinese_words(word, term, tone_sensitive=True)
            ),
            None,
        )
        if replacement is None:
            corrected_words.append(word)
        else:
            print(f"corrected: {word} -> {replacement}")
            corrected_words.append(replacement)

    return ''.join(corrected_words)
|
|
|
-
|
|
|
def post_process_transcript(transcript):
    """Correct domain terms in a transcript, then have the LLM proofread it.

    The transcript first goes through ``fuzzy_correct_chinese`` with a fixed
    list of carbon-management terms and common question phrases. The result
    is sent to the chat model together with ``system_prompt``.

    Args:
        transcript: Raw transcript text.

    Returns:
        The ``content`` of the chat model's response.
    """
    # Order matters: the first phonetically matching term wins.
    correct_terms = [
        "碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費",
        "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "淨零",
        "氣候變遷法", "氣候", "氣候變遷", "法",
        "是什麼", "請解釋", "為什麼", "什麼意思",
        "台灣",
    ]

    corrected_transcript = fuzzy_correct_chinese(transcript, correct_terms)

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=f"請校對並修正以下轉錄文本,但不要改變其原意或回答問題:\n\n{corrected_transcript}"),
    ]

    # invoke() is the supported entry point; calling the model object
    # directly is deprecated in LangChain.
    response = llm.invoke(messages)

    return response.content
|
|
|
-
|
|
|
def main():
    """Parse command-line arguments and process one file or a folder.

    ``--file`` takes precedence when given. Otherwise ``--folder`` (default
    ``data``) is processed. A missing file or folder is reported with a
    printed error message.
    """
    parser = argparse.ArgumentParser(description="處理音頻文件使用 llama3-groq-tool-use:latest 模型")
    parser.add_argument("--file", help="要處理的單個音頻文件的路徑")
    parser.add_argument("--folder", default="data", help="包含音頻文件的文件夾路徑(默認:data)")
    args = parser.parse_args()

    single_file = args.file
    folder = args.folder

    if single_file:
        if not os.path.isfile(single_file):
            print(f"錯誤:文件 '{single_file}' 不存在。")
            return
        process_audio_file(single_file)
        return

    if not folder:
        # Only reachable when --folder is given an empty value.
        print("錯誤:請指定一個文件(--file)或文件夾(--folder)來處理。")
        return

    if not os.path.isdir(folder):
        print(f"錯誤:文件夾 '{folder}' 不存在。")
        return

    process_folder(folder)


if __name__ == "__main__":
    main()
|