| 
					
				 | 
			
			
				@@ -1,199 +0,0 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-import os 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-import argparse 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from openai import OpenAI 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from dotenv import load_dotenv 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-import jieba 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from datetime import datetime 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from pypinyin import pinyin, Style 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from langchain_community.chat_models import ChatOllama 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from langchain.schema import HumanMessage, SystemMessage 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-load_dotenv('environment.env') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-client = OpenAI() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-local_llm = "llama3-groq-tool-use:latest" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-llm = ChatOllama(model=local_llm, temperature=0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-system_prompt = """你是一位專業的轉錄校對助理,專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-你的任務是: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-1. 確保以下專業術語的準確性:溫室氣體、碳排放、碳管理、碳盤查、碳權交易、碳足跡、淨零排放、碳權。 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-2. 在必要時添加適當的標點符號,如句號、逗號 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-3. 使用台灣的繁體中文,確保語言表達符合台灣的用語習慣。 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-5. 不要回答問題、解釋概念或添加任何不在原文中的信息。 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-6. 如果原文是一個問句,保持它的問句形式,不要提供答案。 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def transcribe(audio_file):
				    """Send an audio file to the ``whisper-1`` transcription endpoint.

				    Args:
				        audio_file: Object forwarded unchanged as the ``file`` argument of
				            ``client.audio.transcriptions.create``.

				    Returns:
				        The value returned by the API call (requested with
				        ``response_format="text"``), or ``None`` if the call raised.
				    """
				    request = {
				        "file": audio_file,
				        "model": "whisper-1",
				        "response_format": "text",
				    }
				    try:
				        return client.audio.transcriptions.create(**request)
				    except Exception as err:
				        # Best effort: report the failure and let the caller test for None.
				        print(f"轉錄時發生錯誤:{str(err)}")
				        return None
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def save_output(file_name, raw_transcript, corrected_transcript, output_dir="output"):
				    """Append one transcription record to ``transcription_results.txt``.

				    Args:
				        file_name: Name written in the record header.
				        raw_transcript: Transcript text before correction.
				        corrected_transcript: Transcript text after correction.
				        output_dir: Directory that holds the results file; created if it
				            does not exist. Defaults to ``"output"``, the previously
				            hard-coded location.
				    """
				    os.makedirs(output_dir, exist_ok=True)
				    output_file = os.path.join(output_dir, "transcription_results.txt")

				    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
				    record = (
				        f"\n{'=' * 50}\n"
				        f"文件名: {file_name}\n"
				        f"處理時間: {timestamp}\n\n"
				        "原始轉錄:\n"
				        f"{raw_transcript}\n\n"
				        "修正後的轉錄:\n"
				        f"{corrected_transcript}\n"
				    )

				    # Append mode keeps earlier records; the record is built first so it
				    # reaches the file in a single write call.
				    with open(output_file, "a", encoding="utf-8") as f:
				        f.write(record)
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def process_audio_file(file_path):
				    """Transcribe, correct, print and save the results for one audio file.

				    The file is opened only for the transcription call, so the handle is
				    released before the post-processing step and the results write. Any
				    exception is reported on stdout and swallowed so that a batch run can
				    continue with the next file.

				    Args:
				        file_path: Path of the audio file to process.
				    """
				    display_name = os.path.basename(file_path)
				    size_limit_mb = 25  # limit quoted in the warning message below

				    try:
				        file_size = os.path.getsize(file_path) / (1024 * 1024)  # bytes -> MB
				        if file_size > size_limit_mb:
				            # Warn only; the upload is still attempted.
				            print(f"警告:文件 {display_name} 大小為 {file_size:.2f} MB,超過了 25 MB 的限制。可能無法處理。")

				        print(f"\n處理文件:{display_name}")
				        with open(file_path, "rb") as audio_file:
				            raw_transcript = transcribe(audio_file)
				        if raw_transcript is None:
				            # transcribe() has already reported the failure.
				            return

				        print("\n原始轉錄:")
				        print(raw_transcript)

				        corrected_transcript = post_process_transcript(raw_transcript)
				        print("\n修正後的轉錄:")
				        print(corrected_transcript)

				        save_output(display_name, raw_transcript, corrected_transcript)

				    except Exception as e:
				        print(f"處理文件 {display_name} 時發生錯誤:{str(e)}")
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def process_folder(folder_path):
				    """Process every ``.mp3``/``.wav``/``.m4a`` file directly inside a folder.

				    Entries are visited in sorted name order so runs are reproducible, and
				    only regular files are handed on, so a directory whose name ends in an
				    audio extension is skipped. A summary with the number of files handed to
				    ``process_audio_file`` is printed at the end.

				    Args:
				        folder_path: Directory to scan (not recursive).
				    """
				    audio_suffixes = (".mp3", ".wav", ".m4a")
				    processed_files = 0

				    for filename in sorted(os.listdir(folder_path)):
				        if not filename.lower().endswith(audio_suffixes):
				            continue
				        file_path = os.path.join(folder_path, filename)
				        if not os.path.isfile(file_path):
				            continue
				        process_audio_file(file_path)
				        # Counts attempts: process_audio_file does not report success.
				        processed_files += 1

				    print("\n=== 總結 ===")
				    print(f"處理的文件數:{processed_files}")
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def chinese_soundex(pinyin_str):
				    """Reduce a numbered-tone pinyin string to a 4-character phonetic code.

				    The string is scanned left to right. At each position the longest key
				    of the sound table that matches (three, then two, then one character)
				    is consumed and its class symbol appended. Trying three characters
				    first is what lets the finals ``ang``/``eng``/``ing``/``ong``/``uan``
				    match; a two-character window would split them into ``an`` + ``g`` and
				    so on. Digits are read as tone marks (the last one wins) and any other
				    character is skipped.

				    The first symbol is kept in place, the remaining symbols are
				    de-duplicated and sorted, and the result is truncated to three symbols,
				    followed by the tone digit, then right-padded with ``'0'`` to length 4.

				    Args:
				        pinyin_str: Pinyin text with tone digits, e.g. ``"tang4"``.

				    Returns:
				        A string of at least four characters; ``"0000"`` for empty input.
				    """
				    soundex_map = {
				        'b': '1', 'p': '1', 'm': '1', 'f': '1',
				        'd': '2', 't': '2', 'n': '2', 'l': '2',
				        'g': '3', 'k': '3', 'h': '3',
				        'j': '4', 'q': '4', 'x': '4',
				        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
				        'z': '6', 'c': '6', 's': '6',
				        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
				        'ong': '9', 'un': '9', 'uan': '9',
				        'i': 'A', 'u': 'A', 'v': 'A',  # 'v' is used for 'ü' in some systems
				        'e': 'B', 'o': 'B',
				    }
				    longest_key = max(len(key) for key in soundex_map)

				    symbols = []
				    tone = '0'
				    i = 0
				    while i < len(pinyin_str):
				        ch = pinyin_str[i]
				        if ch.isdigit():
				            tone = ch
				            i += 1
				            continue
				        # Longest match first so multi-letter initials and finals win
				        # over their single-letter prefixes.
				        for width in range(longest_key, 0, -1):
				            chunk = pinyin_str[i:i + width]
				            if chunk in soundex_map:
				                symbols.append(soundex_map[chunk])
				                i += len(chunk)
				                break
				        else:
				            # Character with no sound class (e.g. 'a', punctuation).
				            i += 1

				    code = ''.join(symbols)
				    code = code[:1] + ''.join(sorted(set(code[1:])))
				    return (code[:3] + tone).ljust(4, '0')
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def compare_chinese_words(word1, word2, tone_sensitive=True):
				    """Report whether two words have the same ``chinese_soundex`` code.

				    Each word is converted to numbered-tone pinyin (``Style.TONE3``, neutral
				    tone written as 5), the syllables are concatenated, and the resulting
				    strings are encoded with ``chinese_soundex``.

				    Args:
				        word1: First word.
				        word2: Second word.
				        tone_sensitive: When True the tone digit is part of the comparison.
				            When False, tone digits are removed before encoding, so the
				            tone cannot influence the result even for short codes where it
				            would otherwise land inside the first three characters.

				    Returns:
				        True if the two codes are equal, otherwise False.
				    """
				    def to_pinyin(word):
				        syllables = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
				        return ''.join(candidates[0] for candidates in syllables)

				    pinyin1 = to_pinyin(word1)
				    pinyin2 = to_pinyin(word2)

				    if not tone_sensitive:
				        # chinese_soundex treats every digit as a tone mark; dropping them
				        # up front leaves the tone slot at its default for both words.
				        pinyin1 = ''.join(ch for ch in pinyin1 if not ch.isdigit())
				        pinyin2 = ''.join(ch for ch in pinyin2 if not ch.isdigit())

				    return chinese_soundex(pinyin1) == chinese_soundex(pinyin2)
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				# Exact-match replacements applied to segmented tokens by
				# fuzzy_correct_chinese before any phonetic comparison.
				# NOTE(review): the single-character entry rewrites every token that is
				# exactly "看" -- confirm this is intended for general text.
				error_correction = dict([
				    ("看拳", "碳權"),
				    ("看盤插", "碳盤查"),
				    ("盤插", "盤查"),
				    ("看", "碳"),
				])
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def fuzzy_correct_chinese(text, correct_terms):
				    """Replace mis-transcribed tokens in ``text`` with known terms.

				    The text is segmented with ``jieba.cut`` and each token is handled in
				    this order:

				    1. A token listed in ``error_correction`` is replaced by its mapped value.
				    2. A token that already equals one of ``correct_terms`` is kept as is.
				       Without this step such a token would be compared phonetically, could
				       be swapped for an earlier-listed term with the same code, and would
				       be logged as a correction.
				    3. Otherwise the first term for which ``compare_chinese_words`` returns
				       True (tone-sensitive) replaces the token and the change is printed.
				    4. A token with no match is kept unchanged.

				    Args:
				        text: Text to correct.
				        correct_terms: Sequence of accepted terms, in priority order.

				    Returns:
				        The corrected tokens joined back into one string.
				    """
				    known_terms = set(correct_terms)
				    corrected_words = []

				    for word in jieba.cut(text):
				        if word in error_correction:
				            corrected_words.append(error_correction[word])
				            continue
				        if word in known_terms:
				            corrected_words.append(word)
				            continue

				        replacement = next(
				            (term for term in correct_terms
				             if compare_chinese_words(word, term, tone_sensitive=True)),
				            None,
				        )
				        if replacement is None:
				            corrected_words.append(word)
				        else:
				            print(f"corrected: {word} -> {replacement}")
				            corrected_words.append(replacement)

				    return ''.join(corrected_words)
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def post_process_transcript(transcript):
				    """Correct a raw transcript with term matching, then an LLM proofreading pass.

				    The transcript first goes through ``fuzzy_correct_chinese`` with a fixed
				    list of domain terms. The result is then sent to the module-level ``llm``
				    together with ``system_prompt``, and the content of the reply is returned.

				    A transcript that is empty or only whitespace is returned unchanged, so
				    the model is never asked to proofread nothing.

				    Args:
				        transcript: Raw transcript text.

				    Returns:
				        The corrected transcript text.
				    """
				    if not transcript or not transcript.strip():
				        return transcript

				    # Order matters: fuzzy_correct_chinese takes the first matching term.
				    correct_terms = ["碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費",
				                     "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "淨零",
				                     "氣候變遷法", "氣候", "氣候變遷", "法",
				                     "是什麼", "請解釋", "為什麼", "什麼意思",
				                     "台灣"]

				    corrected_transcript = fuzzy_correct_chinese(transcript, correct_terms)

				    # Build the chat input.
				    messages = [
				        SystemMessage(content=system_prompt),
				        HumanMessage(content=f"請校對並修正以下轉錄文本,但不要改變其原意或回答問題:\n\n{corrected_transcript}")
				    ]

				    # invoke() is the supported call path; calling the model object
				    # directly is deprecated in recent LangChain releases.
				    response = llm.invoke(messages)

				    return response.content
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def main():
				    """Parse the command line and process one file or one folder.

				    ``--file`` takes precedence over ``--folder``; ``--folder`` defaults to
				    ``data``.

				    Returns:
				        0 when processing was started, 1 when the given path does not exist
				        or no usable target was given, so the process exit status reflects
				        the error.
				    """
				    parser = argparse.ArgumentParser(description="處理音頻文件使用 llama3-groq-tool-use:latest 模型")
				    parser.add_argument("--file", help="要處理的單個音頻文件的路徑")
				    parser.add_argument("--folder", default="data", help="包含音頻文件的文件夾路徑(默認:data)")
				    args = parser.parse_args()

				    if args.file:
				        if not os.path.isfile(args.file):
				            print(f"錯誤:文件 '{args.file}' 不存在。")
				            return 1
				        process_audio_file(args.file)
				        return 0

				    if args.folder:
				        if not os.path.isdir(args.folder):
				            print(f"錯誤:文件夾 '{args.folder}' 不存在。")
				            return 1
				        process_folder(args.folder)
				        return 0

				    # Only reached when --folder is explicitly given an empty value.
				    print("錯誤:請指定一個文件(--file)或文件夾(--folder)來處理。")
				    return 1


				if __name__ == "__main__":
				    raise SystemExit(main())
			 |