"""Transcribe Taiwanese (Tâi-gí) audio with Whisper and post-correct it with GPT-4.

Pipeline: ``process_audio`` -> ``transcribe`` (Whisper, primed with a custom
vocabulary read from Supabase) -> ``post_process_transcript`` (local fuzzy
correction followed by a chat-completion proofreading pass).
"""

from openai import OpenAI
from supabase import Client, create_client

from api.openai_scripts_tai_gi.config import (
    ERROR_CORRECTION,
    OPENAI_API_KEY,
    SUPABASE_KEY,
    SUPABASE_URL,
    SYSTEM_PROMPT,
)
from api.openai_scripts_tai_gi.text_processing import fuzzy_correct_chinese

client = OpenAI(api_key=OPENAI_API_KEY)
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


def _fetch_custom_vocab(table_name="word_database"):
    """Return the ``term`` column of *table_name* as a list of strings.

    This lookup is best-effort: any failure (network error, missing table,
    empty result) is reported on stdout and an empty list is returned, so the
    caller can still transcribe without the extra vocabulary.
    """
    try:
        response = supabase.table(table_name).select("term").execute()
    except Exception as e:
        print(f"No data found or an error occurred: {e}")
        print("Using default dictionary as Supabase data couldn't be fetched.")
        return []

    if not response.data:
        # Not every supabase-py response object exposes ``.error``; read it
        # defensively so that reporting the problem cannot itself raise.
        error = getattr(response, "error", None)
        print(f"No data found or an error occurred: {error}")
        print("Using default dictionary as Supabase data couldn't be fetched.")
        return []

    # Keep plain strings (not one-element sets) and skip rows with no term.
    return [str(item["term"]) for item in response.data if item.get("term")]


def transcribe(audio_file):
    """Transcribe *audio_file* with Whisper, primed with domain vocabulary.

    Args:
        audio_file: Audio payload passed straight through as the ``file``
            argument of ``client.audio.transcriptions.create``.

    Returns:
        The transcription returned by the API (requested with
        ``response_format="text"``), or ``None`` if the transcription call
        raised. A failed vocabulary lookup does not prevent transcription.
    """
    custom_vocab = _fetch_custom_vocab()

    prompt = (
        "你是一位專業的閩南語轉錄為中文校對助理,專門處理有關長照/日照中心的台語對話轉錄. "
        "轉錄時對於「早餐、午餐、晚餐、幼兒園、國小、國中、高中、大學」等詞需特別注意。"
    )
    if custom_vocab:
        # NOTE(review): Whisper reportedly only considers the tail of a long
        # prompt; confirm the vocabulary table stays small enough to fit.
        vocab_text = "、".join(custom_vocab)
        prompt += f"也需注意以下詞彙:{vocab_text}"

    try:
        return client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="text",
            prompt=prompt,
        )
    except Exception as e:
        # Callers rely on ``None`` (not an exception) to signal failure.
        print(f"轉錄時發生錯誤:{str(e)}")
        return None


def post_process_transcript(transcript, temperature=0):
    """Proofread a raw transcript and return the corrected text.

    The transcript is first passed through ``fuzzy_correct_chinese`` and the
    result is then sent to the ``gpt-4`` chat model together with
    ``SYSTEM_PROMPT`` and the ``ERROR_CORRECTION`` rules embedded in the user
    message.

    Args:
        transcript: Raw transcription text.
        temperature: Sampling temperature forwarded to the chat completion.

    Returns:
        The ``content`` of the first choice of the chat completion.

    Raises:
        Whatever ``fuzzy_correct_chinese`` or the OpenAI client raises; no
        exception is handled here.
    """
    corrected_transcript = fuzzy_correct_chinese(transcript)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"請校對並修正下面引號內的轉錄文本且只需回傳修正後的文本內容:「{corrected_transcript}」,重點是要查看上述文本是否還有台語的部分,若有則需要修正為繁體中文意思,沒有的話只需要順一下句子。文本裡面若出現「{ERROR_CORRECTION}」這個字典中的字,就直接以此字典內的規則修正文本。最後強調,只需要回傳順過之後的文本,不用加其他不相干的字或是說明。",
        },
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        temperature=temperature,
        messages=messages,
    )
    return response.choices[0].message.content


def process_audio(audio_data):
    """Run the full pipeline on *audio_data*.

    Args:
        audio_data: Audio payload forwarded unchanged to ``transcribe``.

    Returns:
        A ``(raw_transcript, corrected_transcript)`` tuple, or
        ``(None, None)`` when transcription failed.
    """
    raw_transcript = transcribe(audio_data)
    print(raw_transcript)
    if raw_transcript is None:
        return None, None

    corrected_transcript = post_process_transcript(raw_transcript)
    return raw_transcript, corrected_transcript