Browse Source

Initial commit

conrad 2 months ago
commit
d6dafdf872

+ 4 - 0
.gitignore

@@ -0,0 +1,4 @@
+.env
+chroma_db/
+__pycache__/
+speech_audio/

+ 53 - 0
api/openai_scripts_chinese/audio_processing.py

@@ -0,0 +1,53 @@
+from openai import OpenAI
+from api.openai_scripts_chinese.config import SYSTEM_PROMPT, OPENAI_API_KEY, SUPABASE_KEY, SUPABASE_URL
+from supabase import create_client, Client
+from api.openai_scripts_chinese.text_processing import fuzzy_correct_chinese
+
client = OpenAI(api_key=OPENAI_API_KEY)
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

def transcribe(audio_file):
    """Transcribe an audio file with Whisper, biased toward domain vocabulary.

    Fetches custom terms from the Supabase ``word_database`` table and passes
    them as a prompt hint to the Whisper API.

    Returns the transcript text, or None if transcription fails.
    """
    try:
        table_name = "word_database"
        response = supabase.table(table_name).select("term").execute()
        custom_vocab = []
        if response.data:
            # Collect plain term strings. The previous code appended
            # single-element sets, so the prompt rendered as "[{'词'}, ...]".
            custom_vocab = [item['term'] for item in response.data]
        else:
            # NOTE(review): newer supabase-py responses may not expose `.error`;
            # use getattr so the fallback message itself cannot raise.
            print(f"No data found or an error occurred: {getattr(response, 'error', None)}")
            print("Using default dictionary as Supabase data couldn't be fetched.")
        transcript = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="text",
            # Join with the Chinese enumeration comma for a readable hint.
            prompt=f"請注意以下詞彙:{'、'.join(custom_vocab)}"
        )
        return transcript
    except Exception as e:
        print(f"轉錄時發生錯誤:{str(e)}")
        return None
+
def post_process_transcript(transcript, temperature=0):
    """Proofread a raw transcript with GPT-4 after fuzzy dictionary correction."""
    # Apply pinyin-based fuzzy correction before handing the text to the LLM.
    corrected_transcript = fuzzy_correct_chinese(transcript)

    user_content = f"請校對並修正以下轉錄文本,但不要改變其原意或回答問題:\n\n{corrected_transcript}"
    response = client.chat.completions.create(
        model="gpt-4",
        temperature=temperature,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
    )
    return response.choices[0].message.content
+
def process_audio(audio_data):
    """Transcribe then post-process; return (raw, corrected) or (None, None)."""
    raw_transcript = transcribe(audio_data)
    print(raw_transcript)
    if raw_transcript is None:
        return None, None
    return raw_transcript, post_process_transcript(raw_transcript)

+ 31 - 0
api/openai_scripts_chinese/config.py

@@ -0,0 +1,31 @@
import os
from dotenv import load_dotenv

# Pull credentials from a local .env file into the process environment.
load_dotenv()

SUPABASE_URL: str = os.getenv('SUPABASE_URL')
SUPABASE_KEY: str = os.getenv('SUPABASE_KEY')
OPENAI_API_KEY: str = os.getenv('OPENAI_API_KEY')

if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set in the .env file")
# Previously only the Supabase settings were validated; a missing OpenAI key
# surfaced later as an opaque client error, so fail fast here as well.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY must be set in the .env file")


# System prompt steering GPT-4 proofreading of elder-care transcripts.
SYSTEM_PROMPT = """你是一位專業的中文轉錄校對助理,專門處理有關長照/日照中心的中文對話轉錄。
你的任務是:
1. 確保以下專業術語的準確性:垃圾、冷氣、電風扇、喝水、倒垃圾、上廁所、上班、看孫子、走路、起床、幾點、毛筆、冰箱、哥哥、妹妹、菜市場、夜市、工作、老闆。
2. 在必要時添加適當的標點符號,如句號、逗號
3. 使用台灣的繁體中文,確保語言表達符合台灣的用語習慣。
4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。
5. 不要回答問題、解釋概念或添加任何不在原文中的信息。
6. 如果原文是一個問句,保持它的問句形式,不要提供答案。

請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""

# Terms the fuzzy corrector may substitute in when a soundalike is detected.
CORRECT_TERMS = [
    "日照中心", "吃飯", "垃圾", "冷氣", "孫子", "電風扇", "喝水", "倒垃圾", "上廁所", "上班", "看孫子", "聯繫", "走路", "起床", "幾點", "毛筆", "電視", "冰箱", "廚房", "可以", "哥哥", "妹妹", "夜市", "工作", "老闆"
]

# Exact-match replacements applied before soundex matching (none yet).
ERROR_CORRECTION = {
}
+

+ 27 - 0
api/openai_scripts_chinese/dictionary_loader.py

@@ -0,0 +1,27 @@
+import io
+import jieba
+from supabase import create_client, Client
+from api.openai_scripts_chinese.config import SUPABASE_URL, SUPABASE_KEY
+
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


def load_word_database_dictionary_from_supabase():
    """Load jieba user-dictionary entries from the Supabase word_database table.

    Returns True when entries were loaded, False when the fetch came back empty.
    """
    response = supabase.table("word_database").select("term, weight, type").execute()
    rows = response.data
    if not rows:
        print(f"No data found or an error occurred: {response.error}")
        print("Using default dictionary as Supabase data couldn't be fetched.")
        return False

    # jieba.load_userdict accepts a file-like object: one "term weight type" per line.
    buffer = io.StringIO()
    buffer.writelines(f"{row['term']} {row['weight']} {row['type']}\n" for row in rows)
    buffer.seek(0)
    jieba.load_userdict(buffer)
    return True
+
+

+ 47 - 0
api/openai_scripts_chinese/main_script.py

@@ -0,0 +1,47 @@
+import sys
+from api.openai_scripts_chinese.dictionary_loader import load_word_database_dictionary_from_supabase
+from api.openai_scripts_chinese.audio_processing import process_audio
+
def initialize():
    """Prime jieba with the Supabase word database before any transcription."""
    if not load_word_database_dictionary_from_supabase():
        print("Warning: Word Database Dictionary loading failed. Proceeding with default dictionary.")
+
def process_audio_file(audio_file):
    """Wrap process_audio, guaranteeing a (raw, corrected) 2-tuple on any outcome."""
    try:
        result = process_audio(audio_file)
    except Exception as e:
        print(f"Error processing audio: {str(e)}")
        return None, None
    if isinstance(result, tuple) and len(result) == 2:
        return result
    print("Unexpected result from process_audio")
    return None, None
+    
# TODO: add a check for whether the user is asking one of the specific predefined questions
def main(audio_file):
    """Initialize dictionaries, process the audio file, and return both transcripts.

    Returns (raw_transcript, corrected_transcript); (None, None) on any failure.
    """
    initialize()
    try:
        raw_transcript, corrected_transcript = process_audio_file(audio_file)
        return raw_transcript, corrected_transcript
    except FileNotFoundError:
        # No placeholder needed, so a plain string (was a pointless f-string).
        print("Error: The file was not found.")
        return None, None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None
+        
+
+
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the audio file path.
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <audio_file_path>")
        sys.exit(1)
    main(sys.argv[1])

+ 72 - 0
api/openai_scripts_chinese/text_processing.py

@@ -0,0 +1,72 @@
+import jieba
+from pypinyin import pinyin, Style
+from api.openai_scripts_chinese.config import CORRECT_TERMS, ERROR_CORRECTION
+
def chinese_soundex(pinyin_str):
    """Collapse a numbered-pinyin string into a 4-character soundex-style code.

    Similar-sounding initials/finals share a code character; the first code
    character is kept verbatim, the rest are deduplicated and sorted, and the
    trailing tone digit (default '0') is appended. Padded to length 4 with '0'.
    """
    soundex_map = {
        'b': '1', 'p': '1', 'm': '1', 'f': '1',
        'd': '2', 'n': '2', 'l': '2',
        'g': '3', 'k': '3', 'h': '3', 't': '3',
        'j': '4', 'q': '4', 'x': '4',
        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
        'z': '6', 'c': '6', 's': '6',
        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
        'ong': '9', 'un': '9', 'uan': '9',
        'i': 'A', 'u': 'A', 'v': 'A',
        'e': 'B', 'o': 'B',
    }

    code = ''
    tone = '0'
    i = 0
    while i < len(pinyin_str):
        # Longest match first. Bug fix: 3-char finals such as 'ong' and 'uan'
        # were unreachable before, because only 2- and 1-char windows were tried.
        for length in (3, 2, 1):
            chunk = pinyin_str[i:i + length]
            if chunk in soundex_map:
                code += soundex_map[chunk]
                i += length
                break
        else:
            if pinyin_str[i].isdigit():
                # A digit is the tone marker appended by Style.TONE3.
                tone = pinyin_str[i]
            # Unknown characters are skipped.
            i += 1

    # Keep the leading char; dedupe and sort the rest for order-insensitivity.
    code = code[:1] + ''.join(sorted(set(code[1:])))
    return (code[:3] + tone).ljust(4, '0')
+
def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Return True when the two Chinese words sound alike under the soundex scheme.

    With tone_sensitive=False the trailing tone digit is ignored.
    """
    def to_soundex(word):
        syllables = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return chinese_soundex(''.join(s[0] for s in syllables))

    soundex1 = to_soundex(word1)
    soundex2 = to_soundex(word2)
    return soundex1 == soundex2 if tone_sensitive else soundex1[:3] == soundex2[:3]
+
def fuzzy_correct_chinese(text):
    """Replace soundalike mis-transcriptions in `text` with known-correct terms.

    Segments with jieba, applies exact fixes from ERROR_CORRECTION first, then
    falls back to tone-sensitive soundex matching against CORRECT_TERMS.
    ASCII-alphabetic (e.g. English) tokens pass through untouched.
    """
    corrected_words = []
    for word in jieba.lcut(text):
        # Bug fix: bare str.isalpha() is True for CJK ideographs too, which
        # skipped every Chinese token and disabled correction entirely.
        # Only pure-ASCII alphabetic tokens should be passed through.
        if word.isascii() and word.isalpha():
            corrected_words.append(word)
            continue
        if word in ERROR_CORRECTION:
            corrected_words.append(ERROR_CORRECTION[word])
            continue
        # (Removed dead per-word pinyin computation that fed only a debug print.)
        for term in CORRECT_TERMS:
            if compare_chinese_words(word, term, tone_sensitive=True):
                corrected_words.append(term)
                break
        else:
            corrected_words.append(word)
    return ''.join(corrected_words)
+

+ 97 - 0
api/openai_scripts_new/audio_processing.py

@@ -0,0 +1,97 @@
+from openai import OpenAI
+from api.openai_scripts_new.config import SYSTEM_PROMPT, OPENAI_API_KEY, SUPABASE_KEY, SUPABASE_URL
+from supabase import create_client, Client
+from api.openai_scripts_new.text_processing import fuzzy_correct_chinese
+from transformers import pipeline
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_community.callbacks import get_openai_callback
+from langchain_core.output_parsers import StrOutputParser
+from langchain_openai import ChatOpenAI
+import torchaudio
+import torch
+
client = OpenAI(api_key=OPENAI_API_KEY)
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Loaded once at import time; model initialization is expensive.
pipe = pipeline(model="linshoufan/linshoufanfork-whisper-small-nan-tw-pinyin")

def transcribe(audio_data):
    """Transcribe Taiwanese audio to Tai-lo pinyin, then translate to Traditional Chinese.

    A fine-tuned Whisper pipeline emits Tai-lo romanization; GPT-4o then
    renders it as Traditional Chinese. Returns the translated text, or None
    on any failure.
    """
    try:
        # Bug fix: the pipeline result was stored in `text`, but every later
        # use referenced an undefined `transcript`, raising NameError.
        transcript = pipe(audio_data)["text"]
        print(transcript)

        # Translate the romanization into Traditional Chinese.
        llm = ChatOpenAI(model_name="gpt-4o", temperature=0.7, api_key=OPENAI_API_KEY, max_tokens=4096)
        with get_openai_callback() as cb:
            qa_system_prompt = f"""你是一個專門翻譯台羅拼音的助理,可以將台語音精準的轉換成在繁體中文中的意思.
    你是一名資深的大語言模型領域的專家,精通模型架構原理和落地應用實踐,只需要翻譯成繁體中文即可."""

            qa_prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", qa_system_prompt),
                    ("human", "{transcript}"),
                ]
            )

            chain = qa_prompt | llm | StrOutputParser()
            text = chain.invoke({"transcript": transcript})

            # Token/cost accounting from the LangChain OpenAI callback.
            print(f"Total Tokens: {cb.total_tokens}")
            print(f"Prompt Tokens: {cb.prompt_tokens}")
            print(f"Completion Tokens: {cb.completion_tokens}")
            print(f"Total Cost (USD): ${cb.total_cost}")

        return text
    except Exception as e:
        print(f"轉錄時發生錯誤:{str(e)}")
        return None
+
def post_process_transcript(transcript, temperature=0):
    """Proofread a raw transcript with GPT-4 after fuzzy dictionary correction."""
    # Apply pinyin-based fuzzy correction before handing the text to the LLM.
    corrected_transcript = fuzzy_correct_chinese(transcript)

    user_content = f"請校對並修正以下轉錄文本,但不要改變其原意或回答問題:\n\n{corrected_transcript}"
    response = client.chat.completions.create(
        model="gpt-4",
        temperature=temperature,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
    )
    return response.choices[0].message.content
+
def process_audio(audio_data):
    """Transcribe then post-process; return (raw, corrected) or (None, None)."""
    raw_transcript = transcribe(audio_data)
    print(raw_transcript)
    if raw_transcript is None:
        return None, None
    return raw_transcript, post_process_transcript(raw_transcript)
+

+ 33 - 0
api/openai_scripts_new/config.py

@@ -0,0 +1,33 @@
import os
from dotenv import load_dotenv

# Pull credentials from a local .env file into the process environment.
load_dotenv()

SUPABASE_URL: str = os.getenv('SUPABASE_URL')
SUPABASE_KEY: str = os.getenv('SUPABASE_KEY')
OPENAI_API_KEY: str = os.getenv('OPENAI_API_KEY')

if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set in the .env file")
# Previously only the Supabase settings were validated; a missing OpenAI key
# surfaced later as an opaque client error, so fail fast here as well.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY must be set in the .env file")


# System prompt steering GPT-4 proofreading of Tai-gi-to-Chinese transcripts.
SYSTEM_PROMPT = """你是一位專業的閩南語轉錄為中文校對助理,專門處理有關長照/日照中心的台語對話轉錄。
你的任務是:
1. 確保以下專業術語在台語拼音轉換時的準確性:什麼、看什麼、可以、什麼意思、垃圾、冷氣、電風扇、喝水、倒垃圾、上廁所、上班、看孫子、走路、起床、幾點、毛筆、冰箱、哥哥、妹妹、菜市場、夜市、工作、老闆。
2. 在必要時添加適當的標點符號,如句號、逗號
3. 使用台灣的繁體中文,確保語言表達符合台灣的用語習慣。
4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。
5. 不要回答問題、解釋概念或添加任何不在原文中的信息。
6. 如果原文是一個問句,保持它的問句形式,不要提供答案。

請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""

# Terms the fuzzy corrector may substitute in when a soundalike is detected.
CORRECT_TERMS = [
    "日照中心", "吃飯", "垃圾", "冷氣", "孫子", "電風扇", "喝水", "倒垃圾", "上廁所", "上班", "看孫子", "聯繫", "走路", "起床", "幾點", "毛筆", "電視", "冰箱", "廚房", "可以", "哥哥", "妹妹", "夜市", "工作", "老闆", "什麼"
]

# Exact-match replacements applied before soundex matching.
ERROR_CORRECTION = {
    '三歲' : "什麼", 
    '跨三回' : '看什麼'
}
+

+ 27 - 0
api/openai_scripts_new/dictionary_loader.py

@@ -0,0 +1,27 @@
+import io
+import jieba
+from supabase import create_client, Client
+from api.openai_scripts_new.config import SUPABASE_URL, SUPABASE_KEY
+
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
+
+
+def load_word_database_dictionary_from_supabase():
+    table_name = "word_database"
+    response = supabase.table(table_name).select("term, weight, type").execute()
+    
+    if response.data:
+        dict_data = io.StringIO()
+        for item in response.data:
+            dict_data.write(f"{item['term']} {item['weight']} {item['type']}\n")
+        
+        dict_data.seek(0)
+        jieba.load_userdict(dict_data)
+        # print("Loaded dictionary from Supabase")
+        return True
+    else:
+        print(f"No data found or an error occurred: {response.error}")
+        print("Using default dictionary as Supabase data couldn't be fetched.")
+        return False
+
+

+ 47 - 0
api/openai_scripts_new/main_script.py

@@ -0,0 +1,47 @@
+import sys
+from api.openai_scripts_new.dictionary_loader import load_word_database_dictionary_from_supabase
+from api.openai_scripts_new.audio_processing import process_audio
+
def initialize():
    """Prime jieba with the Supabase word database before any transcription."""
    if not load_word_database_dictionary_from_supabase():
        print("Warning: Word Database Dictionary loading failed. Proceeding with default dictionary.")
+
+
def process_audio_file(audio_data):
    """Wrap process_audio, guaranteeing a (raw, corrected) 2-tuple on any outcome."""
    try:
        result = process_audio(audio_data)
    except Exception as e:
        print(f"Error processing audio: {str(e)}")
        return None, None
    if isinstance(result, tuple) and len(result) == 2:
        return result
    print("Unexpected result from process_audio")
    return None, None
+    
# TODO: add a check for whether the user is asking one of the specific predefined questions
def main(audio_data):
    """Initialize dictionaries, process the audio, and return both transcripts.

    Returns (raw_transcript, corrected_transcript); (None, None) on any failure.
    """
    initialize()
    try:
        raw_transcript, corrected_transcript = process_audio_file(audio_data)
        return raw_transcript, corrected_transcript
    except FileNotFoundError:
        # No placeholder needed, so a plain string (was a pointless f-string).
        print("Error: The file was not found.")
        return None, None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None
+        
+
+
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the audio file path.
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <audio_file_path>")
        sys.exit(1)
    main(sys.argv[1])

+ 72 - 0
api/openai_scripts_new/text_processing.py

@@ -0,0 +1,72 @@
+import jieba
+from pypinyin import pinyin, Style
+from api.openai_scripts_new.config import CORRECT_TERMS, ERROR_CORRECTION
+
def chinese_soundex(pinyin_str):
    """Collapse a numbered-pinyin string into a 4-character soundex-style code.

    Similar-sounding initials/finals share a code character; the first code
    character is kept verbatim, the rest are deduplicated and sorted, and the
    trailing tone digit (default '0') is appended. Padded to length 4 with '0'.
    """
    soundex_map = {
        'b': '1', 'p': '1', 'm': '1', 'f': '1',
        'd': '2', 'n': '2', 'l': '2',
        'g': '3', 'k': '3', 'h': '3', 't': '3',
        'j': '4', 'q': '4', 'x': '4',
        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
        'z': '6', 'c': '6', 's': '6',
        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
        'ong': '9', 'un': '9', 'uan': '9',
        'i': 'A', 'u': 'A', 'v': 'A',
        'e': 'B', 'o': 'B',
    }

    code = ''
    tone = '0'
    i = 0
    while i < len(pinyin_str):
        # Longest match first. Bug fix: 3-char finals such as 'ong' and 'uan'
        # were unreachable before, because only 2- and 1-char windows were tried.
        for length in (3, 2, 1):
            chunk = pinyin_str[i:i + length]
            if chunk in soundex_map:
                code += soundex_map[chunk]
                i += length
                break
        else:
            if pinyin_str[i].isdigit():
                # A digit is the tone marker appended by Style.TONE3.
                tone = pinyin_str[i]
            # Unknown characters are skipped.
            i += 1

    # Keep the leading char; dedupe and sort the rest for order-insensitivity.
    code = code[:1] + ''.join(sorted(set(code[1:])))
    return (code[:3] + tone).ljust(4, '0')
+
def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Return True when the two Chinese words sound alike under the soundex scheme.

    With tone_sensitive=False the trailing tone digit is ignored.
    """
    def to_soundex(word):
        syllables = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return chinese_soundex(''.join(s[0] for s in syllables))

    soundex1 = to_soundex(word1)
    soundex2 = to_soundex(word2)
    return soundex1 == soundex2 if tone_sensitive else soundex1[:3] == soundex2[:3]
+
def fuzzy_correct_chinese(text):
    """Replace soundalike mis-transcriptions in `text` with known-correct terms.

    Segments with jieba, applies exact fixes from ERROR_CORRECTION first, then
    falls back to tone-sensitive soundex matching against CORRECT_TERMS.
    ASCII-alphabetic (e.g. English) tokens pass through untouched.
    """
    corrected_words = []
    for word in jieba.lcut(text):
        # Bug fix: bare str.isalpha() is True for CJK ideographs too, which
        # skipped every Chinese token and disabled correction entirely.
        # Only pure-ASCII alphabetic tokens should be passed through.
        if word.isascii() and word.isalpha():
            corrected_words.append(word)
            continue
        if word in ERROR_CORRECTION:
            corrected_words.append(ERROR_CORRECTION[word])
            continue
        # (Removed dead per-word pinyin computation that fed only a debug print.)
        for term in CORRECT_TERMS:
            if compare_chinese_words(word, term, tone_sensitive=True):
                corrected_words.append(term)
                break
        else:
            corrected_words.append(word)
    return ''.join(corrected_words)
+

+ 54 - 0
api/openai_scripts_tai_gi/audio_processing.py

@@ -0,0 +1,54 @@
+from openai import OpenAI
+from api.openai_scripts_tai_gi.config import SYSTEM_PROMPT, OPENAI_API_KEY, SUPABASE_KEY, SUPABASE_URL, ERROR_CORRECTION
+from supabase import create_client, Client
+from api.openai_scripts_tai_gi.text_processing import fuzzy_correct_chinese
+
client = OpenAI(api_key=OPENAI_API_KEY)
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

def transcribe(audio_file):
    """Transcribe an audio file with Whisper, biased toward domain vocabulary.

    Fetches custom terms from the Supabase ``word_database`` table and passes
    them as a prompt hint to the Whisper API.

    Returns the transcript text, or None if transcription fails.
    """
    try:
        table_name = "word_database"
        response = supabase.table(table_name).select("term").execute()
        custom_vocab = []
        if response.data:
            # Collect plain term strings. The previous code appended
            # single-element sets, so the prompt rendered as "[{'词'}, ...]".
            custom_vocab = [item['term'] for item in response.data]
        else:
            # NOTE(review): newer supabase-py responses may not expose `.error`;
            # use getattr so the fallback message itself cannot raise.
            print(f"No data found or an error occurred: {getattr(response, 'error', None)}")
            print("Using default dictionary as Supabase data couldn't be fetched.")
        transcript = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="text",
            # Join with the Chinese enumeration comma for a readable hint.
            prompt=f"轉錄時對於「早餐、午餐、晚餐、幼兒園、國小、國中、高中、大學」等詞需特別注意。也需注意以下詞彙:{'、'.join(custom_vocab)}"
        )
        return transcript
    except Exception as e:
        print(f"轉錄時發生錯誤:{str(e)}")
        return None
+
+
def post_process_transcript(transcript, temperature=0):
    """Proofread a Tai-gi transcript into fluent Traditional Chinese with GPT-4."""
    # Apply pinyin-based fuzzy correction before handing the text to the LLM.
    corrected_transcript = fuzzy_correct_chinese(transcript)

    user_content = f"請校對並修正下面引號內的轉錄文本且只需回傳修正後的文本內容:「{corrected_transcript}」,重點是要查看上述文本是否還有台語的部分,若有則需要修正為繁體中文意思,沒有的話只需要順一下句子。文本裡面若出現「{ERROR_CORRECTION}」這個字典中的字,就直接以此字典內的規則修正文本。最後強調,只需要回傳順過之後的文本,不用加其他不相干的字或是說明。"
    response = client.chat.completions.create(
        model="gpt-4",
        temperature=temperature,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
    )
    return response.choices[0].message.content
+
def process_audio(audio_data):
    """Transcribe then post-process; return (raw, corrected) or (None, None)."""
    raw_transcript = transcribe(audio_data)
    print(raw_transcript)
    if raw_transcript is None:
        return None, None
    return raw_transcript, post_process_transcript(raw_transcript)

+ 33 - 0
api/openai_scripts_tai_gi/config.py

@@ -0,0 +1,33 @@
import os
from dotenv import load_dotenv

# Pull credentials from a local .env file into the process environment.
load_dotenv()

SUPABASE_URL: str = os.getenv('SUPABASE_URL')
SUPABASE_KEY: str = os.getenv('SUPABASE_KEY')
OPENAI_API_KEY: str = os.getenv('OPENAI_API_KEY')

if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set in the .env file")
# Previously only the Supabase settings were validated; a missing OpenAI key
# surfaced later as an opaque client error, so fail fast here as well.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY must be set in the .env file")


# System prompt steering GPT-4 proofreading of Tai-gi-to-Chinese transcripts.
SYSTEM_PROMPT = """你是一位專業的閩南語轉錄為中文校對助理,專門處理有關長照/日照中心的台語對話轉錄。
你的任務是:
1. 確保以下專業術語在台語拼音轉換時的準確性:幫我、身體、什麼時候、現在、有點、冷、會不會、車、昨晚、血壓、需要、量血壓、早餐、午餐、晚餐、主任、早安、午安、晚安、聯絡簿、等車、交通車、國小、什麼、看什麼、可以、什麼意思、垃圾、冷氣、電風扇、喝水、倒垃圾、上廁所、上班、看孫子、走路、起床、幾點、毛筆、冰箱、哥哥、妹妹、菜市場、夜市、工作、老闆。
2. 在必要時添加適當的標點符號,如句號、逗號
3. 使用台灣的繁體中文,確保語言表達符合台灣的用語習慣。
4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。
5. 不要回答問題、解釋概念或添加任何不在原文中的信息。
6. 如果原文是一個問句,保持它的問句形式,不要提供答案。

請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""

# Terms the fuzzy corrector may substitute in when a soundalike is detected.
CORRECT_TERMS = ["幫我", "什麼時候", "身體", "現在", "有點", "冷", "會不會", "車", "昨晚", "血壓", "需要", "量血壓", "早餐", "午餐", "晚餐", "主任", "早安", "午安", "晚安", "聯絡簿", "等車", "交通車", "幼兒園", "國小", "國中", "高中", "大學", "日照中心", "吃飯", "垃圾", "冷氣", "孫子", "電風扇", "喝水", "倒垃圾", "上廁所", "上班", "看孫子", "聯繫", "走路", "起床", "幾點", "毛筆", "電視", "冰箱", "廚房", "可以", "哥哥", "妹妹", "夜市", "工作", "老闆", "什麼"]

# Exact-match replacements applied before soundex matching.
ERROR_CORRECTION = {
    '跨三回' : '看什麼', 
    '畫沙小' : '看三小', 
    '高通車' : '交通車', 
    '的東西搞' : '什麼時候到'
}
+

+ 27 - 0
api/openai_scripts_tai_gi/dictionary_loader.py

@@ -0,0 +1,27 @@
+import io
+import jieba
+from supabase import create_client, Client
+from api.openai_scripts_tai_gi.config import SUPABASE_URL, SUPABASE_KEY
+
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


def load_word_database_dictionary_from_supabase():
    """Load jieba user-dictionary entries from the Supabase word_database table.

    Returns True when entries were loaded, False when the fetch came back empty.
    """
    response = supabase.table("word_database").select("term, weight, type").execute()
    rows = response.data
    if not rows:
        print(f"No data found or an error occurred: {response.error}")
        print("Using default dictionary as Supabase data couldn't be fetched.")
        return False

    # jieba.load_userdict accepts a file-like object: one "term weight type" per line.
    buffer = io.StringIO()
    buffer.writelines(f"{row['term']} {row['weight']} {row['type']}\n" for row in rows)
    buffer.seek(0)
    jieba.load_userdict(buffer)
    return True
+
+

+ 47 - 0
api/openai_scripts_tai_gi/main_script.py

@@ -0,0 +1,47 @@
+import sys
+from api.openai_scripts_tai_gi.dictionary_loader import load_word_database_dictionary_from_supabase
+from api.openai_scripts_tai_gi.audio_processing import process_audio
+
def initialize():
    """Prime jieba with the Supabase word database before any transcription."""
    if not load_word_database_dictionary_from_supabase():
        print("Warning: Word Database Dictionary loading failed. Proceeding with default dictionary.")
+
+
def process_audio_file(audio_file):
    """Wrap process_audio, guaranteeing a (raw, corrected) 2-tuple on any outcome."""
    try:
        result = process_audio(audio_file)
    except Exception as e:
        print(f"Error processing audio: {str(e)}")
        return None, None
    if isinstance(result, tuple) and len(result) == 2:
        return result
    print("Unexpected result from process_audio")
    return None, None
+    
# TODO: add a check for whether the user is asking one of the specific predefined questions
def main(audio_file):
    """Initialize dictionaries, process the audio file, and return both transcripts.

    Returns (raw_transcript, corrected_transcript); (None, None) on any failure.
    """
    initialize()
    try:
        raw_transcript, corrected_transcript = process_audio_file(audio_file)
        return raw_transcript, corrected_transcript
    except FileNotFoundError:
        # No placeholder needed, so a plain string (was a pointless f-string).
        print("Error: The file was not found.")
        return None, None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None
+        
+
+
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the audio file path.
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <audio_file_path>")
        sys.exit(1)
    main(sys.argv[1])

+ 72 - 0
api/openai_scripts_tai_gi/text_processing.py

@@ -0,0 +1,72 @@
+import jieba
+from pypinyin import pinyin, Style
+from api.openai_scripts_tai_gi.config import CORRECT_TERMS, ERROR_CORRECTION
+
def chinese_soundex(pinyin_str):
    """Collapse a numbered-pinyin string into a 4-character soundex-style code.

    Similar-sounding initials/finals share a code character; the first code
    character is kept verbatim, the rest are deduplicated and sorted, and the
    trailing tone digit (default '0') is appended. Padded to length 4 with '0'.
    """
    soundex_map = {
        'b': '1', 'p': '1', 'm': '1', 'f': '1',
        'd': '2', 'n': '2', 'l': '2',
        'g': '3', 'k': '3', 'h': '3', 't': '3',
        'j': '4', 'q': '4', 'x': '4',
        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
        'z': '6', 'c': '6', 's': '6',
        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
        'ong': '9', 'un': '9', 'uan': '9',
        'i': 'A', 'u': 'A', 'v': 'A',
        'e': 'B', 'o': 'B',
    }

    code = ''
    tone = '0'
    i = 0
    while i < len(pinyin_str):
        # Longest match first. Bug fix: 3-char finals such as 'ong' and 'uan'
        # were unreachable before, because only 2- and 1-char windows were tried.
        for length in (3, 2, 1):
            chunk = pinyin_str[i:i + length]
            if chunk in soundex_map:
                code += soundex_map[chunk]
                i += length
                break
        else:
            if pinyin_str[i].isdigit():
                # A digit is the tone marker appended by Style.TONE3.
                tone = pinyin_str[i]
            # Unknown characters are skipped.
            i += 1

    # Keep the leading char; dedupe and sort the rest for order-insensitivity.
    code = code[:1] + ''.join(sorted(set(code[1:])))
    return (code[:3] + tone).ljust(4, '0')
+
def compare_chinese_words(word1, word2, tone_sensitive=True):
    """Return True when the two Chinese words sound alike under the soundex scheme.

    With tone_sensitive=False the trailing tone digit is ignored.
    """
    def to_soundex(word):
        syllables = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return chinese_soundex(''.join(s[0] for s in syllables))

    soundex1 = to_soundex(word1)
    soundex2 = to_soundex(word2)
    return soundex1 == soundex2 if tone_sensitive else soundex1[:3] == soundex2[:3]
+
def fuzzy_correct_chinese(text):
    """Replace soundalike mis-transcriptions in `text` with known-correct terms.

    Segments with jieba, applies exact fixes from ERROR_CORRECTION first, then
    falls back to tone-sensitive soundex matching against CORRECT_TERMS.
    ASCII-alphabetic (e.g. English) tokens pass through untouched.
    """
    corrected_words = []
    for word in jieba.lcut(text):
        # Bug fix: bare str.isalpha() is True for CJK ideographs too, which
        # skipped every Chinese token and disabled correction entirely.
        # Only pure-ASCII alphabetic tokens should be passed through.
        if word.isascii() and word.isalpha():
            corrected_words.append(word)
            continue
        if word in ERROR_CORRECTION:
            corrected_words.append(ERROR_CORRECTION[word])
            continue
        # (Removed dead per-word pinyin computation that fed only a debug print.)
        for term in CORRECT_TERMS:
            if compare_chinese_words(word, term, tone_sensitive=True):
                corrected_words.append(term)
                break
        else:
            corrected_words.append(word)
    return ''.join(corrected_words)
+

+ 112 - 0
api/whisper.py

@@ -0,0 +1,112 @@
+# import os, sys
+# from typing import List, Any
+from fastapi import Request, APIRouter, UploadFile, File
+from fastapi.responses import FileResponse, PlainTextResponse
+# from fastapi import FastAPI, HTTPException, status
+# from fastapi.middleware.cors import CORSMiddleware
+# import uvicorn
+# from fastapi.exceptions import HTTPException
+# from fastapi.encoders import jsonable_encoder
+# from urllib.parse import urlparse, urljoin
+from pathlib import Path
+from api.openai_scripts_tai_gi.main_script import main
+from api.openai_scripts_chinese.main_script import main as main2
+# from api.openai_scripts_new.main_script import main as main3
+from datetime import datetime
+import random
+import string
+
+router = APIRouter()
+
+# router = FastAPI()
+# router.add_middleware(
+#     CORSMiddleware,
+#     allow_origins=["*"],
+#     allow_credentials=True,
+#     allow_methods=["*"],
+#     allow_headers=["*"],
+# )
+
+
@router.post('/tai_gi')
async def whisper_auto(file: UploadFile = File()):
    """Accept an uploaded audio file and return the corrected Tai-gi transcript."""
    # PEP 8: identity comparison for None (was `file == None`).
    if file is None:
        return {'message': '請上傳檔案'}
    extension = file.filename.split(".")[-1]
    if extension not in ("mp3", "wav", "webm"):
        return PlainTextResponse("Audio must be mp3, wav or webm format!", 400)
    # Unique name: timestamp + 3 random letters, under <repo>/speech_audio/.
    filename = Path(__file__).parent.parent/'speech_audio'/datetime.now().strftime(f"%Y%m%d%H%M%S_{''.join(random.sample(string.ascii_lowercase, 3))}.{extension}")
    # speech_audio/ is git-ignored, so it may not exist on a fresh checkout.
    filename.parent.mkdir(parents=True, exist_ok=True)
    with open(filename, 'wb') as f:
        f.write(await file.read())
    with open(filename, 'rb') as f:
        raw_transcript, corrected_transcript = main(f)
    if raw_transcript and corrected_transcript:
        return {'message': corrected_transcript}
    else:
        return {"message": "Audio processing failed."}
+        
+        
@router.post('/chinese')
async def whisper_auto(file: UploadFile = File()):
    """Accept an uploaded audio file and return the corrected Mandarin transcript."""
    # NOTE(review): this redefines `whisper_auto` above (flake8 F811). FastAPI
    # routing still works (registration happens at decoration time), but the
    # function deserves a distinct name.
    # PEP 8: identity comparison for None (was `file == None`).
    if file is None:
        return {'message': '請上傳檔案'}
    extension = file.filename.split(".")[-1]
    if extension not in ("mp3", "wav", "webm"):
        return PlainTextResponse("Audio must be mp3, wav or webm format!", 400)
    # Unique name: timestamp + 3 random letters, under <repo>/speech_audio/.
    filename = Path(__file__).parent.parent/'speech_audio'/datetime.now().strftime(f"%Y%m%d%H%M%S_{''.join(random.sample(string.ascii_lowercase, 3))}.{extension}")
    # speech_audio/ is git-ignored, so it may not exist on a fresh checkout.
    filename.parent.mkdir(parents=True, exist_ok=True)
    with open(filename, 'wb') as f:
        f.write(await file.read())
    with open(filename, 'rb') as f:
        raw_transcript, corrected_transcript = main2(f)
    if raw_transcript and corrected_transcript:
        return {'message': corrected_transcript}
    else:
        return {"message": "Audio processing failed."}
+        
+
+# import numpy as np
+# from transformers import pipeline
+# import gradio as gr
+
+# @router.post('/tai_gi_new')
+# async def whisper_auto(file: UploadFile = File()):
+#     if file == None:
+#         return {'message': '請上傳檔案'}
+#     extension = file.filename.split(".")[-1] 
+#     if extension not in ("mp3", "wav", "webm"):
+#         return PlainTextResponse("Audio must be mp3, wav or webm format!", 400)
+#     filename = Path(__file__).parent/'speech_audio'/datetime.now().strftime(f"%Y%m%d%H%M%S_{''.join(random.sample(string.ascii_lowercase, 3))}.{extension}")
+#     with open(filename, 'wb') as f:
+#         f.write(await file.read())
+#     # with open(filename, 'rb') as f:
+#         # raw_transcript, corrected_transcript = main3(f)
+#     # audio_data, sample_rate = librosa.load(filename, sr=None)
+#     # 讀取音頻文件
+#     audio_input = gr.Audio(type="filepath")
+#     raw_transcript, corrected_transcript = main3(audio_input(filename))
+#     # if raw_transcript and corrected_transcript:
+#         # os.remove(filename)
+#         # return {'message': corrected_transcript}
+#     # else:
+#         # os.remove(filename)
+#         # return {"message": "Audio processing failed."}
+#     if raw_transcript and corrected_transcript:
+#         # os.remove(save_path)
+#         return {'message': {"Raw transcript": raw_transcript, "Corrected transcript": corrected_transcript}}
+#     else:
+#         # os.remove(save_path)
+#         return {"message": "Audio processing failed."}
+
+        
+# if __name__ == "__main__":
+#     uvicorn.run("whisper:router", reload=False, port=8086, host='cmm.ai', ssl_keyfile="/etc/letsencrypt/live/cmm.ai/privkey.pem", ssl_certfile="/etc/letsencrypt/live/cmm.ai/fullchain.pem")

+ 83 - 0
app.py

@@ -0,0 +1,83 @@
+import uvicorn
+
+from typing import List
+from pydantic import BaseModel
+from fastapi import FastAPI, Body
+from fastapi.middleware.cors import CORSMiddleware
+from langchain.schema import SystemMessage, AIMessage, HumanMessage
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import ChatOpenAI
+from langchain.callbacks import get_openai_callback
+
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+
+from supabase.client import Client, create_client
+supabase_url = os.environ.get("SUPABASE_URL")
+supabase_key = os.environ.get("SUPABASE_KEY")
+supabase: Client = create_client(supabase_url, supabase_key)
+
# Single shared chat model instance used by the /chat/ endpoint.
chat_model = ChatOpenAI(model='gpt-4o-mini')

app = FastAPI()
# Open CORS so the separately-hosted frontend can call this API.
# NOTE(review): browsers reject allow_origins=["*"] combined with
# allow_credentials=True for credentialed requests — confirm whether
# credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

from semantic_search import semantic_cache
from api import whisper
# Speech-to-text endpoints are mounted under /whisper.
app.include_router(whisper.router, prefix='/whisper', tags=['whisper'])
+
+
class ChatHistoryItem(BaseModel):
    """One prior conversation turn sent by the client in the request body."""
    # q: the user's earlier question; a: the assistant's earlier answer.
    q: str
    a: str
+
@app.post("/chat/")
async def chat(message, chat_history: List[ChatHistoryItem] = Body(...)):
    """Answer *message*, preferring the semantic cache over a new LLM call.

    The request body carries the prior conversation as ChatHistoryItem
    q/a pairs; every answered question is persisted via save_history.

    Returns:
        dict: {"message": <answer text>} — cached answer on a cache hit,
        otherwise the model's reply.
    """
    print(chat_history)
    messages_list = [
        SystemMessage(content="你是一名日照中心的志工,你的職責是陪伴老人聊天,你需要考量到老人的健康與安全,並能安撫老人,只有家人准許時才能離開日照中心。請用繁體中文"),
        AIMessage(content="你好!很高興能和您聊天。今天您過得怎麼樣呢?有沒有什麼想分享的事情?")
    ]

    for item in chat_history:
        # Skip the "string" placeholders Swagger UI sends for an empty body.
        if item.q == "string" or item.a == "string":
            continue
        messages_list.append(HumanMessage(content=item.q))
        messages_list.append(AIMessage(content=item.a))

    messages_list.append(HumanMessage(content=message))
    print(messages_list)

    # Serve from the semantic cache first: a hit costs no OpenAI tokens,
    # so there is no need to build the prompt or open the callback yet.
    cache_question, cache_answer = semantic_cache(supabase, message)
    if cache_answer:
        save_history(message, cache_answer)
        return {"message": cache_answer}

    prompt = ChatPromptTemplate(messages=messages_list)

    # BUG FIX: the callback context previously wrapped only the cache
    # lookup (which never reaches OpenAI's chat API), so token usage was
    # never tracked. Wrap the actual model invocation instead.
    with get_openai_callback() as cb:
        answer = chat_model.invoke(prompt.format_messages()).content

    save_history(message, answer)
    return {"message": answer}
+
def save_history(question, answer):
    """Persist one question/answer exchange to the INNOLUX_record table."""
    record = {"question": question, "answer": answer}
    supabase.table("INNOLUX_record").insert(record).execute()
+    
if __name__ == "__main__":
    
    # Serve over HTTPS on cmm.ai:8087 with the Let's Encrypt certificate;
    # reload is disabled because this entry point runs in production.
    uvicorn.run("app:app", reload=False, port=8087, host='cmm.ai', ssl_keyfile="/etc/letsencrypt/live/cmm.ai/privkey.pem", ssl_certfile="/etc/letsencrypt/live/cmm.ai/fullchain.pem")
+
+

+ 60 - 0
semantic_search.py

@@ -0,0 +1,60 @@
+### Python = 3.9
+import os
+from dotenv import load_dotenv
+load_dotenv('.env')
+
+import openai 
+openai_api_key = os.getenv("OPENAI_API_KEY")
+openai.api_key = openai_api_key
+
+from langchain_openai import OpenAIEmbeddings
+embeddings_model = OpenAIEmbeddings()
+
+from langchain_community.document_loaders.csv_loader import CSVLoader
+from langchain_community.vectorstores import Chroma
+
+import pandas as pd
+import re
+
+from langchain_community.embeddings.openai import OpenAIEmbeddings
+from langchain_community.vectorstores import SupabaseVectorStore
+from supabase.client import create_client
+
def create_qa_vectordb(supabase, vectordb_directory="./chroma_db"):
    """Rebuild the question vector store from the INNOLUX_cache table.

    Any existing Chroma collection under *vectordb_directory* is dropped
    first, then every cached question is re-embedded and persisted there.

    Returns:
        Chroma: the freshly built vector store.
    """
    # Wipe the previous collection so stale questions do not linger.
    if os.path.isdir(vectordb_directory):
        stale_store = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
        stale_store.delete_collection()

    rows = supabase.table("INNOLUX_cache").select("question, answer").execute()
    question_texts = [record["question"] for record in rows.data]

    return Chroma.from_texts(
        texts=question_texts,
        embedding=embeddings_model,
        persist_directory=vectordb_directory,
    )
+
def semantic_cache(supabase, q, SIMILARITY_THRESHOLD=0.83, k=1, vectordb_directory="./chroma_db"):
    """Return a cached answer for *q* if a semantically similar question exists.

    Looks up *q* in the persisted Chroma store (building it from Supabase
    on first use) and, when the best match's relevance score reaches
    SIMILARITY_THRESHOLD, fetches its stored answer.

    Returns:
        tuple: (cached_question, cached_answer) on a hit, (None, None) otherwise.
    """
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
    else:
        print("create new vector db ...")
        vectorstore = create_qa_vectordb(supabase, vectordb_directory)

    # BUG FIX: the original hard-coded k=1 here, silently ignoring the
    # caller-supplied k parameter.
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(q, k=k)
    if not docs_and_scores:
        # Empty vector store: nothing to match against.
        return None, None

    # Only the top-ranked match decides the cache hit.
    doc, score = docs_and_scores[0]
    print(score)

    if score >= SIMILARITY_THRESHOLD:
        cache_question = doc.page_content

        response = supabase.table("INNOLUX_cache").select("question, answer").eq("question", cache_question).execute()
        if not response.data:
            # Vector store and cache table are out of sync; treat as a miss
            # instead of raising IndexError.
            return None, None
        return cache_question, response.data[0]["answer"]
    else:
        return None, None