Parcourir la source

get jieba dictionary from supabase

SherryLiu il y a 4 mois
Parent
commit
6799399033
7 fichiers modifiés avec 199 ajouts et 207 suppressions
  1. 6 8
      README.md
  2. 0 199
      llama_asr.py
  3. 40 0
      src/audio_processing.py
  4. 33 0
      src/config.py
  5. 25 0
      src/dictionary_loader.py
  6. 29 0
      src/main_script.py
  7. 66 0
      src/text_processing.py

+ 6 - 8
README.md

@@ -1,14 +1,12 @@
 # 語音轉文字 ASR
+### 自定義字典從Supabase下載
 
-### Prerequisite
-- 語音檔存放需放在`data/`
-- `environment.env`放openai key
+## Prerequisite 
+- 語音檔需存放於`src/data/`
+- openai key & supabase keys 放`src/environment.env`
 
 ### To run the code
 `pip install -r requirements.txt`
 
-執行所有在data目錄下的語音檔
-`python whisper.py`
-
-執行單個語音檔
-`python whisper.py --file 語音檔名`
+### 執行主程式
+`python src/main_script.py`

+ 0 - 199
llama_asr.py

@@ -1,199 +0,0 @@
-import os
-import argparse
-from openai import OpenAI
-from dotenv import load_dotenv
-import jieba
-from datetime import datetime
-from pypinyin import pinyin, Style
-from langchain_community.chat_models import ChatOllama
-from langchain.schema import HumanMessage, SystemMessage
-
-load_dotenv('environment.env')
-client = OpenAI()
-
-local_llm = "llama3-groq-tool-use:latest"
-llm = ChatOllama(model=local_llm, temperature=0)
-
-system_prompt = """你是一位專業的轉錄校對助理,專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。
-你的任務是:
-1. 確保以下專業術語的準確性:溫室氣體、碳排放、碳管理、碳盤查、碳權交易、碳足跡、淨零排放、碳權。
-2. 在必要時添加適當的標點符號,如句號、逗號
-3. 使用台灣的繁體中文,確保語言表達符合台灣的用語習慣。
-4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。
-5. 不要回答問題、解釋概念或添加任何不在原文中的信息。
-6. 如果原文是一個問句,保持它的問句形式,不要提供答案。
-
-請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""
-
-def transcribe(audio_file):
-    try:
-        transcript = client.audio.transcriptions.create(
-            file=audio_file,
-            model="whisper-1",
-            response_format="text"
-        )
-        return transcript
-    except Exception as e:
-        print(f"轉錄時發生錯誤:{str(e)}")
-        return None
-
-def save_output(file_name, raw_transcript, corrected_transcript):
-    output_dir = "output"
-    os.makedirs(output_dir, exist_ok=True)
-    
-    output_file = os.path.join(output_dir, "transcription_results.txt")
-    
-    with open(output_file, "a", encoding="utf-8") as f:
-        f.write(f"\n{'='*50}\n")
-        f.write(f"文件名: {file_name}\n")
-        f.write(f"處理時間: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-        f.write("原始轉錄:\n")
-        f.write(f"{raw_transcript}\n\n")
-        f.write("修正後的轉錄:\n")
-        f.write(f"{corrected_transcript}\n")
-
-def process_audio_file(file_path):
-    try:
-        with open(file_path, "rb") as audio_file:
-            file_size = os.path.getsize(file_path) / (1024 * 1024)  # 轉換為 MB
-            if file_size > 25:
-                print(f"警告:文件 {os.path.basename(file_path)} 大小為 {file_size:.2f} MB,超過了 25 MB 的限制。可能無法處理。")
-
-            print(f"\n處理文件:{os.path.basename(file_path)}")
-            raw_transcript = transcribe(audio_file)
-            if raw_transcript is None:
-                return
-
-            print("\n原始轉錄:")
-            print(raw_transcript)
-
-            corrected_transcript = post_process_transcript(raw_transcript)
-            print("\n修正後的轉錄:")
-            print(corrected_transcript)
-
-            save_output(os.path.basename(file_path), raw_transcript, corrected_transcript)
-
-    except Exception as e:
-        print(f"處理文件 {os.path.basename(file_path)} 時發生錯誤:{str(e)}")
-
-def process_folder(folder_path):
-    processed_files = 0
-
-    for filename in os.listdir(folder_path):
-        if filename.lower().endswith((".mp3", ".wav", ".m4a")):
-            file_path = os.path.join(folder_path, filename)
-            process_audio_file(file_path)
-            processed_files += 1
-
-    print("\n=== 總結 ===")
-    print(f"處理的文件數:{processed_files}")
-
-def chinese_soundex(pinyin_str):
-    soundex_map = {
-        'b': '1', 'p': '1', 'm': '1', 'f': '1',
-        'd': '2', 't': '2', 'n': '2', 'l': '2',
-        'g': '3', 'k': '3', 'h': '3',
-        'j': '4', 'q': '4', 'x': '4',
-        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
-        'z': '6', 'c': '6', 's': '6',
-        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
-        'ong': '9', 'un': '9', 'uan': '9',
-        'i': 'A', 'u': 'A', 'v': 'A',  # 'v' is used for 'ü' in some systems
-        'e': 'B', 'o': 'B',
-    }
-    
-    code = ''
-    tone = '0'
-    i = 0
-    while i < len(pinyin_str):
-        if pinyin_str[i:i+2] in soundex_map:
-            code += soundex_map[pinyin_str[i:i+2]]
-            i += 2
-        elif pinyin_str[i] in soundex_map:
-            code += soundex_map[pinyin_str[i]]
-            i += 1
-        elif pinyin_str[i].isdigit():
-            tone = pinyin_str[i]
-            i += 1
-        else:
-            i += 1
-    
-    code = code[:1] + ''.join(sorted(set(code[1:])))
-    return (code[:3] + tone).ljust(4, '0')
-
-def compare_chinese_words(word1, word2, tone_sensitive=True):
-    pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)])
-    pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)])
-    
-    soundex1 = chinese_soundex(pinyin1)
-    soundex2 = chinese_soundex(pinyin2)
-    
-    if tone_sensitive:
-        return soundex1 == soundex2
-    else:
-        return soundex1[:3] == soundex2[:3]
-
-error_correction = {
-    "看拳": "碳權",
-    "看盤插": "碳盤查",
-    "盤插": "盤查",
-    "看": "碳"
-}
-
-def fuzzy_correct_chinese(text, correct_terms):
-    words = jieba.cut(text)
-    corrected_words = []
-    for word in words:
-        if word in error_correction:
-            corrected_words.append(error_correction[word])
-        else:
-            for term in correct_terms:
-                if compare_chinese_words(word, term, tone_sensitive=True):
-                    print(f"corrected: {word} -> {term}")
-                    corrected_words.append(term)
-                    break
-            else:
-                corrected_words.append(word)
-    return ''.join(corrected_words)
-
-def post_process_transcript(transcript):
-    correct_terms = ["碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費",
-                     "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "排放", "淨零",
-                     "氣候變遷法", "氣候", "氣候變遷", "法",
-                     "是什麼", "請解釋", "為什麼", "什麼意思",
-                     "台灣"]
-    
-    corrected_transcript = fuzzy_correct_chinese(transcript, correct_terms)
-    
-    # 準備輸入
-    messages = [
-        SystemMessage(content=system_prompt),
-        HumanMessage(content=f"請校對並修正以下轉錄文本,但不要改變其原意或回答問題:\n\n{corrected_transcript}")
-    ]
-    
-    # 使用 ChatOllama 生成回應
-    response = llm(messages)
-    
-    return response.content
-
-def main():
-    parser = argparse.ArgumentParser(description="處理音頻文件使用 llama3-groq-tool-use:latest 模型")
-    parser.add_argument("--file", help="要處理的單個音頻文件的路徑")
-    parser.add_argument("--folder", default="data", help="包含音頻文件的文件夾路徑(默認:data)")
-    args = parser.parse_args()
-
-    if args.file:
-        if os.path.isfile(args.file):
-            process_audio_file(args.file)
-        else:
-            print(f"錯誤:文件 '{args.file}' 不存在。")
-    elif args.folder:
-        if os.path.isdir(args.folder):
-            process_folder(args.folder)
-        else:
-            print(f"錯誤:文件夾 '{args.folder}' 不存在。")
-    else:
-        print("錯誤:請指定一個文件(--file)或文件夾(--folder)來處理。")
-
-if __name__ == "__main__":
-    main()

+ 40 - 0
src/audio_processing.py

@@ -0,0 +1,40 @@
+from openai import OpenAI
+from config import SYSTEM_PROMPT
+from text_processing import fuzzy_correct_chinese
+
+client = OpenAI()  # Module-level OpenAI client shared by transcribe() and post_process_transcript(); built with no explicit arguments.
+
+def transcribe(audio_data):  # Send audio_data to the "whisper-1" transcription endpoint; returns the transcript, or None if the call raises.
+    try:
+        transcript = client.audio.transcriptions.create(
+            file=audio_data,  # forwarded unchanged to the API call
+            model="whisper-1",
+            response_format="text"
+        )
+        return transcript
+    except Exception as e:  # best-effort: any failure is printed and reported to the caller as None
+        print(f"轉錄時發生錯誤:{str(e)}")
+        return None
+
+def post_process_transcript(transcript, temperature=0):  # Run fuzzy_correct_chinese on the transcript, then ask the "gpt-4" chat model to proofread it; returns the model's reply text.
+    corrected_transcript = fuzzy_correct_chinese(transcript)  # local term correction happens before the model sees the text
+    
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": f"請校對並修正以下轉錄文本,但不要改變其原意或回答問題:\n\n{corrected_transcript}"}
+    ]
+
+    response = client.chat.completions.create(
+        model="gpt-4",
+        temperature=temperature,  # caller-supplied; the signature defaults it to 0
+        messages=messages
+    )
+
+    return response.choices[0].message.content  # content of the first returned choice
+
+def process_audio(audio_data):  # Transcribe audio_data and post-process the result; returns (raw_transcript, corrected_transcript).
+    raw_transcript = transcribe(audio_data)
+    if raw_transcript is None:  # transcribe() returns None when the API call failed
+        return None, None  # keep the two-value return shape so callers can still unpack
+    corrected_transcript = post_process_transcript(raw_transcript)
+    return raw_transcript, corrected_transcript

+ 33 - 0
src/config.py

@@ -0,0 +1,33 @@
+import os
+from dotenv import load_dotenv
+
+load_dotenv('environment_systex.env')  # Load variables from this env file. NOTE(review): the README hunk in this commit says the keys live in `src/environment.env` - confirm the intended filename.
+
+SUPABASE_URL = os.getenv("SUPABASE_URL")  # None when the variable is not set
+SUPABASE_KEY = os.getenv("SUPABASE_KEY")  # None when the variable is not set
+
+
+SYSTEM_PROMPT = """你是一位專業的語音到文字轉錄校對助理,專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。
+你的任務是:
+1. 確保以下專業術語的準確性:溫室氣體、碳排放、碳管理、碳盤查、碳權交易、碳足跡、淨零排放、碳權。
+2. 在必要時添加適當的標點符號,如句號、逗號
+3. 使用台灣的繁體中文,確保語言表達符合台灣的用語習慣。
+4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。
+5. 不要回答問題、解釋概念或添加任何不在原文中的信息。
+6. 如果原文是一個問句,保持它的問句形式,不要提供答案。
+
+請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""  # System message sent by audio_processing.post_process_transcript on every proofreading request.
+
+CORRECT_TERMS = [  # Canonical terms; text_processing.fuzzy_correct_chinese compares each segmented word against these in list order and uses the first phonetic match.
+    "溫室氣體", "碳排放", "碳管理", "碳盤查", "碳權交易", "碳費",
+    "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "淨零",
+    "氣候變遷法", "氣候", "氣候變遷", "台灣"
+]
+
+ERROR_CORRECTION = {  # Exact-match replacements (mis-transcribed word -> intended word); fuzzy_correct_chinese checks this table before the phonetic comparison.
+    "看拳": "碳權",
+    "看盤插": "碳盤查",
+    "盤插": "盤查",
+    "看": "碳",
+    "看權": "碳權"
+}

+ 25 - 0
src/dictionary_loader.py

@@ -0,0 +1,25 @@
+import io
+import jieba
+from supabase import create_client, Client
+from config import SUPABASE_URL, SUPABASE_KEY
+
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)  # Module-level Supabase client, created when this module is imported, from the values read in config.
+
+
+def load_dictionary_from_supabase():  # Fetch term/weight/type rows from the "word_database" table and load them into jieba as a user dictionary; returns True if rows were loaded, False otherwise.
+    table_name = "word_database"
+    response = supabase.table(table_name).select("term", "weight", "type").execute()
+    
+    if response.data:  # falsy data (for example, no rows) takes the fallback branch below
+        dict_data = io.StringIO()  # in-memory text buffer handed to jieba in place of a dictionary file
+        for item in response.data:
+            dict_data.write(f"{item['term']} {item['weight']} {item['type']}\n")  # one "term weight type" line per row
+        
+        dict_data.seek(0)  # rewind so the buffer is read from its start
+        jieba.load_userdict(dict_data)
+        # print("Loaded dictionary from Supabase")
+        return True
+    else:
+        print(f"No data found or an error occurred: {response.error}")  # NOTE(review): confirm the response object really has an `error` attribute; if it does not, this line raises AttributeError instead of falling back
+        print("Using default dictionary as Supabase data couldn't be fetched.")
+        return False

+ 29 - 0
src/main_script.py

@@ -0,0 +1,29 @@
+from dictionary_loader import load_dictionary_from_supabase
+from audio_processing import process_audio
+
+def initialize():  # Load the jieba user dictionary from Supabase; a failure only prints a warning and execution continues.
+    success = load_dictionary_from_supabase()
+    if not success:
+        print("Warning: Dictionary loading failed. Proceeding with default dictionary.")
+    # Add any other necessary initializations here
+
+def main():  # Initialize, then transcribe and correct the hard-coded sample file and print the corrected text.
+    initialize()
+    
+    # This is where you'd typically set up your API routes
+    # For demonstration, let's just process a sample audio file
+    
+    sample_audio_path = "data/01.m4a"  # relative path, so it is resolved against the current working directory
+    with open(sample_audio_path, "rb") as audio_file:
+        raw_transcript, corrected_transcript = process_audio(audio_file)
+    
+    if raw_transcript and corrected_transcript:  # process_audio returns (None, None) on transcription failure; empty strings also fall through to the else branch
+        # print("Raw Transcript:")
+        # print(raw_transcript)
+        # print("\nCorrected Transcript:")
+        print(corrected_transcript)
+    else:
+        print("Audio processing failed.")
+
+if __name__ == "__main__":  # run main() only when this file is executed as a script
+    main()

+ 66 - 0
src/text_processing.py

@@ -0,0 +1,66 @@
+import jieba
+from pypinyin import pinyin, Style
+from config import CORRECT_TERMS, ERROR_CORRECTION
+
+def chinese_soundex(pinyin_str):  # Reduce a tone-numbered pinyin string to a 4-character code: up to three sound-class symbols, then a tone digit, right-padded with '0'.
+    soundex_map = {  # pinyin letter groups -> sound-class symbol; groups sharing a symbol are treated as the same sound
+        'b': '1', 'p': '1', 'm': '1', 'f': '1',
+        'd': '2', 'n': '2', 'l': '2',
+        'g': '3', 'k': '3', 'h': '3', 't': '3',
+        'j': '4', 'q': '4', 'x': '4',
+        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
+        'z': '6', 'c': '6', 's': '6',
+        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',  # NOTE(review): the scan below only tests 2- and 1-character slices, so 3-letter keys such as 'ang' are never looked up
+        'ong': '9', 'un': '9', 'uan': '9',
+        'i': 'A', 'u': 'A', 'v': 'A',
+        'e': 'B', 'o': 'B',
+    }
+    
+    code = ''
+    tone = '0'  # used when the input contains no digit
+    i = 0
+    while i < len(pinyin_str):
+        if pinyin_str[i:i+2] in soundex_map:  # 2-character slice is tried first so 'zh'/'ch'/'sh' and 2-letter finals win over single letters
+            code += soundex_map[pinyin_str[i:i+2]]
+            i += 2
+        elif pinyin_str[i] in soundex_map:
+            code += soundex_map[pinyin_str[i]]
+            i += 1
+        elif pinyin_str[i].isdigit():  # a digit is taken as a tone number; only the last one seen is kept
+            tone = pinyin_str[i]
+            i += 1
+        else:  # characters with no map entry (for example a lone 'a') are skipped
+            i += 1
+    
+    code = code[:1] + ''.join(sorted(set(code[1:])))  # keep the first symbol; de-duplicate and sort the rest, which discards their original order
+    return (code[:3] + tone).ljust(4, '0')  # at most three symbols plus the tone, padded to length 4
+
+def compare_chinese_words(word1, word2, tone_sensitive=True):  # True when both words yield the same chinese_soundex code; with tone_sensitive=False only the first three characters are compared.
+    pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)])  # first reading of each character, concatenated into one string
+    pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)])
+    
+    soundex1 = chinese_soundex(pinyin1)
+    soundex2 = chinese_soundex(pinyin2)
+    
+    if tone_sensitive:
+        return soundex1 == soundex2  # full 4-character code must match
+    else:
+        return soundex1[:3] == soundex2[:3]  # drop the 4th character before comparing
+
+def fuzzy_correct_chinese(text):  # Segment text with jieba and replace each word via ERROR_CORRECTION or the first phonetically matching CORRECT_TERMS entry; returns the pieces joined back into one string.
+    words = jieba.lcut(text)
+    corrected_words = []
+    for word in words:
+        word_pinyin = ''.join([p[0] for p in pinyin(word, style=Style.NORMAL)])  # NOTE(review): only referenced by the commented-out debug print below; otherwise unused
+        # print(f"Term: {word}, Pinyin: {word_pinyin}")
+        if word in ERROR_CORRECTION:  # exact-match table takes priority over the phonetic comparison
+            corrected_words.append(ERROR_CORRECTION[word])
+        else:
+            for term in CORRECT_TERMS:
+                if compare_chinese_words(word, term, tone_sensitive=True):
+                    # print(f"corrected: {word} -> {term}")
+                    corrected_words.append(term)
+                    break  # first matching term in list order wins
+            else:  # for-else: runs only when the loop finished without a break, i.e. no term matched
+                corrected_words.append(word)
+    return ''.join(corrected_words)