sherry
/
ASR


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
							import os
import argparse
from openai import OpenAI
from dotenv import load_dotenv
import tiktoken
from pypinyin import pinyin, Style
import jieba

# Load settings from the local 'environment.env' file before the client is
# constructed (presumably this supplies the OpenAI credentials — TODO confirm).
load_dotenv('environment.env')
# Module-wide OpenAI client shared by transcribe() and post_process_transcript().
client = OpenAI()

############ Semantic (meaning-based) adjustment ################
# System message (written in Traditional Chinese) sent with every chat request
# in post_process_transcript(). It casts the model as a transcript proofreader
# for conversations about greenhouse gases, carbon emissions and carbon
# management, and tells it to fix terminology/punctuation only, never to answer
# questions or add content. The text is runtime data: do not translate it.
system_prompt = """你是一位專業的轉錄校對助理，專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。
你的任務是：
1. 確保以下專業術語的準確性：溫室氣體、碳排放、碳管理、碳盤查、碳權交易、碳足跡、淨零排放、碳權。
2. 在必要時添加適當的標點符號，如句號、逗號
3. 使用台灣的繁體中文，確保語言表達符合台灣的用語習慣。
4. 只更正明顯的錯誤或改善可讀性，不要改變原文的意思或結構。
5. 不要回答問題、解釋概念或添加任何不在原文中的信息。
6. 如果原文是一個問句，保持它的問句形式，不要提供答案。

請只根據提供的原文進行必要的更正，不要添加或刪除任何實質性內容。在修正時，請特別注意上下文，確保修正後的詞語符合整句話的語境。"""

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens *string* occupies under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence produced by encoding *string*.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

def transcribe(audio_file):
    """Submit an opened audio file to the ``whisper-1`` transcription model.

    Args:
        audio_file: File object passed to the API as the ``file`` argument.

    Returns:
        Whatever the API returns for ``response_format="text"``, or ``None``
        if the call raised (the error is printed rather than propagated).
    """
    request = {
        "file": audio_file,
        "model": "whisper-1",
        "response_format": "text",
    }
    try:
        return client.audio.transcriptions.create(**request)
    except Exception as err:
        # Best-effort: report the failure and let the caller skip this file.
        print(f"轉錄時發生錯誤：{str(err)}")
        return None
    

def process_audio_file(file_path):
    """Transcribe one audio file and print the raw and corrected transcripts.

    The file is opened in binary mode, a warning is printed when it is larger
    than 25 MB, and the raw transcript from ``transcribe`` is then passed to
    ``post_process_transcript``. Nothing is returned; every exception raised
    along the way is caught and reported on stdout.

    Args:
        file_path: Path of the audio file to process.
    """
    bytes_per_mb = 1024 * 1024
    try:
        with open(file_path, "rb") as handle:
            size_mb = os.path.getsize(file_path) / bytes_per_mb
            if size_mb > 25:
                # Only a warning: the request is still attempted.
                print(f"警告：文件 {os.path.basename(file_path)} 大小為 {size_mb:.2f} MB，超過了 25 MB 的限制。可能無法處理。")

            print(f"\n處理文件：{os.path.basename(file_path)}")
            raw_text = transcribe(handle)
            if raw_text is not None:
                print("\n原始轉錄：")
                print(raw_text)

                fixed_text = post_process_transcript(raw_text)
                print("\n修正後的轉錄：")
                print(fixed_text)

    except Exception as err:
        print(f"處理文件 {os.path.basename(file_path)} 時發生錯誤：{str(err)}")

def process_folder(folder_path):
    """Run ``process_audio_file`` on every audio file directly inside a folder.

    A file qualifies when its name ends with ``.mp3``, ``.wav`` or ``.m4a``
    (case-sensitive). After the loop a short summary with the number of files
    handed to ``process_audio_file`` is printed.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
    """
    audio_suffixes = (".mp3", ".wav", ".m4a")
    audio_names = [
        entry for entry in os.listdir(folder_path)
        if entry.endswith(audio_suffixes)
    ]

    for entry in audio_names:
        process_audio_file(os.path.join(folder_path, entry))

    print("\n=== 總結 ===")
    print(f"處理的文件數：{len(audio_names)}")

############ Homophone (sound-alike) adjustment #################
def chinese_soundex(pinyin):
    """Return a 4-character simplified Soundex code for a pinyin string.

    The code is the upper-cased first character followed by up to three
    digits, one per consonant group found in the remainder, right-padded
    with ``'0'``. A digit equal to the one just emitted is not repeated.

    The retroflex initials ``zh``/``ch``/``sh`` are matched as two-letter
    units so they map to group ``'5'``; a character-by-character scan would
    never hit those map entries and would code the ``h`` as ``'3'`` instead.

    Args:
        pinyin: Lower-case, tone-less pinyin (syllables may be concatenated).
            Note: inside this function the name hides any module-level
            ``pinyin`` import; it is kept for keyword-argument compatibility.

    Returns:
        A string of length 4; ``'0000'`` for empty input.
    """
    # Simplified Soundex mapping for Chinese pinyin consonants.
    soundex_map = {
        'b': '1', 'p': '1', 'm': '1', 'f': '1',
        'd': '2', 't': '2', 'n': '2', 'l': '2',
        'g': '3', 'k': '3', 'h': '3',
        'j': '4', 'q': '4', 'x': '4',
        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
        'z': '6', 'c': '6', 's': '6'
    }
    digraphs = ('zh', 'ch', 'sh')

    # Guard: indexing pinyin[0] on empty input would raise IndexError.
    if not pinyin:
        return '0000'

    code = pinyin[0].upper()  # keep the first letter
    # When the string opens with zh/ch/sh, the 'h' belongs to the kept
    # initial and must not be coded separately.
    pos = 2 if pinyin[:2] in digraphs else 1

    while pos < len(pinyin) and len(code) < 4:
        pair = pinyin[pos:pos + 2]
        if pair in digraphs:
            digit = soundex_map[pair]
            pos += 2
        else:
            digit = soundex_map.get(pinyin[pos])  # None for vowels/others
            pos += 1
        if digit is not None and (len(code) == 1 or code[-1] != digit):
            code += digit

    return code.ljust(4, '0')


def compare_chinese_words(word1, word2):
    """Tell whether two words produce the same ``chinese_soundex`` code.

    Each word is converted with ``pinyin(..., style=Style.NORMAL)``; the first
    reading of every character is concatenated and fed to ``chinese_soundex``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.

    Returns:
        ``True`` when both Soundex codes are equal, otherwise ``False``.
    """
    romanized = [
        ''.join(readings[0] for readings in pinyin(word, style=Style.NORMAL))
        for word in (word1, word2)
    ]
    first_code, second_code = (chinese_soundex(text) for text in romanized)
    return first_code == second_code

# Hard-coded fixes for a few mis-transcriptions that are difficult to correct
# otherwise. fuzzy_correct_chinese() looks each segmented word up here first
# and, on a hit, substitutes the mapped value.
error_correction = {
    "看拳": "碳權",
    "看盤插": "碳盤查",
    "盤插": "盤查",
    "看":"碳"
}

def fuzzy_correct_chinese(text, correct_terms):
    """Replace mis-transcribed words in *text* with known-correct terms.

    The text is segmented with ``jieba.cut``. Each word is resolved in order:

    1. If it is a key of ``error_correction``, the mapped value is used.
    2. Otherwise the first entry of *correct_terms* that
       ``compare_chinese_words`` reports as sounding alike is used.
    3. Otherwise the word is kept unchanged.

    Exactly one output word is produced per input word. (Previously the
    fallback sat in the ``else`` of the inner ``if``, so the word was appended
    once for every non-matching term and dropped when *correct_terms* was
    empty.)

    Args:
        text: Transcript text to correct.
        correct_terms: Iterable of reference terms, checked in order.

    Returns:
        The resolved words joined by single spaces.
    """
    corrected_words = []
    for word in jieba.cut(text):
        # Explicit corrections take priority over the phonetic comparison.
        if word in error_correction:
            corrected_words.append(error_correction[word])
            continue

        # First sound-alike term wins; fall back to the word itself.
        replacement = next(
            (term for term in correct_terms if compare_chinese_words(word, term)),
            word,
        )
        corrected_words.append(replacement)
    return ' '.join(corrected_words)


################ Execution ######################
def post_process_transcript(transcript, temperature=0):
    """Clean up a raw transcript phonetically, then have the chat model proofread it.

    Args:
        transcript: Raw transcript text.
        temperature: Sampling temperature forwarded to the chat completion.

    Returns:
        The message content of the first choice in the chat response.
    """
    # Reference vocabulary for the phonetic pass; order matters because the
    # first sound-alike term is the one substituted.
    correct_terms = [
        "碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費",
        "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "排放", "淨零",
        "氣候變遷法", "氣候", "氣候變遷", "法",
        "是什麼", "請解釋", "為什麼", "什麼意思",
        "台灣",
    ]

    # Preliminary correction using the Soundex-style comparison.
    draft = fuzzy_correct_chinese(transcript, correct_terms)
    user_request = f"請校對並修正以下轉錄文本，但不要改變其原意或回答問題：\n\n{draft}"

    completion = client.chat.completions.create(
        model="gpt-4o",  # author's note: gpt-4o performs much better than 3.5
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_request},
        ],
    )
    return completion.choices[0].message.content


def main():
    """Parse the command line and process either one file or a whole folder.

    ``--file`` takes precedence when given; otherwise ``--folder`` (default
    ``data``) is used. A missing path, or an empty ``--folder`` with no
    ``--file``, results in an error message on stdout instead of processing.
    """
    cli = argparse.ArgumentParser(description="處理音頻文件使用 Whisper")
    cli.add_argument("--file", help="要處理的單個音頻文件的路徑")
    cli.add_argument("--folder", default="data", help="包含音頻文件的文件夾路徑（默認：data）")
    options = cli.parse_args()

    # Single-file mode.
    if options.file:
        if not os.path.isfile(options.file):
            print(f"錯誤：文件 '{options.file}' 不存在。")
            return
        process_audio_file(options.file)
        return

    # Neither a file nor a (non-empty) folder was supplied.
    if not options.folder:
        print("錯誤：請指定一個文件（--file）或文件夾（--folder）來處理。")
        return

    # Folder mode.
    if not os.path.isdir(options.folder):
        print(f"錯誤：文件夾 '{options.folder}' 不存在。")
        return
    process_folder(options.folder)

# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()