|
@@ -9,7 +9,6 @@ import jieba
|
|
|
load_dotenv('environment.env')
|
|
|
client = OpenAI()
|
|
|
|
|
|
-############ 以語意做調整 ################
|
|
|
system_prompt = """你是一位專業的轉錄校對助理,專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。
|
|
|
你的任務是:
|
|
|
1. 確保以下專業術語的準確性:溫室氣體、碳排放、碳管理、碳盤查、碳權交易、碳足跡、淨零排放、碳權。
|
|
@@ -21,11 +20,6 @@ system_prompt = """你是一位專業的轉錄校對助理,專門處理有關
|
|
|
|
|
|
請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""
|
|
|
|
|
|
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
|
|
|
- encoding = tiktoken.get_encoding(encoding_name)
|
|
|
- num_tokens = len(encoding.encode(string))
|
|
|
- return num_tokens
|
|
|
-
|
|
|
def transcribe(audio_file):
|
|
|
try:
|
|
|
transcript = client.audio.transcriptions.create(
|
|
@@ -37,7 +31,6 @@ def transcribe(audio_file):
|
|
|
except Exception as e:
|
|
|
print(f"轉錄時發生錯誤:{str(e)}")
|
|
|
return None
|
|
|
-
|
|
|
|
|
|
def process_audio_file(file_path):
|
|
|
try:
|
|
@@ -73,9 +66,7 @@ def process_folder(folder_path):
|
|
|
print("\n=== 總結 ===")
|
|
|
print(f"處理的文件數:{processed_files}")
|
|
|
|
|
|
-############ 以諧音做調整 #################
|
|
|
def chinese_soundex(pinyin):
|
|
|
- # 簡化的中文拼音Soundex映射
|
|
|
soundex_map = {
|
|
|
'b': '1', 'p': '1', 'm': '1', 'f': '1',
|
|
|
'd': '2', 't': '2', 'n': '2', 'l': '2',
|
|
@@ -85,43 +76,43 @@ def chinese_soundex(pinyin):
|
|
|
'z': '6', 'c': '6', 's': '6'
|
|
|
}
|
|
|
|
|
|
- code = pinyin[0].upper() # 保留第一個字母
|
|
|
+ code = pinyin[0].upper()
|
|
|
+ tone = '0'
|
|
|
+
|
|
|
for char in pinyin[1:]:
|
|
|
- if char in soundex_map:
|
|
|
+ if char.isdigit():
|
|
|
+ tone = char
|
|
|
+ elif char in soundex_map:
|
|
|
if len(code) == 1 or code[-1] != soundex_map[char]:
|
|
|
code += soundex_map[char]
|
|
|
if len(code) == 4:
|
|
|
break
|
|
|
|
|
|
- return code.ljust(4, '0')
|
|
|
-
|
|
|
+ return (code.ljust(4, '0') + tone)[:5]
|
|
|
|
|
|
def compare_chinese_words(word1, word2):
    """Return True when two words map to the same phonetic code.

    Each word is converted to pinyin with ``Style.TONE3`` and
    ``neutral_tone_with_five=True``, the first reading of every character is
    concatenated, and the resulting string is encoded with
    ``chinese_soundex``. The two codes are then compared for equality.

    Args:
        word1: First word to compare.
        word2: Second word to compare.

    Returns:
        bool: True if both words produce the same ``chinese_soundex`` code.
        When either word is empty the words are compared literally instead.
    """
    # chinese_soundex reads the first character of its argument, so an empty
    # word would raise IndexError there; handle it before any conversion.
    if not word1 or not word2:
        return word1 == word2

    def _phonetic_code(word):
        # pinyin() yields one list of candidate readings per character;
        # only the first candidate of each is used.
        readings = pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        return chinese_soundex(''.join(reading[0] for reading in readings))

    return _phonetic_code(word1) == _phonetic_code(word2)
|
|
|
|
|
|
-# 幾個有修正困難的先hard code
|
|
|
# Hard-coded corrections for mis-transcribed tokens that are difficult to fix
# otherwise. fuzzy_correct_chinese looks each segmented word up here first and
# only falls back to the Soundex comparison when the word is not a key.
error_correction = {
    "看拳": "碳權",
    "看盤插": "碳盤查",
    "盤插": "盤查",
    "看": "碳",
}
|
|
|
|
|
|
def fuzzy_correct_chinese(text, correct_terms):
|
|
|
words = jieba.cut(text)
|
|
|
corrected_words = []
|
|
|
for word in words:
|
|
|
- # 首先檢查是否在錯誤修正字典中
|
|
|
if word in error_correction:
|
|
|
corrected_words.append(error_correction[word])
|
|
|
else:
|
|
|
- # 如果不在錯誤修正字典中,則使用 Soundex 方法
|
|
|
for term in correct_terms:
|
|
|
if compare_chinese_words(word, term):
|
|
|
corrected_words.append(term)
|
|
@@ -130,19 +121,13 @@ def fuzzy_correct_chinese(text, correct_terms):
|
|
|
corrected_words.append(word)
|
|
|
return ' '.join(corrected_words)
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-################ 執行 ######################
|
|
|
def post_process_transcript(transcript, temperature=0):
|
|
|
- # 定義正確的術語列表
|
|
|
- correct_terms = ["碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費"
|
|
|
- , "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "排放", "淨零"
|
|
|
- , "氣候變遷法", "氣候", "氣候變遷", "法"
|
|
|
- , "是什麼", "請解釋", "為什麼", "什麼意思"
|
|
|
- , "台灣"
|
|
|
- ]
|
|
|
+ correct_terms = ["碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費",
|
|
|
+ "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "排放", "淨零",
|
|
|
+ "氣候變遷法", "氣候", "氣候變遷", "法",
|
|
|
+ "是什麼", "請解釋", "為什麼", "什麼意思",
|
|
|
+ "台灣"]
|
|
|
|
|
|
- # 使用 Soundex 進行初步校正
|
|
|
corrected_transcript = fuzzy_correct_chinese(transcript, correct_terms)
|
|
|
|
|
|
messages = [
|
|
@@ -151,14 +136,13 @@ def post_process_transcript(transcript, temperature=0):
|
|
|
]
|
|
|
|
|
|
response = client.chat.completions.create(
|
|
|
- model="gpt-4o", #gpt-4o效果比3.5好很多
|
|
|
+ model="gpt-4o",
|
|
|
temperature=temperature,
|
|
|
messages=messages
|
|
|
)
|
|
|
|
|
|
return response.choices[0].message.content
|
|
|
|
|
|
-
|
|
|
def main():
|
|
|
parser = argparse.ArgumentParser(description="處理音頻文件使用 Whisper")
|
|
|
parser.add_argument("--file", help="要處理的單個音頻文件的路徑")
|