hai 1 ano · fed1681308
--- a/MYNOTE.md
+++ b/MYNOTE.md
@@ -1,17 +1,22 @@
 
				 ## Whisper Official doc
			
 
				 https://platform.openai.com/docs/guides/speech-to-text/prompting
			
 
				 
			
 
				+## 問題
			
 
				+語音轉文字，諧音字詞判斷錯誤
			
 
				 
			
 
				-0. 資料預處理
			
 
				-- jieba斷詞、建立中文語音數字編碼、拆解聲母韻母、諧音會有同樣的編碼
			
 
				+## 方法邏輯
			
 
				 
			
 
				 1. 用語意校正
			
 
				 - prompt
			
 
				-- gpt-4o
			
 
				+- gpt-4o (後處理)
			
 
				 
			
 
				 2. 用語音校正
			
 
				-以簡單的近似聲母韻母編碼，來計算相似度
			
 
				+- 發音越相近，編碼越相近。計算相似度距離
			
 
				 
			
 
				-3. 其他處理
			
 
				-自定義字典（尤其是專有名詞）， 以及hard code校正幾個比較困難的詞
			
 
				+3. 其他後處理
			
 
				+- 在jieba斷詞後以自定義字典抓出專有名詞。
			
 
				+- hard code校正幾個比較困難的詞
			
 
				 
			
 
				+## 嘗試
			
 
				+- 加上自定義字典（從knowledge graph提取出來的詞）做斷詞：沒有比較好，發現過度矯正的問題。output.txt 為jieba斷詞字典的結果，蠻正確的
			
 
				+- 加上聲調：有改進
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,23 +1,152 @@
 
				+aiohappyeyeballs==2.4.0
			
 
				+aiohttp==3.10.5
			
 
				+aiosignal==1.3.1
			
 
				 annotated-types==0.7.0
			
 
				 anyio==4.4.0
			
 
				+attrs==24.2.0
			
 
				+audioread==3.0.1
			
 
				+babel==2.16.0
			
 
				+bce-python-sdk==0.9.19
			
 
				+blinker==1.8.2
			
 
				+Bottleneck==1.4.0
			
 
				 certifi==2024.7.4
			
 
				+cffi==1.17.0
			
 
				 charset-normalizer==3.3.2
			
 
				+click==8.1.7
			
 
				+colorama==0.4.6
			
 
				+coloredlogs==15.0.1
			
 
				+colorlog==6.8.2
			
 
				+contourpy==1.3.0
			
 
				+cycler==0.12.1
			
 
				+Cython==3.0.11
			
 
				+datasets==2.21.0
			
 
				+decorator==5.1.1
			
 
				+dill==0.3.4
			
 
				+Distance==0.1.3
			
 
				 distro==1.9.0
			
 
				+editdistance==0.8.1
			
 
				+fastapi==0.112.2
			
 
				+filelock==3.15.4
			
 
				+Flask==3.0.3
			
 
				+flask-babel==4.0.0
			
 
				+flatbuffers==24.3.25
			
 
				+fonttools==4.53.1
			
 
				+frozenlist==1.4.1
			
 
				+fsspec==2024.6.1
			
 
				+future==1.0.0
			
 
				+fuzzywuzzy==0.18.0
			
 
				+g2p-en==2.1.0
			
 
				+g2pM==0.1.2.5
			
 
				 h11==0.14.0
			
 
				+h5py==3.11.0
			
 
				 httpcore==1.0.5
			
 
				 httpx==0.27.2
			
 
				+huggingface-hub==0.24.6
			
 
				+humanfriendly==10.0
			
 
				 idna==3.8
			
 
				+inflect==7.3.1
			
 
				+itsdangerous==2.2.0
			
 
				 jieba==0.42.1
			
 
				+Jinja2==3.1.4
			
 
				 jiter==0.5.0
			
 
				+joblib==1.4.2
			
 
				+jsonlines==4.0.0
			
 
				+kaldiio==2.18.0
			
 
				+kiwisolver==1.4.5
			
 
				+Levenshtein==0.25.1
			
 
				+librosa==0.8.1
			
 
				+llvmlite==0.43.0
			
 
				+loguru==0.7.2
			
 
				+lxml==5.3.0
			
 
				+markdown-it-py==3.0.0
			
 
				+MarkupSafe==2.1.5
			
 
				+matplotlib==3.9.2
			
 
				+mdurl==0.1.2
			
 
				+mock==5.1.0
			
 
				+more-itertools==10.4.0
			
 
				+mpmath==1.3.0
			
 
				+multidict==6.0.5
			
 
				+multiprocess==0.70.12.2
			
 
				+nara-wpe==0.0.10
			
 
				+nltk==3.9.1
			
 
				+numba==0.60.0
			
 
				+numpy==2.0.2
			
 
				+onnx==1.16.2
			
 
				+onnxruntime==1.19.0
			
 
				 openai==1.42.0
			
 
				+packaging==24.1
			
 
				+paddle2onnx==0.8.1
			
 
				+paddleaudio==1.0.2
			
 
				+paddlefsl==1.1.0
			
 
				+paddlenlp==2.6.1
			
 
				+paddlespeech==1.0.1
			
 
				+paddlespeech-feat==0.1.0
			
 
				+pandas==2.2.2
			
 
				+pathos==0.2.8
			
 
				+pattern_singleton==1.2.0
			
 
				+pillow==10.4.0
			
 
				+pip-autoremove==0.10.0
			
 
				+platformdirs==4.2.2
			
 
				+pooch==1.8.2
			
 
				+portalocker==2.10.1
			
 
				+pox==0.3.4
			
 
				+ppft==1.7.6.8
			
 
				+praatio==5.0.0
			
 
				+prettytable==3.11.0
			
 
				+protobuf==3.20.2
			
 
				+psutil==6.0.0
			
 
				+pyarrow==17.0.0
			
 
				+pycparser==2.22
			
 
				+pycryptodome==3.20.0
			
 
				 pydantic==2.8.2
			
 
				 pydantic_core==2.20.1
			
 
				+Pygments==2.18.0
			
 
				+pyparsing==3.1.4
			
 
				 pypinyin==0.52.0
			
 
				+pypinyin-dict==0.8.0
			
 
				+pytest-runner==6.0.1
			
 
				+python-dateutil==2.9.0.post0
			
 
				 python-dotenv==1.0.1
			
 
				+python-Levenshtein==0.25.1
			
 
				+pytz==2024.1
			
 
				+pyworld==0.3.4
			
 
				+PyYAML==6.0.2
			
 
				+rapidfuzz==3.9.6
			
 
				+rarfile==4.2
			
 
				 regex==2024.7.24
			
 
				 requests==2.32.3
			
 
				+resampy==0.2.2
			
 
				+rich==13.8.0
			
 
				+sacrebleu==2.4.3
			
 
				+safetensors==0.4.4
			
 
				+scikit-learn==1.5.1
			
 
				+scipy==1.14.1
			
 
				+sentencepiece==0.1.99
			
 
				+seqeval==1.2.2
			
 
				+shellingham==1.5.4
			
 
				+six==1.16.0
			
 
				 sniffio==1.3.1
			
 
				+soundfile==0.12.1
			
 
				+starlette==0.38.2
			
 
				+sympy==1.13.2
			
 
				+tabulate==0.9.0
			
 
				+TextGrid==1.6.1
			
 
				+threadpoolctl==3.5.0
			
 
				 tiktoken==0.7.0
			
 
				+timer==0.3.0
			
 
				 tqdm==4.66.5
			
 
				+typeguard==4.3.0
			
 
				+typer==0.12.5
			
 
				 typing_extensions==4.12.2
			
 
				+tzdata==2024.1
			
 
				 urllib3==2.2.2
			
 
				+uvicorn==0.30.6
			
 
				+visualdl==2.5.3
			
 
				+wcwidth==0.2.13
			
 
				+webrtcvad==2.0.10
			
 
				+websockets==13.0.1
			
 
				+Werkzeug==3.0.4
			
 
				+xxhash==3.5.0
			
 
				+yacs==0.1.8
			
 
				+yarl==1.9.4
			
 
				+zhon==2.0.2
			
--- a/whisper.py
+++ b/whisper.py
@@ -9,7 +9,6 @@ import jieba
 
				 load_dotenv('environment.env')
			
 
				 client = OpenAI()
			
 
				 
			
 
				-############ 以語意做調整 ################
			
 
				 system_prompt = """你是一位專業的轉錄校對助理，專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。
			
 
				 你的任務是：
			
 
				 1. 確保以下專業術語的準確性：溫室氣體、碳排放、碳管理、碳盤查、碳權交易、碳足跡、淨零排放、碳權。
			
@@ -21,11 +20,6 @@ system_prompt = """你是一位專業的轉錄校對助理，專門處理有關
 
				 
			
 
				 請只根據提供的原文進行必要的更正，不要添加或刪除任何實質性內容。在修正時，請特別注意上下文，確保修正後的詞語符合整句話的語境。"""
			
 
				 
			
 
				-def num_tokens_from_string(string: str, encoding_name: str) -> int:
			
 
				-    encoding = tiktoken.get_encoding(encoding_name)
			
 
				-    num_tokens = len(encoding.encode(string))
			
 
				-    return num_tokens
			
 
				-
			
 
				 def transcribe(audio_file):
			
 
				     try:
			
 
				         transcript = client.audio.transcriptions.create(
			
@@ -37,7 +31,6 @@ def transcribe(audio_file):
 
				     except Exception as e:
			
 
				         print(f"轉錄時發生錯誤：{str(e)}")
			
 
				         return None
			
 
				-    
			
 
				 
			
 
				 def process_audio_file(file_path):
			
 
				     try:
			
@@ -73,9 +66,7 @@ def process_folder(folder_path):
 
				     print("\n=== 總結 ===")
			
 
				     print(f"處理的文件數：{processed_files}")
			
 
				 
			
 
				-############ 以諧音做調整 #################
			
 
				 def chinese_soundex(pinyin):
			
 
				-    # 簡化的中文拼音Soundex映射
			
 
				     soundex_map = {
			
 
				         'b': '1', 'p': '1', 'm': '1', 'f': '1',
			
 
				         'd': '2', 't': '2', 'n': '2', 'l': '2',
			
@@ -85,43 +76,43 @@ def chinese_soundex(pinyin):
 
				         'z': '6', 'c': '6', 's': '6'
			
 
				     }
			
 
				     
			
 
				-    code = pinyin[0].upper()  # 保留第一個字母
			
 
				+    code = pinyin[0].upper()
			
 
				+    tone = '0'
			
 
				+    
			
 
				     for char in pinyin[1:]:
			
 
				-        if char in soundex_map:
			
 
				+        if char.isdigit():
			
 
				+            tone = char
			
 
				+        elif char in soundex_map:
			
 
				             if len(code) == 1 or code[-1] != soundex_map[char]:
			
 
				                 code += soundex_map[char]
			
 
				         if len(code) == 4:
			
 
				             break
			
 
				     
			
 
				-    return code.ljust(4, '0')
			
 
				-
			
 
				+    return (code.ljust(4, '0') + tone)[:5]
			
 
				 
			
 
				 def compare_chinese_words(word1, word2):
			
 
				-    pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE2, neutral_tone_with_five=True)])
			
 
				-    pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE2, neutral_tone_with_five=True)])
			
 
				+    pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)])
			
 
				+    pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)])
			
 
				     
			
 
				     soundex1 = chinese_soundex(pinyin1)
			
 
				     soundex2 = chinese_soundex(pinyin2)
			
 
				     
			
 
				     return soundex1 == soundex2
			
 
				 
			
 
				-# 幾個有修正困難的先hard code
			
 
				 error_correction = {
			
 
				     "看拳": "碳權",
			
 
				     "看盤插": "碳盤查",
			
 
				     "盤插": "盤查",
			
 
				-    "看":"碳"
			
 
				+    "看": "碳"
			
 
				 }
			
 
				 
			
 
				 def fuzzy_correct_chinese(text, correct_terms):
			
 
				     words = jieba.cut(text)
			
 
				     corrected_words = []
			
 
				     for word in words:
			
 
				-        # 首先檢查是否在錯誤修正字典中
			
 
				         if word in error_correction:
			
 
				             corrected_words.append(error_correction[word])
			
 
				         else:
			
 
				-            # 如果不在錯誤修正字典中，則使用 Soundex 方法
			
 
				             for term in correct_terms:
			
 
				                 if compare_chinese_words(word, term):
			
 
				                     corrected_words.append(term)
			
@@ -130,19 +121,13 @@ def fuzzy_correct_chinese(text, correct_terms):
 
				                 corrected_words.append(word)
			
 
				     return ' '.join(corrected_words)
			
 
				 
			
 
				-
			
 
				-
			
 
				-################ 執行 ######################
			
 
				 def post_process_transcript(transcript, temperature=0):
			
 
				-    # 定義正確的術語列表
			
 
				-    correct_terms = ["碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費"
			
 
				-                     , "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "排放", "淨零"
			
 
				-                     , "氣候變遷法", "氣候", "氣候變遷", "法"
			
 
				-                     , "是什麼", "請解釋", "為什麼", "什麼意思"
			
 
				-                     , "台灣"
			
 
				-                     ]
			
 
				+    correct_terms = ["碳", "溫室氣體", "碳排放", "排放", "碳管理", "管理", "碳盤查", "盤查", "碳權交易", "碳費",
			
 
				+                     "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "排放", "淨零",
			
 
				+                     "氣候變遷法", "氣候", "氣候變遷", "法",
			
 
				+                     "是什麼", "請解釋", "為什麼", "什麼意思",
			
 
				+                     "台灣"]
			
 
				     
			
 
				-    # 使用 Soundex 進行初步校正
			
 
				     corrected_transcript = fuzzy_correct_chinese(transcript, correct_terms)
			
 
				     
			
 
				     messages = [
			
@@ -151,14 +136,13 @@ def post_process_transcript(transcript, temperature=0):
 
				     ]
			
 
				 
			
 
				     response = client.chat.completions.create(
			
 
				-        model="gpt-4o",   #gpt-4o效果比3.5好很多
			
 
				+        model="gpt-4o",
			
 
				         temperature=temperature,
			
 
				         messages=messages
			
 
				     )
			
 
				 
			
 
				     return response.choices[0].message.content
			
 
				 
			
 
				-
			
 
				 def main():
			
 
				     parser = argparse.ArgumentParser(description="處理音頻文件使用 Whisper")
			
 
				     parser.add_argument("--file", help="要處理的單個音頻文件的路徑")