whisper.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. import os
  2. import argparse
  3. from openai import OpenAI
  4. from dotenv import load_dotenv
  5. import jieba
  6. from datetime import datetime
  7. from pypinyin import pinyin, Style
# Load variables from the local 'environment.env' file before the OpenAI
# client below is constructed (presumably this is where the API key lives --
# confirm against the deployment setup).
load_dotenv('environment.env')
# Module-level OpenAI client shared by transcribe() and
# post_process_transcript().
client = OpenAI()
# System prompt (Traditional Chinese) sent as the "system" message by
# post_process_transcript(). It casts the model as a transcript proofreader
# for conversations about greenhouse gases and carbon management, and tells
# it to: keep the listed domain terms accurate, add punctuation where needed,
# use Taiwanese Traditional Chinese, fix only obvious errors, never answer or
# explain anything, and keep questions as questions.
system_prompt = """你是一位專業的轉錄校對助理,專門處理有關溫室氣體、碳排放和碳管理的對話轉錄。
你的任務是:
1. 確保以下專業術語的準確性:溫室氣體、碳排放、碳管理、碳盤查、碳權交易、碳足跡、淨零排放、碳權。
2. 在必要時添加適當的標點符號,如句號、逗號
3. 使用台灣的繁體中文,確保語言表達符合台灣的用語習慣。
4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。
5. 不要回答問題、解釋概念或添加任何不在原文中的信息。
6. 如果原文是一個問句,保持它的問句形式,不要提供答案。
請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""
  19. def transcribe(audio_file):
  20. try:
  21. transcript = client.audio.transcriptions.create(
  22. file=audio_file,
  23. model="whisper-1",
  24. response_format="text"
  25. )
  26. return transcript
  27. except Exception as e:
  28. print(f"轉錄時發生錯誤:{str(e)}")
  29. return None
  30. def save_output(file_name, raw_transcript, corrected_transcript):
  31. output_dir = "output"
  32. os.makedirs(output_dir, exist_ok=True)
  33. output_file = os.path.join(output_dir, "transcription_results.txt")
  34. with open(output_file, "a", encoding="utf-8") as f:
  35. f.write(f"\n{'='*50}\n")
  36. f.write(f"文件名: {file_name}\n")
  37. f.write(f"處理時間: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
  38. f.write("原始轉錄:\n")
  39. f.write(f"{raw_transcript}\n\n")
  40. f.write("修正後的轉錄:\n")
  41. f.write(f"{corrected_transcript}\n")
  42. def process_audio_file(file_path):
  43. try:
  44. with open(file_path, "rb") as audio_file:
  45. file_size = os.path.getsize(file_path) / (1024 * 1024) # 轉換為 MB
  46. if file_size > 25:
  47. print(f"警告:文件 {os.path.basename(file_path)} 大小為 {file_size:.2f} MB,超過了 25 MB 的限制。可能無法處理。")
  48. print(f"\n處理文件:{os.path.basename(file_path)}")
  49. raw_transcript = transcribe(audio_file)
  50. if raw_transcript is None:
  51. return
  52. print("\n原始轉錄:")
  53. print(raw_transcript)
  54. corrected_transcript = post_process_transcript(raw_transcript)
  55. print("\n修正後的轉錄:")
  56. print(corrected_transcript)
  57. save_output(os.path.basename(file_path), raw_transcript, corrected_transcript)
  58. except Exception as e:
  59. print(f"處理文件 {os.path.basename(file_path)} 時發生錯誤:{str(e)}")
  60. def process_folder(folder_path):
  61. processed_files = 0
  62. for filename in os.listdir(folder_path):
  63. if filename.lower().endswith((".mp3", ".wav", ".m4a")):
  64. file_path = os.path.join(folder_path, filename)
  65. process_audio_file(file_path)
  66. processed_files += 1
  67. print("\n=== 總結 ===")
  68. print(f"處理的文件數:{processed_files}")
  69. def chinese_soundex(pinyin_str):
  70. soundex_map = {
  71. 'b': '1', 'p': '1', 'm': '1', 'f': '1',
  72. 'd': '2', 'n': '2', 'l': '2',
  73. 'g': '3', 'k': '3', 'h': '3', 't': '3',
  74. 'j': '4', 'q': '4', 'x': '4',
  75. 'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
  76. 'z': '6', 'c': '6', 's': '6',
  77. 'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
  78. 'ong': '9', 'un': '9', 'uan': '9',
  79. 'i': 'A', 'u': 'A', 'v': 'A', # 'v' is used for 'ü' in some systems
  80. 'e': 'B', 'o': 'B',
  81. }
  82. code = ''
  83. tone = '0'
  84. i = 0
  85. while i < len(pinyin_str):
  86. if pinyin_str[i:i+2] in soundex_map:
  87. code += soundex_map[pinyin_str[i:i+2]]
  88. i += 2
  89. elif pinyin_str[i] in soundex_map:
  90. code += soundex_map[pinyin_str[i]]
  91. i += 1
  92. elif pinyin_str[i].isdigit():
  93. tone = pinyin_str[i]
  94. i += 1
  95. else:
  96. i += 1
  97. code = code[:1] + ''.join(sorted(set(code[1:])))
  98. return (code[:3] + tone).ljust(4, '0')
  99. def compare_chinese_words(word1, word2, tone_sensitive=True):
  100. pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)])
  101. pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)])
  102. soundex1 = chinese_soundex(pinyin1)
  103. soundex2 = chinese_soundex(pinyin2)
  104. if tone_sensitive:
  105. return soundex1 == soundex2
  106. else:
  107. return soundex1[:3] == soundex2[:3]
# Exact-match replacements: fuzzy_correct_chinese() looks up every segmented
# token here first and, on a hit, uses the mapped value instead of running the
# phonetic comparison. Keys are presumably recurring mis-transcriptions of
# carbon-related terms -- confirm with real transcripts.
# NOTE(review): the single-character key "看" rewrites every stand-alone "看"
# token to "碳"; verify this is intended.
error_correction = {
"看拳": "碳權",
"看盤插": "碳盤查",
"盤插": "盤查",
"看": "碳",
"看權": "碳權"
}
  115. def set_jieba_dictionary(dict_path):
  116. if os.path.exists(dict_path):
  117. jieba.set_dictionary(dict_path)
  118. jieba.initialize()
  119. print(f"Set jieba dictionary to {dict_path}")
  120. else:
  121. print(f"Dictionary file {dict_path} not found. Using default dictionary.")
  122. def fuzzy_correct_chinese(text, correct_terms):
  123. words = jieba.lcut(text)
  124. corrected_words = []
  125. for word in words:
  126. word_pinyin = ''.join([p[0] for p in pinyin(word, style=Style.NORMAL)])
  127. print(f"Term: {word}, Pinyin: {word_pinyin}") # Added print statement
  128. if word in error_correction:
  129. corrected_words.append(error_correction[word])
  130. else:
  131. for term in correct_terms:
  132. if compare_chinese_words(word, term, tone_sensitive=True):
  133. print(f"corrected: {word} -> {term}")
  134. corrected_words.append(term)
  135. break
  136. else:
  137. corrected_words.append(word)
  138. return ''.join(corrected_words)
  139. def post_process_transcript(transcript, temperature=0):
  140. correct_terms = ["溫室氣體", "碳排放", "碳管理", "碳盤查", "碳權交易", "碳費",
  141. "碳權", "碳足跡", "足跡", "淨零排放", "零排放", "淨零",
  142. "氣候變遷法", "氣候", "氣候變遷",
  143. "台灣"]
  144. corrected_transcript = fuzzy_correct_chinese(transcript, correct_terms)
  145. messages = [
  146. {"role": "system", "content": system_prompt},
  147. {"role": "user", "content": f"請校對並修正以下轉錄文本,但不要改變其原意或回答問題:\n\n{corrected_transcript}"}
  148. ]
  149. response = client.chat.completions.create(
  150. model="gpt-4",
  151. temperature=temperature,
  152. messages=messages
  153. )
  154. return response.choices[0].message.content
  155. def main():
  156. parser = argparse.ArgumentParser(description="處理音頻文件使用 Whisper")
  157. parser.add_argument("--file", help="要處理的單個音頻文件的路徑")
  158. parser.add_argument("--folder", default="data", help="包含音頻文件的文件夾路徑(默認:data)")
  159. parser.add_argument("--dict", default="dictionary/dictionary_reviewed_rows_weighted.txt")
  160. args = parser.parse_args()
  161. set_jieba_dictionary(args.dict)
  162. if args.file:
  163. if os.path.isfile(args.file):
  164. process_audio_file(args.file)
  165. else:
  166. print(f"錯誤:文件 '{args.file}' 不存在。")
  167. elif args.folder:
  168. if os.path.isdir(args.folder):
  169. process_folder(args.folder)
  170. else:
  171. print(f"錯誤:文件夾 '{args.folder}' 不存在。")
  172. else:
  173. print("錯誤:請指定一個文件(--file)或文件夾(--folder)來處理。")
  174. if __name__ == "__main__":
  175. main()