Explorar el Código

101-asr 英文修正

SherryLiu hace 5 meses
commit
9bfb5803ac
Se han modificado 6 ficheros con 543 adiciones y 0 borrados
  1. 152 0
      requirements.txt
  2. 58 0
      src/audio_processing.py
  3. 161 0
      src/brand_database_rows.csv
  4. 34 0
      src/config.py
  5. 67 0
      src/main_script.py
  6. 71 0
      src/text_processing.py

+ 152 - 0
requirements.txt

@@ -0,0 +1,152 @@
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.5
+aiosignal==1.3.1
+annotated-types==0.7.0
+anyio==4.4.0
+attrs==24.2.0
+audioread==3.0.1
+babel==2.16.0
+bce-python-sdk==0.9.19
+blinker==1.8.2
+Bottleneck==1.4.0
+certifi==2024.7.4
+cffi==1.17.0
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+coloredlogs==15.0.1
+colorlog==6.8.2
+contourpy==1.3.0
+cycler==0.12.1
+Cython==3.0.11
+datasets==2.21.0
+decorator==5.1.1
+dill==0.3.4
+Distance==0.1.3
+distro==1.9.0
+editdistance==0.8.1
+fastapi==0.112.2
+filelock==3.15.4
+Flask==3.0.3
+flask-babel==4.0.0
+flatbuffers==24.3.25
+fonttools==4.53.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+future==1.0.0
+fuzzywuzzy==0.18.0
+g2p-en==2.1.0
+g2pM==0.1.2.5
+h11==0.14.0
+h5py==3.11.0
+httpcore==1.0.5
+httpx==0.27.2
+huggingface-hub==0.24.6
+humanfriendly==10.0
+idna==3.8
+inflect==7.3.1
+itsdangerous==2.2.0
+jieba==0.42.1
+Jinja2==3.1.4
+jiter==0.5.0
+joblib==1.4.2
+jsonlines==4.0.0
+kaldiio==2.18.0
+kiwisolver==1.4.5
+Levenshtein==0.25.1
+librosa==0.8.1
+llvmlite==0.43.0
+loguru==0.7.2
+lxml==5.3.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.9.2
+mdurl==0.1.2
+mock==5.1.0
+more-itertools==10.4.0
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.12.2
+nara-wpe==0.0.10
+nltk==3.9.1
+numba==0.60.0
+numpy==2.0.2
+onnx==1.16.2
+onnxruntime==1.19.0
+openai==1.42.0
+packaging==24.1
+paddle2onnx==0.8.1
+paddleaudio==1.0.2
+paddlefsl==1.1.0
+paddlenlp==2.6.1
+paddlespeech==1.0.1
+paddlespeech-feat==0.1.0
+pandas==2.2.2
+pathos==0.2.8
+pattern_singleton==1.2.0
+pillow==10.4.0
+pip-autoremove==0.10.0
+platformdirs==4.2.2
+pooch==1.8.2
+portalocker==2.10.1
+pox==0.3.4
+ppft==1.7.6.8
+praatio==5.0.0
+prettytable==3.11.0
+protobuf==3.20.2
+psutil==6.0.0
+pyarrow==17.0.0
+pycparser==2.22
+pycryptodome==3.20.0
+pydantic==2.8.2
+pydantic_core==2.20.1
+Pygments==2.18.0
+pyparsing==3.1.4
+pypinyin==0.52.0
+pypinyin-dict==0.8.0
+pytest-runner==6.0.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-Levenshtein==0.25.1
+pytz==2024.1
+pyworld==0.3.4
+PyYAML==6.0.2
+rapidfuzz==3.9.6
+rarfile==4.2
+regex==2024.7.24
+requests==2.32.3
+resampy==0.2.2
+rich==13.8.0
+sacrebleu==2.4.3
+safetensors==0.4.4
+scikit-learn==1.5.1
+scipy==1.14.1
+sentencepiece==0.1.99
+seqeval==1.2.2
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+starlette==0.38.2
+# supabase is imported by src/audio_processing.py but was missing from this list.
+# Left unpinned: pin it once the version used in the working environment is confirmed.
+supabase
+sympy==1.13.2
+tabulate==0.9.0
+TextGrid==1.6.1
+threadpoolctl==3.5.0
+tiktoken==0.7.0
+timer==0.3.0
+tqdm==4.66.5
+typeguard==4.3.0
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+uvicorn==0.30.6
+visualdl==2.5.3
+wcwidth==0.2.13
+webrtcvad==2.0.10
+websockets==13.0.1
+Werkzeug==3.0.4
+xxhash==3.5.0
+yacs==0.1.8
+yarl==1.9.4
+zhon==2.0.2

+ 58 - 0
src/audio_processing.py

@@ -0,0 +1,58 @@
+from openai import OpenAI
+from config import SYSTEM_PROMPT, OPEN_API_KEY, SUPABASE_KEY, SUPABASE_URL
+from supabase import create_client, Client
+from text_processing import fuzzy_correct_chinese
+import csv
+
+client = OpenAI(api_key=OPEN_API_KEY)  # shared OpenAI client used by transcribe() and post_process_transcript()
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)  # Supabase client; not referenced by the functions visible in this file
+
+def load_custom_vocab_from_csv(file_path):
+    custom_vocab = []
+    try:
+        with open(file_path, 'r', encoding='utf-8') as csvfile:
+            reader = csv.DictReader(csvfile)
+            custom_vocab = [row['brand'] for row in reader if 'brand' in row]
+    except Exception as e:
+        print(f"Error reading CSV file: {str(e)}")
+        print("Using empty vocabulary.")
+    return custom_vocab
+
+def transcribe(audio_file):
+    try:
+        custom_vocab = load_custom_vocab_from_csv('brand_database_rows.csv')
+        
+        transcript = client.audio.transcriptions.create(
+            file=audio_file,
+            model="whisper-1",
+            response_format="text", 
+            prompt=f"請注意以下詞彙:{custom_vocab}"
+        )
+        return transcript
+    except Exception as e:
+        print(f"轉錄時發生錯誤:{str(e)}")
+        return None
+
+def post_process_transcript(transcript, temperature=0):
+    corrected_transcript = fuzzy_correct_chinese(transcript)
+    
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": f"請校對並修正以下轉錄文本,但不要改變其原意或回答問題,也不要更動英文的大小寫:\n\n{corrected_transcript}"}
+    ]
+
+    response = client.chat.completions.create(
+        model="gpt-4",
+        temperature=temperature,
+        messages=messages
+    )
+
+    return response.choices[0].message.content
+
+def process_audio(audio_data):
+    raw_transcript = transcribe(audio_data)
+    print(raw_transcript)
+    if raw_transcript is None:
+        return None, None
+    corrected_transcript = post_process_transcript(raw_transcript)
+    return raw_transcript, corrected_transcript

+ 161 - 0
src/brand_database_rows.csv

@@ -0,0 +1,161 @@
+id,brand,category
+1,BALENCIAGA,
+2,BOSS,
+3,BURBERRY,
+4,CELINE,
+5,COS,
+6,COACH,
+7,Dior,
+8,FENDI,
+9,GUCCI,
+10,KENZO,
+11,Louis Vuitton,
+12,LV,
+13,MONTBLANC,
+14,POLO,
+15,Tory Burch,
+16,VERSACE,
+17,BAO BAO ISSEY MIYAKE,
+18,Berluti,
+19,BOTTEGA VENETA,
+20,ZEGNA,
+21,FERRAGAMO,
+22,LONGCHAMP,
+23,Loro Piana,
+24,maje,
+25,MICHAEL KORS,
+26,Moncler,
+27,PLEATS PLEASE,
+28,SAINT LAURENT,
+29,A. Lange & Söhne,
+30,BLANCPAIN,
+31,BOUCHERON,
+32,BREGUET,
+33,BREITLING,
+34,BVLGARI,
+35,Cartier,
+36,CHANEL,
+37,CHAUMET,
+38,CHOPARD,
+39,DAMIANI,
+40,DE BEERS,
+41,FRED,
+42,HARRY WINSTON,
+43,Grand Seiko,
+44,HUBLOT,
+45,IWC,
+46,JADEGIA,
+47,玉世家,
+48,JAEGER-LECOULTRE,
+49,LONGINES,
+50,MIKIMOTO,
+51,OMEGA,
+52,歐米茄,
+53,PANERAI,
+54,PATEK PHILIPPE,
+55,PIAGET,
+56,RADO,
+57,ROGER DUBUIS,
+58,Rolex,
+59,勞力士,
+60,Sincere Haute Horlogerie,
+61,TAG Heuer,
+62,Tiffany & Co.,
+63,TISSOT,
+64,TUDOR,
+65,帝舵表,
+66,VACHERON CONSTANTIN,
+67,Van Cleef & Arpels,
+68,PEDRO,
+69,2020EYEhaus,
+70,APM MONACO,
+71,BAO BAO ISSEY MIYAKE,
+72,CHARLES & KEITH,
+73,HOGAN,
+74,KANGOL,
+75,MIRROR,
+76,皇宣緣,
+77,PANDORA,
+78,Redline,
+79,SWAROVSKI,
+80,SWATCH,
+81,The Way Eyewear,
+82,TUMI,
+83,vacanza,
+84,A|X ARMANI EXCHANGE,
+85,adidas SWC,
+86,Benetton,
+87,Brooks Brothers,
+88,CALVIN KLEIN JEANS,
+89,Crocs,
+91,iROO,
+92,LULULEMON,
+93,Massimo Dutti,
+94,MLB,
+95,NB GREY Image Main Store,
+96,NB GREY,
+97,NIKE KICKS LOUNGE,
+98,Onitsuka Tiger,
+99,鬼塚虎,
+100,PEDRO,
+101,PORTER INTERNATIONAL,
+102,ROOTS,
+103,THE NORTH FACE,
+104,THE NORTH FACE Taipei 101 store,
+105,TOMMY HILFIGER,
+106,ZARA,
+107,ABC Cooking Studio,
+108,Apple Taipei 101,
+109,Apple,
+110,Bang & Olufsen,
+111,Devialet,
+112,FamilyMart,
+113,LAMY,
+114,Sony,
+115,Sony 台北 101 直營店,
+116,Taipei 101 Observation Deck,
+117,Taipei Fubon Bank,
+118,World Gym Elite,
+119,台北101觀景台,
+120,台北富邦銀行,
+121,全家便利商店,
+122,Aesop,
+123,Chanel Beauty,
+124,CREED,
+125,Dior Beauty,
+126,JO MALONE LONDON,
+127,SISLEY,
+128,Yves Saint Laurent Beauté,
+129,SwissKubiK,
+130,adidas,
+131,CHANEL WATCH STORE,
+132,寶格麗,
+133,天梭表,
+134,瑞士雷達表,
+135,卡地亞台北旗艦店,
+136,香奈兒腕錶專門店,
+137,百年靈,
+138,浪琴表,
+139,香奈兒,
+140,ACERA,
+141,乾唐軒,
+142,AMBI SPACE ONE,
+143,The one at Taipei 101,
+144,Toyama Xiangtang,
+145,富山香堂,
+146,FushanKodo,
+147,sugarfina,
+148,MK,
+149,YSL,
+150,HW,
+151,GS,
+152,PP,
+153,SHH,
+154,VC,
+155,VCA,
+157,BV,
+156,C&K,
+158,CK,
+90,GIORDANO LADIES,
+159,new balance,
+160,YSL Beauté,

+ 34 - 0
src/config.py

@@ -0,0 +1,34 @@
+import os
+from dotenv import load_dotenv
+
+# Load variables from a local .env file (if present) into the process environment.
+load_dotenv()
+
+# Credentials are read from the environment; os.environ.get returns None when a variable is unset.
+SUPABASE_URL: str =  os.environ.get('SUPABASE_URL')
+SUPABASE_KEY: str = os.environ.get('SUPABASE_KEY')
+# NOTE(review): the Python name is OPEN_API_KEY while the environment variable is OPENAI_API_KEY.
+OPEN_API_KEY: str = os.environ.get('OPENAI_API_KEY')
+
+# Fail at import time when either Supabase setting is missing or empty.
+# NOTE(review): OPEN_API_KEY is not validated here - confirm whether that is intentional.
+if not SUPABASE_URL or not SUPABASE_KEY:
+    raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set in the .env file")
+
+
+# System message (Traditional Chinese) for the transcript-proofreading chat request.
+# It tells the model to keep brand names and letter case intact, add punctuation where
+# needed, use Taiwanese wording, fix only obvious errors, and never answer questions
+# or add content that is not in the original text.
+SYSTEM_PROMPT = """你是一位專業的轉錄校對助理,專門處理有關品牌的對話轉錄。
+你的任務是:
+1. 確保中英文品牌的正確性,大小寫不要更動。
+2. 在必要時添加適當的標點符號,如句號、逗號
+3. 使用台灣的繁體中文或英文,確保語言表達符合台灣的用語習慣。
+4. 只更正明顯的錯誤或改善可讀性,不要改變原文的意思或結構。
+5. 不要回答問題、解釋概念或添加任何不在原文中的信息。
+6. 如果原文是一個問句,保持它的問句形式,不要提供答案。
+
+請只根據提供的原文進行必要的更正,不要添加或刪除任何實質性內容。在修正時,請特別注意上下文,確保修正後的詞語符合整句話的語境。"""
+
+CORRECT_TERMS = [
+    "品牌", "101", "一零一"
+]
+
+ERROR_CORRECTION = {
+    "庫治": "cucci",
+    "Coles": "cos",
+    "粉絲": "versace",
+    "St. Lawrence": "saint laurent"
+}

+ 67 - 0
src/main_script.py

@@ -0,0 +1,67 @@
+import sys
+from dictionary_loader import load_word_database_dictionary_from_supabase
+from audio_processing import process_audio
+
+def initialize():
+    word_database_success = load_word_database_dictionary_from_supabase()
+    if not word_database_success:
+        print("Warning: Word Database Dictionary loading failed. Proceeding with default dictionary.")
+
+
+def process_audio_file(audio_file):
+    try:
+        result = process_audio(audio_file)
+        if isinstance(result, tuple) and len(result) == 2:
+            return result
+        else:
+            print("Unexpected result from process_audio")
+            return None, None
+    except Exception as e:
+        print(f"Error processing audio: {str(e)}")
+        return None, None
+    
+# 加入檢查user是否詢問特定問題
+def main(audio_file_path):
+    initialize()
+    try:
+        with open(audio_file_path, "rb") as audio_file:
+            raw_transcript, corrected_transcript = process_audio_file(audio_file)
+        
+        if raw_transcript and corrected_transcript:
+            print(f"Raw transcript: {raw_transcript}")
+            print(f"Corrected transcript: {corrected_transcript}")
+            
+        else:
+            print("Audio processing failed.")
+    except FileNotFoundError:
+        print(f"Error: The file '{audio_file_path}' was not found.")
+    except Exception as e:
+        print(f"An unexpected error occurred: {str(e)}")
+        import traceback
+        traceback.print_exc()
+
+
+## Original main
+# def main(audio_file_path):
+#     initialize()
+#     try:
+#         with open(audio_file_path, "rb") as audio_file:
+#             result = process_audio_file(audio_file)
+        
+#         if result:
+#             print(result)
+#         else:
+#             print("Audio processing failed.")
+#     except FileNotFoundError:
+#         print(f"Error: The file '{audio_file_path}' was not found.")
+#     except Exception as e:
+#         print(f"An unexpected error occurred: {str(e)}")
+
+
+# Script entry point: expects exactly one command-line argument, the audio file path.
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        # NOTE(review): "script_name.py" is a literal placeholder in the usage text.
+        print("Usage: python script_name.py <audio_file_path>")
+        sys.exit(1)
+    
+    audio_file_path = sys.argv[1]
+    main(audio_file_path)

+ 71 - 0
src/text_processing.py

@@ -0,0 +1,71 @@
+import jieba
+from pypinyin import pinyin, Style
+from config import CORRECT_TERMS, ERROR_CORRECTION
+
+def chinese_soundex(pinyin_str):
+    soundex_map = {
+        'b': '1', 'p': '1', 'm': '1', 'f': '1',
+        'd': '2', 'n': '2', 'l': '2',
+        'g': '3', 'k': '3', 'h': '3', 't': '3',
+        'j': '4', 'q': '4', 'x': '4',
+        'zh': '5', 'ch': '5', 'sh': '5', 'r': '5',
+        'z': '6', 'c': '6', 's': '6',
+        'an': '7', 'ang': '7', 'en': '8', 'eng': '8', 'in': '8', 'ing': '8',
+        'ong': '9', 'un': '9', 'uan': '9',
+        'i': 'A', 'u': 'A', 'v': 'A',
+        'e': 'B', 'o': 'B',
+    }
+    
+    code = ''
+    tone = '0'
+    i = 0
+    while i < len(pinyin_str):
+        if pinyin_str[i:i+2] in soundex_map:
+            code += soundex_map[pinyin_str[i:i+2]]
+            i += 2
+        elif pinyin_str[i] in soundex_map:
+            code += soundex_map[pinyin_str[i]]
+            i += 1
+        elif pinyin_str[i].isdigit():
+            tone = pinyin_str[i]
+            i += 1
+        else:
+            i += 1
+    
+    code = code[:1] + ''.join(sorted(set(code[1:])))
+    return (code[:3] + tone).ljust(4, '0')
+
+def compare_chinese_words(word1, word2, tone_sensitive=True):
+    pinyin1 = ''.join([p[0] for p in pinyin(word1, style=Style.TONE3, neutral_tone_with_five=True)])
+    pinyin2 = ''.join([p[0] for p in pinyin(word2, style=Style.TONE3, neutral_tone_with_five=True)])
+    
+    soundex1 = chinese_soundex(pinyin1)
+    # print(soundex1)
+    soundex2 = chinese_soundex(pinyin2)
+    # print('soundex2', soundex2)
+
+    if tone_sensitive:
+        return soundex1 == soundex2
+    else:
+        return soundex1[:3] == soundex2[:3]
+
+def fuzzy_correct_chinese(text):
+    words = jieba.lcut(text)
+    corrected_words = []
+    for word in words:
+        if word.isalpha():
+            corrected_words.append(word)
+            continue
+        word_pinyin = ''.join([p[0] for p in pinyin(word, style=Style.NORMAL)])
+        # print(f"Term: {word}, Pinyin: {word_pinyin}")
+        if word in ERROR_CORRECTION:
+            corrected_words.append(ERROR_CORRECTION[word])
+        else:
+            for term in CORRECT_TERMS:
+                if compare_chinese_words(word, term, tone_sensitive=True):
+                    # print(f"corrected: {word} -> {term}")
+                    corrected_words.append(term)
+                    break
+            else:
+                corrected_words.append(word)
+    return ''.join(corrected_words)