16 Commits 1413ab5d4b ... 9bfb5803ac

Tác giả SHA1 Thông báo Ngày
  SherryLiu 9bfb5803ac 101-asr 英文修正 7 tháng trước cách đây
  steven 1413ab5d4b update requirements.txt 7 tháng trước cách đây
  steven cbfb282505 First push 7 tháng trước cách đây
  SherryLiu c1583d2700 save outputs 8 tháng trước cách đây
  SherryLiu d70258aa45 remove unnecessary file 8 tháng trước cách đây
  SherryLiu d4938c6ef8 remove unnecessary files 8 tháng trước cách đây
  SherryLiu 6ae7ccd08e Rename READNE.md to README.md 8 tháng trước cách đây
  SherryLiu 4171ad03da Merge branch 'main' of http://git.choozmo.com:3000/sherry/ASR 8 tháng trước cách đây
  SherryLiu ce79ae2cd5 rename readme 8 tháng trước cách đây
  SherryLiu 63aacf68c9 update README 8 tháng trước cách đây
  SherryLiu 58cc6cecf6 Initial commit with whisper.py, README.md, and requirements.txt 8 tháng trước cách đây
  SherryLiu 44f4860e9c update readme 8 tháng trước cách đây
  SherryLiu fed1681308 added tone 8 tháng trước cách đây
  SherryLiu 2daf12682d wip 8 tháng trước cách đây
  SherryLiu 50e3407e89 clean up requirements.txt 8 tháng trước cách đây
  SherryLiu 1022f3c5a8 finished 8 tháng trước cách đây

+ 0 - 3
src/.env

@@ -1,3 +0,0 @@
-SUPABASE_URL = "http://139.144.120.184:8000"
-SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyAgCiAgICAicm9sZSI6ICJzZXJ2aWNlX3JvbGUiLAogICAgImlzcyI6ICJzdXBhYmFzZS1kZW1vIiwKICAgICJpYXQiOiAxNjQxNzY5MjAwLAogICAgImV4cCI6IDE3OTk1MzU2MDAKfQ.DaYlNEoUrrEn2Ig7tqibS-PHK5vgusbcbo7X36XVt4Q"
-OPENAI_API_KEY = "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"

BIN
src/__pycache__/audio_processing.cpython-38.pyc


BIN
src/__pycache__/config.cpython-38.pyc


BIN
src/__pycache__/dictionary_loader.cpython-38.pyc


BIN
src/__pycache__/text_processing.cpython-38.pyc


+ 15 - 16
src/audio_processing.py

@@ -2,27 +2,26 @@ from openai import OpenAI
 from config import SYSTEM_PROMPT, OPEN_API_KEY, SUPABASE_KEY, SUPABASE_URL
 from supabase import create_client, Client
 from text_processing import fuzzy_correct_chinese
+import csv
 
 client = OpenAI(api_key=OPEN_API_KEY)
 supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
 
+def load_custom_vocab_from_csv(file_path):
+    custom_vocab = []
+    try:
+        with open(file_path, 'r', encoding='utf-8') as csvfile:
+            reader = csv.DictReader(csvfile)
+            custom_vocab = [row['brand'] for row in reader if 'brand' in row]
+    except Exception as e:
+        print(f"Error reading CSV file: {str(e)}")
+        print("Using empty vocabulary.")
+    return custom_vocab
+
 def transcribe(audio_file):
     try:
-        table_name = "brand_database"
-        response = supabase.table(table_name).select("brand", "category").execute()
-        custom_vocab = []
-        if response.data:
-            for item in response.data:
-                custom_vocab.append({item['brand']})
-        else:
-            print(f"No data found or an error occurred: {response.error}")
-            print("Using default dictionary as Supabase data couldn't be fetched.")
-            custom_vocab = ["FENDI", "BOSS", "BALENCIAGA", "BURBERRY", "CELINE", "COS", 
-                            "COACH", "Dior", "FENDI", "GUCCI", "KENZO", "Louis Vuitton", 
-                            "LV", "MONTBLANC", "POLO", "TORY BURCH", "VERSACE", "BV",  
-                            "BAO BAO ISSEY MIYAKE", "BERLUTI", "BOTTEGA VENETA", "ZEGNA", 
-                            "FERRAGAMO", "LONGCHAMP", "Loro Piana", "maje", "MICHAEL KORS", 
-                            "MONCLER", "PLEATS PLEASE", "SAINT LAURENT"]
+        custom_vocab = load_custom_vocab_from_csv('brand_database_rows.csv')
+        
         transcript = client.audio.transcriptions.create(
             file=audio_file,
             model="whisper-1",
@@ -56,4 +55,4 @@ def process_audio(audio_data):
     if raw_transcript is None:
         return None, None
     corrected_transcript = post_process_transcript(raw_transcript)
-    return raw_transcript, corrected_transcript
+    return raw_transcript, corrected_transcript

+ 161 - 0
src/brand_database_rows.csv

@@ -0,0 +1,161 @@
+id,brand,category
+1,BALENCIAGA,
+2,BOSS,
+3,BURBERRY,
+4,CELINE,
+5,COS,
+6,COACH,
+7,Dior,
+8,FENDI,
+9,GUCCI,
+10,KENZO,
+11,Louis Vuitton,
+12,LV,
+13,MONTBLANC,
+14,POLO,
+15,Tory Burch,
+16,VERSACE,
+17,BAO BAO ISSEY MIYAKE,
+18,Berluti,
+19,BOTTEGA VENETA,
+20,ZEGNA,
+21,FERRAGAMO,
+22,LONGCHAMP,
+23,Loro Piana,
+24,maje,
+25,MICHAEL KORS,
+26,Moncler,
+27,PLEATS PLEASE,
+28,SAINT LAURENT,
+29,A. Lange & Söhne,
+30,BLANCPAIN,
+31,BOUCHERON,
+32,BREGUET,
+33,BREITLING,
+34,BVLGARI,
+35,Cartier,
+36,CHANEL,
+37,CHAUMET,
+38,CHOPARD,
+39,DAMIANI,
+40,DE BEERS,
+41,FRED,
+42,HARRY WINSTON,
+43,Grand Seiko,
+44,HUBLOT,
+45,IWC,
+46,JADEGIA,
+47,玉世家,
+48,JAEGER-LECOULTRE,
+49,LONGINES,
+50,MIKIMOTO,
+51,OMEGA,
+52,歐米茄,
+53,PANERAI,
+54,PATEK PHILIPPE,
+55,PIAGET,
+56,RADO,
+57,ROGER DUBUIS,
+58,Rolex,
+59,勞力士,
+60,Sincere Haute Horlogerie,
+61,TAG Heuer,
+62,Tiffany & Co.,
+63,TISSOT,
+64,TUDOR,
+65,帝舵表,
+66,VACHERON CONSTANTIN,
+67,Van Cleef & Arpels,
+68,PEDRO,
+69,2020EYEhaus,
+70,APM MONACO,
+71,BAO BAO ISSEY MIYAKE,
+72,CHARLES & KEITH,
+73,HOGAN,
+74,KANGOL,
+75,MIRROR,
+76,皇宣緣,
+77,PANDORA,
+78,Redline,
+79,SWAROVSKI,
+80,SWATCH,
+81,The Way Eyewear,
+82,TUMI,
+83,vacanza,
+84,A|X ARMANI EXCHANGE,
+85,adidas SWC,
+86,Benetton,
+87,Brooks Brothers,
+88,CALVIN KLEIN JEANS,
+89,Crocs,
+91,iROO,
+92,LULULEMON,
+93,Massimo Dutti,
+94,MLB,
+95,NB GREY Image Main Store,
+96,NB GREY,
+97,NIKE KICKS LOUNGE,
+98,Onitsuka Tiger,
+99,鬼塚虎,
+100,PEDRO,
+101,PORTER INTERNATIONAL,
+102,ROOTS,
+103,THE NORTH FACE,
+104,THE NORTH FACE Taipei 101 store,
+105,TOMMY HILFIGER,
+106,ZARA,
+107,ABC Cooking Studio,
+108,Apple Taipei 101,
+109,Apple,
+110,Bang & Olufsen,
+111,Devialet,
+112,FamilyMart,
+113,LAMY,
+114,Sony,
+115,Sony 台北 101 直營店,
+116,Taipei 101 Observation Deck,
+117,Taipei Fubon Bank,
+118,World Gym Elite,
+119,台北101觀景台,
+120,台北富邦銀行,
+121,全家便利商店,
+122,Aesop,
+123,Chanel Beauty,
+124,CREED,
+125,Dior Beauty,
+126,JO MALONE LONDON,
+127,SISLEY,
+128,Yves Saint Laurent Beauté,
+129,SwissKubiK,
+130,adidas,
+131,CHANEL WATCH STORE,
+132,寶格麗,
+133,天梭表,
+134,瑞士雷達表,
+135,卡地亞台北旗艦店,
+136,香奈兒腕錶專門店,
+137,百年靈,
+138,浪琴表,
+139,香奈兒,
+140,ACERA,
+141,乾唐軒,
+142,AMBI SPACE ONE,
+143,The one at Taipei 101,
+144,Toyama Xiangtang,
+145,富山香堂,
+146,FushanKodo,
+147,sugarfina,
+148,MK,
+149,YSL,
+150,HW,
+151,GS,
+152,PP,
+153,SHH,
+154,VC,
+155,VCA,
+157,BV,
+156,C&K,
+158,CK,
+90,GIORDANO LADIES,
+159,new balance,
+160,YSL Beauté,

+ 4 - 1
src/config.py

@@ -27,5 +27,8 @@ CORRECT_TERMS = [
 ]
 
 ERROR_CORRECTION = {
-
+    "庫治": "cucci",
+    "Coles": "cos",
+    "粉絲": "versace",
+    "St. Lawrence": "saint laurent"
 }

+ 0 - 27
src/dictionary_loader.py

@@ -1,27 +0,0 @@
-import io
-import jieba
-from supabase import create_client, Client
-from config import SUPABASE_URL, SUPABASE_KEY
-
-supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
-
-
-def load_word_database_dictionary_from_supabase():
-    table_name = "word_database"
-    response = supabase.table(table_name).select("term", "weight", "type").execute()
-    
-    if response.data:
-        dict_data = io.StringIO()
-        for item in response.data:
-            dict_data.write(f"{item['term']} {item['weight']} {item['type']}\n")
-        
-        dict_data.seek(0)
-        jieba.load_userdict(dict_data)
-        # print("Loaded dictionary from Supabase")
-        return True
-    else:
-        print(f"No data found or an error occurred: {response.error}")
-        print("Using default dictionary as Supabase data couldn't be fetched.")
-        return False
-
-