Ver código fonte

8/26 cache 票券

Mia 4 meses atrás
pai
commit
b12d3d811c
6 arquivos alterados com 169 adições e 70 exclusões
  1. 5 1
      .gitignore
  2. 0 1
      101_restart.sh
  3. BIN
      api/__pycache__/db_router.cpython-312.pyc
  4. 106 55
      api/db_router.py
  5. 1 0
      requirements.txt
  6. 57 13
      sherry/semantic_search.py

+ 5 - 1
.gitignore

@@ -4,4 +4,8 @@ sherry/chroma_db/
 sherry/__pycache__/
 chroma_db/
 token.pickle
-log/
+log/
+chroma_db_en/
+chroma_db_ch/
+chroma_db_jp/
+.env

+ 0 - 1
101_restart.sh

@@ -2,6 +2,5 @@ tmux new -d -s  101
 tmux new-window -t 101:0 -d
 tmux send-keys -t 101:0 "source 101evn/bin/activate" Enter
 tmux send-keys -t 101:0 "cd /home/mia/101" Enter
-tmux send-keys -t 101:0 "export OPENAI_API_KEY=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" Enter
 tmux send-keys -t 101:0 "sudo python3 main.py" Enter
 

BIN
api/__pycache__/db_router.cpython-312.pyc


+ 106 - 55
api/db_router.py

@@ -376,28 +376,8 @@ def insert_table(data: dataform):
 
         return {"state": 500 , "message" : str(e)}
     
-# @dbRouter.post("/video_save_into_cache")
-# def message_not_in_cache(video_name : Annotated[str, Field(description="檔案請丟進/home/mia/101/static/video_cache/others/資料夾裡")],client_message_id :str  = None,question:str = None):
-#     try:
-#         data = []
-#         if client_message_id :
-#             data, count = supabase.table('client_message').select('*').eq("id",client_message_id).execute()
-#         elif question:
-#             data, count = supabase.table('client_message').select('*').eq("question",question).execute()
-
-#         info = data[1][0]
-
-#         response = supabase.table('video_cache').insert({"question": info["question"],"answer":info["answer"],"video_url":f"/static/video_cache/others/{video_name}"}).execute()
-        
-#         response = supabase.table('client_message').delete().eq('id', info["id"]).execute()
-        
-#         return {"state": 200 , "message" : "success"}
     
-#     except Exception as e:
-
-#         return {"state": 500 , "message" : str(e)}
-    
-from sherry.semantic_search import ask_question
+from sherry.semantic_search import ask_question,ask_question_find_brand
     
 @dbRouter.post("/video_cache")
 def video_cache(client_message :str,language:str ="ch"):
@@ -413,6 +393,8 @@ def video_cache(client_message :str,language:str ="ch"):
 
         result = ask_question(client_message,language=language)
 
+        data = search_date(client_message,language=language)
+
         # result[0]["answer"]
 
         if result == None :
@@ -420,55 +402,124 @@ def video_cache(client_message :str,language:str ="ch"):
 
         # data, count = supabase.table("log_record").insert({"question":client_message, "answer":result[0]["answer"]}).execute()
         
-        return {"state": 200 , "message" : result }
+        return {"state": 200 , "message" : result ,"data":data}
     
     except Exception as e:
 
         return {"state": 500 , "message" : str(e)}
 
 
-# from openai import OpenAI
-# import json
-
-# client = OpenAI(
-#     # This is the default and can be omitted
-#     api_key=os.environ.get("OPENAI_API_KEY"),
-# )
-
-# def access_openai(prompt_value):
-#     chat_completion = client.chat.completions.create(
-#         messages=[
-#             {
-#                 "role": "user",
-#                 "content": f"請將以下的內容翻譯為文:\n\n {prompt_value}",
-#             }
-#         ],
-#         model="gpt-3.5-turbo",
-#     )
+from openai import OpenAI
+import json
+
+# Module-level OpenAI client shared by the helpers below; the API key is read
+# from the OPENAI_API_KEY environment variable.
+client = OpenAI(
+    # This is the default and can be omitted
+    api_key=os.environ.get("OPENAI_API_KEY"),
+)
+
+def access_openai(prompt_value):
+    """Send *prompt_value* to the chat completions API and return the reply text.
+
+    A single user message is sent to the "gpt-3.5-turbo" model. The message is
+    a fixed Chinese instruction asking for a translation, followed by
+    *prompt_value*.
+
+    NOTE(review): the instruction text reads "翻譯為文" and does not name a
+    target language -- confirm which language was intended.
+
+    Returns the `content` of the first choice in the response.
+    """
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": f"請將以下的內容翻譯為文:\n\n {prompt_value}",
+            }
+        ],
+        model="gpt-3.5-turbo",
+    )
     
 
-#     return chat_completion.choices[0].message.content
+    return chat_completion.choices[0].message.content
 
 
 
-# @dbRouter.post("/translate")
-# def translate():
-#     try:
-#         response = supabase.table('video_cache').select('*').eq('language', 'ch').execute()
+@dbRouter.post("/translate")
+def translate():
+    """Translate every Chinese `video_cache` row and store the result.
+
+    Selects all `video_cache` rows whose `language` is 'ch', passes each row's
+    question and answer through `access_openai` (two API calls per row, made
+    sequentially), and inserts the translated pair into `client_message` with
+    client_id "0" and language "ko".
+
+    NOTE(review): the translated rows are written to `client_message`, not
+    back to `video_cache` -- confirm this is the intended destination.
+
+    Returns {"state": 200} on success, or {"state": 500, "message": <error>}
+    if any step raises.
+    """
+    try:
+        response = supabase.table('video_cache').select('*').eq('language', 'ch').execute()
 
-#         datas = response.data
+        datas = response.data
 
-#         for data in datas :
-#             translated_question = access_openai(data['question'])
-#             translated_answer = access_openai(data['answer'])
+        for data in datas :
+            translated_question = access_openai(data['question'])
+            translated_answer = access_openai(data['answer'])
 
-#             print(data['question'])
-#             print(translated_question)
+            # Debug output: original question followed by its translation.
+            print(data['question'])
+            print(translated_question)
 
-#             insert = supabase.table('client_message').insert({"client_id":"0", "question":translated_question,"answer":translated_answer,"language":"en"}).execute()
+            insert = supabase.table('client_message').insert({"client_id":"0", "question":translated_question,"answer":translated_answer,"language":"ko"}).execute()
 
-#         return {"state": 200 }
+        return {"state": 200 }
     
-#     except Exception as e:
+    except Exception as e:
+
+        # Any failure (database or OpenAI) is reported in the response body
+        # rather than raised.
+        return {"state": 500 , "message" : str(e)}
+
+import spacy
+import jieba
 
-#         return {"state": 500 , "message" : str(e)}
+# Tag text stored in the `tags` column to mark language-specific rows, keyed
+# by the language code the API accepts.
+# NOTE(review): the ch/en entries come from the original exclusion lists; the
+# jp -> "日文" and ko -> "韓文" pairing is assumed -- confirm against the data.
+_LANGUAGE_TAGS = {"ch": "中文", "en": "英文", "jp": "日文", "ko": "韓文"}
+
+# spaCy model per language code, plus a cache so each model is loaded once per
+# process instead of on every request.
+_SPACY_MODEL_NAMES = {"ch": "zh_core_web_sm", "en": "en_core_web_sm"}
+_spacy_pipelines = {}
+
+# Characters that are structural inside a PostgREST `or` filter or act as
+# wildcards in an `ilike` pattern; tokens containing them are dropped so user
+# input cannot alter the shape of the filter.
+_FILTER_RESERVED_CHARS = frozenset(',()"\\%*')
+
+
+def _extract_keywords(question):
+    """Tokenise *question* with jieba; keep unique, filter-safe tokens longer than one character."""
+    keywords = []
+    for token in jieba.lcut(question or ""):
+        token = token.strip()
+        if len(token) <= 1 or token in keywords:
+            continue
+        if _FILTER_RESERVED_CHARS.intersection(token):
+            continue
+        keywords.append(token)
+    return keywords
+
+
+def _tag_results(records):
+    """Wrap each row as {"type": <row's "type" field>, "info": <row>}."""
+    return [{"type": record.get("type"), "info": record} for record in records]
+
+
+@dbRouter.post("/search_date")
+def search_date(question:str,language:str="ch"):
+    """Search the `101_ticket` and `101_brand` tables by keywords in *question*.
+
+    The question is tokenised with jieba; every token longer than one
+    character becomes a case-insensitive substring match on the `tags`
+    column, and the matches are OR-ed together.
+
+    * `101_brand` rows are additionally restricted to `language == language`.
+    * `101_ticket` rows whose tags mention another language's tag text are
+      excluded. For a language code with no known tag text, nothing is
+      excluded (previously this case always failed with a 500 result).
+
+    Returns a list of {"type": ..., "info": row} dicts, ticket rows first and
+    brand rows after; an empty list when the question yields no usable
+    keyword. On any exception returns {"state": 500, "message": <error>}.
+    """
+    try:
+        # Kept for backward compatibility: the original assigned the loaded
+        # pipeline to the module-level `nlp`. It is not used in this function.
+        # NOTE(review): drop this block if nothing else reads `nlp`.
+        global nlp
+        model_name = _SPACY_MODEL_NAMES.get(language)
+        if model_name is not None:
+            if model_name not in _spacy_pipelines:
+                _spacy_pipelines[model_name] = spacy.load(model_name)
+            nlp = _spacy_pipelines[model_name]
+
+        # Exclude every known language tag except the requested language's own.
+        if language in _LANGUAGE_TAGS:
+            exclude_languages = [tag for code, tag in _LANGUAGE_TAGS.items() if code != language]
+        else:
+            exclude_languages = []
+
+        keywords = _extract_keywords(question)
+        print(keywords)
+
+        # An empty OR filter is not a valid query; there is nothing to match.
+        if not keywords:
+            return []
+
+        keywords_condition = ",".join(f"tags.ilike.%{keyword}%" for keyword in keywords)
+
+        # Brand rows are already scoped by the `language` column, so no
+        # tag-based language exclusion is applied to them.
+        brand_results = (
+            supabase.from_("101_brand")
+            .select("*")
+            .eq("language", language)
+            .or_(keywords_condition)
+            .execute()
+        )
+
+        # Ticket rows have no language filter here, so other languages are
+        # excluded through their tag text instead.
+        ticket_query = supabase.from_("101_ticket").select("*").or_(keywords_condition)
+        for lang in exclude_languages:
+            ticket_query = ticket_query.not_.ilike("tags", f"%{lang}%")
+        ticket_results = ticket_query.execute()
+
+        return _tag_results(ticket_results.data) + _tag_results(brand_results.data)
+
+    except Exception as e:
+
+        return {"state": 500 , "message" : str(e)}

+ 1 - 0
requirements.txt

@@ -132,3 +132,4 @@ websockets==12.0
 whisper==1.1.10
 yarl==1.9.4
 zope.interface==6.4
+jieba

+ 57 - 13
sherry/semantic_search.py

@@ -18,6 +18,10 @@ supabase_url = os.getenv("SUPABASE_URL")
 supabase_key = os.getenv("SUPABASE_KEY")
 supabase: Client = create_client(supabase_url, supabase_key)
 
+from apscheduler.schedulers.background import BackgroundScheduler
+from typing import AsyncIterator
+scheduler = BackgroundScheduler(timezone="Asia/Taipei")
+
 ############# Load data #############
 # def extract_field(doc, field_name):
 #     for line in doc.page_content.split('\n'):
@@ -33,13 +37,18 @@ supabase: Client = create_client(supabase_url, supabase_key)
 # ####### load data from supabase #######
 # embeddings_model = OpenAIEmbeddings()
 
+vectorstore_list ={}
+question_id_map_list = {}
+
 def generated(language:str ="ch"):
-    global response,count,question,ids,question_id_map,vectorstore
+    global response,count,ids
     response,count = supabase.table("video_cache").select("question","id").eq("language",language).order("id").execute()
     data = response[1]
     question = [item['question'] for item in data if 'question' in item]
+    #print(question)
     ids = [item['id'] for item in data if 'id' in item]
     question_id_map = {item['question']: item['id'] for item in data if 'id' in item and 'question' in item}
+    question_id_map_list[language] = question_id_map
 
     ########## generate embedding ###########
     embedding = embeddings_model.embed_documents(question)
@@ -50,18 +59,32 @@ def generated(language:str ="ch"):
 
     ######### Vector Store ##########
     # Put pre-compute embeddings to vector store. ## save to disk
+    persist_directory = f"./chroma_db_{language}"
+
     vectorstore = Chroma.from_texts(
-        texts=question,
+        texts=question_id_map_list[language],
         embedding=embeddings_model,
-        persist_directory="./chroma_db"
+        persist_directory=persist_directory
         )
 
-    vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)
+    #vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)
+    vectorstore_list[language] = vectorstore
+
+    print(f"gernerate {language}")
+    #print(question_id_map_list)
+
+# Build the question/id map and vector store for each supported language
+# once, at import time, so lookups have data before the first scheduled run.
+generated("ch")
+generated("en")
+generated("jp")
 
-    print("gernerate")
+# Re-run `generated` for each language on a cron trigger with hour='*/2',
+# using the background scheduler created above, to pick up new rows.
+scheduler.add_job(generated, 'cron' ,hour='*/2',kwargs={"language" : "ch"})
+scheduler.add_job(generated, 'cron' ,hour='*/2',kwargs={"language" : "en"})
+scheduler.add_job(generated, 'cron' ,hour='*/2',kwargs={"language" : "jp"})
 
-def get_id_by_question(question):
-    return question_id_map.get(question)
+scheduler.start()
+
+def get_id_by_question(question,language):
+    """Return the id mapped to *question* for *language*, or None if the question is not mapped.
+
+    Raises KeyError when no map has been stored for *language*.
+    """
+    id_by_question = question_id_map_list[language]
+    return id_by_question.get(question)
 
 # print(question)
 # created_at = []
@@ -79,13 +102,16 @@ def get_id_by_question(question):
 
 
 def ask_question(question:str, SIMILARITY_THRESHOLD:int = 0.83,language:str ="ch"):
-    generated(language=language)
+    # generated(language=language)
+    print(language)
+    vectorstore = vectorstore_list[language]
+    print(vectorstore)
     docs_and_scores = vectorstore.similarity_search_with_relevance_scores(question, k=1)
     doc, score = docs_and_scores[0]
     print(doc,score)
     
     if score >= SIMILARITY_THRESHOLD:
-        id = get_id_by_question(doc.page_content)
+        id = get_id_by_question(doc.page_content,language)
         data,count = supabase.table("video_cache").select("*").eq("id",id).execute()
 
         if data[1][0]["answer"] == None :
@@ -94,16 +120,34 @@ def ask_question(question:str, SIMILARITY_THRESHOLD:int = 0.83,language:str ="ch
         return data[1]
     else:
         return None
+    
+def ask_question_find_brand(question:str):
+    """Ask an OpenAI chat model to extract search keywords from *question*.
+
+    Sends the question to the "gpt-4" model with an instruction to extract
+    keywords for a database search, then splits the reply on commas.
+
+    Uses the 1.x client interface (`OpenAI().chat.completions.create`), the
+    same one `api/db_router.py` uses; the legacy `openai.ChatCompletion`
+    entry point is not available in that version of the library.
+
+    Returns a list of non-empty, whitespace-stripped keyword strings; the
+    list is empty when the model returns no text.
+    """
+    # Imported locally so this function does not depend on how the module
+    # imports the openai package at the top of the file.
+    from openai import OpenAI
+
+    # With no arguments the client reads OPENAI_API_KEY from the environment.
+    response = OpenAI().chat.completions.create(
+        model="gpt-4",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": f"Extract keywords from the following text for a database search: {question}"}
+        ],
+        max_tokens=50,
+        temperature=0.5,
+    )
+
+    # `content` may be None; split on bare commas so "a,b" and "a, b" both work.
+    content = response.choices[0].message.content or ""
+    keywords = [part.strip() for part in content.split(",") if part.strip()]
+
+    return keywords
 
 
 if __name__ == "__main__" :
+    # NOTE: the similarity_search smoke checks below are disabled. This
+    # module no longer keeps a single global `vectorstore`; stores are kept
+    # per language in `vectorstore_list`.
     ####### load from disk  #######
     query = "美食街在哪裡"
-    docs = vectorstore.similarity_search(query)
-    print(f"Query: {query}  | 最接近文檔:{docs[0].page_content}")
+    #docs = vectorstore.similarity_search(query)
+    #print(f"Query: {query}  | 最接近文檔:{docs[0].page_content}")
 
     ####### Query it #########
     query = "101可以帶狗嗎"
-    docs = vectorstore.similarity_search(query)
-    print(f"Query: {query}  | 最接近文檔:{docs[0].page_content}")
+    #docs = vectorstore.similarity_search(query)
+    #print(f"Query: {query}  | 最接近文檔:{docs[0].page_content}")