from apscheduler.schedulers.background import BackgroundScheduler
from typing import AsyncIterator

# Supabase connection settings come from the environment. Fail fast with a
# clear message instead of letting create_client() raise something opaque
# when a variable is missing.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")
if not supabase_url or not supabase_key:
    raise RuntimeError("SUPABASE_URL and SUPABASE_KEY must be set in the environment")

supabase: Client = create_client(supabase_url, supabase_key)

# Background scheduler (Taipei time) used further down to refresh the
# per-language vector stores on a fixed cron schedule.
scheduler = BackgroundScheduler(timezone="Asia/Taipei")
|
|
############# Load data #############
# Legacy loading helpers kept for reference:
# def extract_field(doc, field_name):
#     for line in doc.page_content.split('\n'):
# ####### load data from supabase #######
# embeddings_model = OpenAIEmbeddings()

# Per-language caches, populated by generated() and read by ask_question():
#   vectorstore_list      language code -> Chroma vector store
#   question_id_map_list  language code -> {question text: row id}
vectorstore_list = {}
question_id_map_list = {}
|
|
def generated(language:str ="ch"):
|
|
def generated(language:str ="ch"):
|
|
- global response,count,question,ids,question_id_map,vectorstore
|
|
|
|
|
|
+ global response,count,ids
|
|
response,count = supabase.table("video_cache").select("question","id").eq("language",language).order("id").execute()
|
|
response,count = supabase.table("video_cache").select("question","id").eq("language",language).order("id").execute()
|
|
data = response[1]
|
|
data = response[1]
|
|
question = [item['question'] for item in data if 'question' in item]
|
|
question = [item['question'] for item in data if 'question' in item]
|
|
|
|
+ #print(question)
|
|
ids = [item['id'] for item in data if 'id' in item]
|
|
ids = [item['id'] for item in data if 'id' in item]
|
|
question_id_map = {item['question']: item['id'] for item in data if 'id' in item and 'question' in item}
|
|
question_id_map = {item['question']: item['id'] for item in data if 'id' in item and 'question' in item}
|
|
|
|
+ question_id_map_list[language] = question_id_map
|
|
|
|
|
|
########## generate embedding ###########
|
|
########## generate embedding ###########
|
|
embedding = embeddings_model.embed_documents(question)
|
|
embedding = embeddings_model.embed_documents(question)
|
|
@@ -50,18 +59,32 @@ def generated(language:str ="ch"):
|
|
|
|
|
|
######### Vector Store ##########
|
|
######### Vector Store ##########
|
|
# Put pre-compute embeddings to vector store. ## save to disk
|
|
# Put pre-compute embeddings to vector store. ## save to disk
|
|
|
|
+ persist_directory = f"./chroma_db_{language}"
|
|
|
|
+
|
|
vectorstore = Chroma.from_texts(
|
|
vectorstore = Chroma.from_texts(
|
|
- texts=question,
|
|
|
|
|
|
+ texts=question_id_map_list[language],
|
|
embedding=embeddings_model,
|
|
embedding=embeddings_model,
|
|
- persist_directory="./chroma_db"
|
|
|
|
|
|
+ persist_directory=persist_directory
|
|
)
|
|
)
|
|
|
|
|
|
- vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)
|
|
|
|
|
|
+ #vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)
|
|
|
|
+ vectorstore_list[language] = vectorstore
|
|
|
|
+
|
|
|
|
+ print(f"gernerate {language}")
|
|
|
|
+ #print(question_id_map_list)
|
|
|
|
+
|
|
|
|
# Languages served by the cache. Build each language's vector store once at
# import time, then register a cron job to rebuild it every two hours.
# (The scheduler is not started until after all jobs are registered, so
# interleaving the initial build with job registration is safe.)
for _lang in ("ch", "en", "jp"):
    generated(_lang)
    scheduler.add_job(generated, 'cron', hour='*/2', kwargs={"language": _lang})

scheduler.start()
|
|
|
|
def get_id_by_question(question,language):
    """Return the cached DB row id for an exact *question* string in *language*.

    Returns None when the question is not present in that language's map.
    Raises KeyError if *language* was never loaded by generated() —
    NOTE(review): callers appear to pass only pre-loaded languages; confirm.
    """
    return question_id_map_list[language].get(question)
|
|
|
|
|
|
# print(question)
|
|
# print(question)
|
|
# created_at = []
|
|
# created_at = []
|
|
@@ -79,13 +102,16 @@ def get_id_by_question(question):
|
|
|
|
|
|
|
|
|
|
def ask_question(question:str, SIMILARITY_THRESHOLD:int = 0.83,language:str ="ch"):
|
|
def ask_question(question:str, SIMILARITY_THRESHOLD:int = 0.83,language:str ="ch"):
|
|
- generated(language=language)
|
|
|
|
|
|
+ # generated(language=language)
|
|
|
|
+ print(language)
|
|
|
|
+ vectorstore = vectorstore_list[language]
|
|
|
|
+ print(vectorstore)
|
|
docs_and_scores = vectorstore.similarity_search_with_relevance_scores(question, k=1)
|
|
docs_and_scores = vectorstore.similarity_search_with_relevance_scores(question, k=1)
|
|
doc, score = docs_and_scores[0]
|
|
doc, score = docs_and_scores[0]
|
|
print(doc,score)
|
|
print(doc,score)
|
|
|
|
|
|
if score >= SIMILARITY_THRESHOLD:
|
|
if score >= SIMILARITY_THRESHOLD:
|
|
- id = get_id_by_question(doc.page_content)
|
|
|
|
|
|
+ id = get_id_by_question(doc.page_content,language)
|
|
data,count = supabase.table("video_cache").select("*").eq("id",id).execute()
|
|
data,count = supabase.table("video_cache").select("*").eq("id",id).execute()
|
|
|
|
|
|
if data[1][0]["answer"] == None :
|
|
if data[1][0]["answer"] == None :
|
|
@@ -94,16 +120,34 @@ def ask_question(question:str, SIMILARITY_THRESHOLD:int = 0.83,language:str ="ch
|
|
return data[1]
|
|
return data[1]
|
|
else:
|
|
else:
|
|
return None
|
|
return None
|
|
|
|
+
|
|
|
|
def ask_question_find_brand(question:str):
    """Extract search keywords from a free-form *question* via the OpenAI chat API.

    Returns a list of keyword strings suitable for a database search.
    NOTE(review): uses the legacy ``openai.ChatCompletion`` interface
    (openai < 1.0) — confirm the pinned openai version before upgrading.
    """
    # Ask the chat model to pull comma-separated keywords out of the question.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Extract keywords from the following text for a database search: {question}"}
        ],
        max_tokens=50,
        temperature=0.5,
    )

    # Split on bare commas and strip whitespace so "a, b,c" yields three
    # clean keywords; the previous split(", ") silently kept "b,c" as one
    # token, and empty fragments are dropped.
    content = response.choices[0].message['content']
    return [kw.strip() for kw in content.split(",") if kw.strip()]
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__" :
    ####### load from disk #######
    # Manual smoke-test input; the similarity lookup is intentionally disabled.
    query = "美食街在哪裡"
    #docs = vectorstore.similarity_search(query)
    #print(f"Query: {query} | 最接近文檔:{docs[0].page_content}")

    ####### Query it #########
    # Second smoke-test input, also with the lookup left disabled.
    query = "101可以帶狗嗎"
    #docs = vectorstore.similarity_search(query)
    #print(f"Query: {query} | 最接近文檔:{docs[0].page_content}")