### Python >= 3.9
"""Question-answer lookup over a Supabase-backed video cache.

Loads cached (question, id) rows per language from the ``video_cache``
table, embeds the question texts with OpenAI embeddings into per-language
Chroma vector stores (persisted under ``./chroma_db_<lang>``), and answers
incoming questions by nearest-neighbour search.  Stores are rebuilt on a
cron schedule every 2 hours.
"""

import os

import openai
from apscheduler.schedulers.background import BackgroundScheduler
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from supabase import Client, create_client

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

embeddings_model = OpenAIEmbeddings()

supabase: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))

scheduler = BackgroundScheduler(timezone="Asia/Taipei")

# Per-language caches populated by generated():
#   vectorstore_list:     language -> Chroma vector store
#   question_id_map_list: language -> {question text: video_cache row id}
vectorstore_list = {}
question_id_map_list = {}


def generated(language: str = "ch") -> None:
    """(Re)build the vector store and question->id map for one language.

    Fetches all (question, id) rows for *language* from ``video_cache``,
    records the question->id mapping, and embeds the unique question
    texts into a Chroma store persisted on disk.
    """
    response, count = (
        supabase.table("video_cache")
        .select("question", "id")
        .eq("language", language)
        .order("id")
        .execute()
    )
    rows = response[1]  # ('data', [...]) tuple from supabase-py v1 — index 1 is the row list
    question_id_map = {
        row["question"]: row["id"]
        for row in rows
        if "question" in row and "id" in row
    }
    question_id_map_list[language] = question_id_map

    # Chroma.from_texts embeds the texts itself, so the separate
    # embeddings_model.embed_documents() call the original made was
    # redundant (it paid for every embedding twice) and is removed.
    vectorstore_list[language] = Chroma.from_texts(
        texts=list(question_id_map),  # unique question texts (dict preserves order)
        embedding=embeddings_model,
        persist_directory=f"./chroma_db_{language}",  # save to disk
    )
    print(f"generate {language}")


# Build every language store once at startup, then refresh each on a
# 2-hourly cron schedule.
for _language in ("ch", "en", "jp", "ko"):
    generated(_language)
    scheduler.add_job(generated, "cron", hour="*/2", kwargs={"language": _language})
scheduler.start()


def get_id_by_question(question, language):
    """Return the video_cache row id for an exact question text, or None."""
    return question_id_map_list[language].get(question)


def ask_question(question: str, SIMILARITY_THRESHOLD: float = 0.83, language: str = "ch"):
    """Answer *question* from the cache when a close-enough match exists.

    Runs a k=1 relevance search against the per-language vector store.
    Returns the matching ``video_cache`` row data when the best match
    scores at least SIMILARITY_THRESHOLD and its answer is non-null;
    otherwise returns None.  (Annotation fixed: the threshold is a
    float, not an int.)
    """
    print(language)
    vectorstore = vectorstore_list[language]
    print(vectorstore)
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(question, k=1)
    doc, score = docs_and_scores[0]
    print(doc, score)
    if score < SIMILARITY_THRESHOLD:
        return None
    # Renamed from `id` to avoid shadowing the builtin.
    cache_id = get_id_by_question(doc.page_content, language)
    data, count = supabase.table("video_cache").select("*").eq("id", cache_id).execute()
    if data[1][0]["answer"] is None:
        return None
    return data[1]


def ask_question_find_brand(question: str):
    """Ask GPT-4 to extract search keywords from *question*.

    Returns the comma-separated keywords as a list of strings.

    NOTE(review): this uses the pre-1.0 openai SDK interface
    (``openai.ChatCompletion`` and ``message['content']``) — confirm the
    pinned openai package version before upgrading the SDK.
    """
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Extract keywords from the following text for a database search: {question}"},
        ],
        max_tokens=50,
        temperature=0.5,
    )
    keywords = response.choices[0].message['content'].strip().split(", ")
    return keywords


if __name__ == "__main__":
    ####### load from disk #######
    query = "美食街在哪裡"
    # docs = vectorstore.similarity_search(query)
    # print(f"Query: {query} | 最接近文檔:{docs[0].page_content}")
    ####### Query it #########
    query = "101可以帶狗嗎"
    # docs = vectorstore.similarity_search(query)
    # print(f"Query: {query} | 最接近文檔:{docs[0].page_content}")