### Python = 3.9
import os

from dotenv import load_dotenv

# Load environment variables from .env before any client below reads them.
load_dotenv('.env')

import openai

openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key

from langchain_openai import OpenAIEmbeddings

# Module-level embedding model shared by create_qa_vectordb and semantic_cache.
embeddings_model = OpenAIEmbeddings()

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma
import pandas as pd
import re
# NOTE(review): this re-import rebinds the name OpenAIEmbeddings after
# embeddings_model has already been built from langchain_openai; it looks
# redundant -- confirm nothing else in the project relies on it.
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import create_client


def create_qa_vectordb(supabase, vectordb_directory="./chroma_db_carbon_questions"):
    """Rebuild the on-disk Chroma index of the questions in ``QA_database``.

    If ``vectordb_directory`` already exists, the collection stored there is
    deleted first. All ``Question`` values are then read from the Supabase
    table ``QA_database`` and indexed with ``embeddings_model``.

    Args:
        supabase: Supabase client used to read the ``QA_database`` table.
        vectordb_directory: Directory in which Chroma persists the index.

    Returns:
        The newly created ``Chroma`` vector store.
    """
    if os.path.isdir(vectordb_directory):
        # Drop the previous collection so the rebuilt index does not keep
        # entries for questions that are no longer in the table.
        stale_store = Chroma(
            persist_directory=vectordb_directory,
            embedding_function=embeddings_model,
        )
        stale_store.delete_collection()

    response = supabase.table("QA_database").select("Question, Answer").execute()
    questions = [row["Question"] for row in response.data]

    # from_texts embeds the questions with embeddings_model and saves the
    # resulting collection to disk under vectordb_directory.
    vectorstore = Chroma.from_texts(
        texts=questions,
        embedding=embeddings_model,
        persist_directory=vectordb_directory,
    )
    return vectorstore


def semantic_cache(supabase, q, SIMILARITY_THRESHOLD=0.83, k=1, vectordb_directory="./chroma_db_carbon_questions"):
    """Look up a cached answer for a question that is semantically close to ``q``.

    The Chroma index in ``vectordb_directory`` is opened, or built with
    ``create_qa_vectordb`` when the directory does not exist yet. Up to ``k``
    nearest stored questions are examined from most to least relevant; the
    first one whose relevance score reaches ``SIMILARITY_THRESHOLD`` and that
    still has a row in ``QA_database`` is returned.

    Args:
        supabase: Supabase client used to read the ``QA_database`` table.
        q: The incoming question text.
        SIMILARITY_THRESHOLD: Minimum relevance score for a cache hit.
        k: Number of nearest questions to consider (values below 1 are
            treated as 1).
        vectordb_directory: Directory in which Chroma persists the index.

    Returns:
        ``(cached_question, answer)`` on a hit, otherwise ``(None, None)``.
        ``(None, None)`` is also returned when the index yields no results or
        the matched question has no row in the table.
    """
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(
            persist_directory=vectordb_directory,
            embedding_function=embeddings_model,
        )
    else:
        print("create new vector db ...")
        vectorstore = create_qa_vectordb(supabase, vectordb_directory)

    # Honour the caller's k; clamp so a k <= 0 still performs a lookup, as it
    # did when the value was ignored.
    candidates = vectorstore.similarity_search_with_relevance_scores(q, k=max(k, 1))

    # Sort explicitly rather than relying on the store's result order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    for doc, score in ranked:
        if score < SIMILARITY_THRESHOLD:
            # Remaining candidates score even lower; nothing can match.
            break
        cache_question = doc.page_content
        response = (
            supabase.table("QA_database")
            .select("Question, Answer")
            .eq("Question", cache_question)
            .execute()
        )
        # The index may be stale relative to the table; skip missing rows
        # instead of raising IndexError.
        if response.data:
            return cache_question, response.data[0]["Answer"]

    return None, None