### Python 3.9
Environment setup and the shared embedding model:

```python
import os
import re

import pandas as pd
from dotenv import load_dotenv

# Load OPENAI_API_KEY (and any other secrets) from the local .env file.
load_dotenv('.env')

import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma, SupabaseVectorStore
from supabase.client import create_client

# One embedding model shared by every vector store below.
embeddings_model = OpenAIEmbeddings()
```
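As a quick sanity check that the key from `.env` was picked up, the shared model can be exercised directly. This is only an illustrative sketch; the dimensionality shown assumes OpenAI's default embedding model:

```python
# Hypothetical smoke test: embed two short strings with the shared model.
vectors = embeddings_model.embed_documents(["hello", "world"])
print(len(vectors), len(vectors[0]))  # e.g. 2 and 1536 with OpenAI's default embedding model
```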
A small local semantic cache: a fixed list of questions is embedded into a persistent Chroma store, and an incoming query that is similar enough to one of them gets a canned answer back.

```python
def grandson_vectordb(vectordb_directory="./chroma_grandson"):
    # Questions to pre-cache ("我的跨損啊" ≈ "My dear grandson!", "我要看孫子" = "I want to see my grandson").
    questions = ['我的跨損啊', "我要看孫子"]

    # Embed the questions and persist them to a local Chroma store.
    vectorstore = Chroma.from_texts(
        texts=questions,
        embedding=embeddings_model,
        persist_directory=vectordb_directory
    )
    return vectorstore


def grandson_semantic_cache(q, SIMILARITY_THRESHOLD=0.83, k=1, vectordb_directory="./chroma_grandson"):
    # Reuse the persisted store if it already exists; otherwise build it first.
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
    else:
        print("create new vector db ...")
        vectorstore = grandson_vectordb(vectordb_directory)

    # Retrieve the k most similar cached questions with normalized relevance scores.
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(q, k=k)
    doc, score = docs_and_scores[0]

    if score >= SIMILARITY_THRESHOLD:
        # Cache hit: return the matched question and the canned answer
        # ("You have three grandsons; the boy is in elementary school. Do you want to see his photo?").
        cache_question = doc.page_content
        answer = "你有三個孫子,男生在國小念書,你要看他照片嗎"
        return cache_question, answer
    else:
        return None, None
```
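A minimal usage sketch for the cache above; the query string here is just an illustration and not part of the module:

```python
# Illustrative query (≈ "I want to see photos of my grandson").
query = "我想看孫子的照片"

cached_question, cached_answer = grandson_semantic_cache(query)
if cached_answer is not None:
    print(f"cache hit on: {cached_question}")
    print(cached_answer)
else:
    # No cached question was similar enough; fall back to the normal pipeline.
    print("cache miss")
```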
The same pattern backed by Supabase: questions are pulled from the `QA_database` table, embedded into a persistent Chroma store, and a cache hit looks the matching answer up in Supabase.

```python
def create_qa_vectordb(supabase, vectordb_directory="./chroma_db_carbon_questions"):
    # If a store already exists on disk, drop its collection so it can be rebuilt from scratch.
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
        vectorstore.delete_collection()

    # Pull all question/answer pairs from Supabase; only the questions get embedded,
    # the answers are looked up again at query time in semantic_cache().
    response = supabase.table("QA_database").select("Question, Answer").execute()
    questions = [row["Question"] for row in response.data]

    # Alternative (unused): pre-compute embeddings and write them back to a Supabase
    # table, where `ids` would be the matching row ids.
    # embeddings = embeddings_model.embed_documents(questions)
    # for row_id, new_embedding in zip(ids, embeddings):
    #     supabase.table("video_cache_rows_duplicate").update({"embedding": new_embedding}).eq("id", row_id).execute()

    # Embed the questions and persist them to a local Chroma store.
    vectorstore = Chroma.from_texts(
        texts=questions,
        embedding=embeddings_model,
        persist_directory=vectordb_directory
    )
    return vectorstore

# An existing store can also be reopened directly:
# vectorstore = Chroma(persist_directory="./chroma_db_carbon_questions", embedding_function=embeddings_model)


def semantic_cache(supabase, q, SIMILARITY_THRESHOLD=0.83, k=1, vectordb_directory="./chroma_db_carbon_questions"):
    # Reuse the persisted question store if it exists; otherwise rebuild it from Supabase.
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
    else:
        print("create new vector db ...")
        vectorstore = create_qa_vectordb(supabase, vectordb_directory)

    # Retrieve the k most similar cached questions with normalized relevance scores.
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(q, k=k)
    doc, score = docs_and_scores[0]

    if score >= SIMILARITY_THRESHOLD:
        # Cache hit: fetch the stored answer for the matched question from Supabase.
        cache_question = doc.page_content
        response = supabase.table("QA_database").select("Question, Answer").eq("Question", cache_question).execute()
        answer = response.data[0]["Answer"]
        # video_cache = response.data[0]["video_cache"]  # extra column available in the same row
        return cache_question, answer
    else:
        return None, None
```
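A usage sketch for the Supabase-backed cache. The `SUPABASE_URL` / `SUPABASE_KEY` environment variable names and the sample query are assumptions for illustration; adapt them to the actual project configuration:

```python
# Build the Supabase client (assumed env var names - not defined in the module above).
supabase = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))

# Rebuild the question vector store from the QA_database table once...
create_qa_vectordb(supabase)

# ...then serve incoming questions from the cache when possible.
cached_question, cached_answer = semantic_cache(supabase, "如何計算碳排放?")  # "How do I calculate carbon emissions?"
if cached_answer is None:
    # Nothing similar enough was cached; hand the question to the full QA pipeline instead.
    print("cache miss")
else:
    print(cached_question, cached_answer)
```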