semantic_search.py

# Python 3.9

import os
import re

import pandas as pd
from dotenv import load_dotenv

# Load the OpenAI API key from the local .env file.
load_dotenv(".env")

import openai

openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key

from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma, SupabaseVectorStore
from supabase.client import create_client

# Shared embedding model, used both for indexing and for querying.
embeddings_model = OpenAIEmbeddings()
def create_qa_vectordb(supabase, vectordb_directory="./chroma_db_carbon_questions"):
    """Rebuild the Chroma vector store from the questions stored in Supabase."""
    # Drop any existing collection so the index is rebuilt from scratch.
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
        vectorstore.delete_collection()

    # Pull every cached question from the Supabase QA table.
    response = supabase.table("QA_database").select("Question, Answer").execute()
    questions = [row["Question"] for row in response.data]

    ########## Generate embeddings ##########
    # embedding = embeddings_model.embed_documents(questions)
    ########## Write embeddings back to the Supabase table ##########
    # for id, new_embedding in zip(ids, embedding):
    #     supabase.table("video_cache_rows_duplicate").insert({"embedding": new_embedding.tolist()}).eq("id", id).execute()

    ########## Vector store ##########
    # Embed the questions and persist the vector store to disk.
    vectorstore = Chroma.from_texts(
        texts=questions,
        embedding=embeddings_model,
        persist_directory=vectordb_directory,
    )
    return vectorstore
    # vectorstore = Chroma(persist_directory="./chroma_db_carbon_questions", embedding_function=embeddings_model)
def semantic_cache(supabase, q, SIMILARITY_THRESHOLD=0.83, k=1, vectordb_directory="./chroma_db_carbon_questions"):
    """Return a cached (question, answer) pair when a stored question is similar enough to q."""
    # Reuse the persisted vector store if it exists; otherwise build it from Supabase.
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
    else:
        print("create new vector db ...")
        vectorstore = create_qa_vectordb(supabase, vectordb_directory)

    # Retrieve the k most similar cached questions with relevance scores in [0, 1].
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(q, k=k)
    doc, score = docs_and_scores[0]

    if score >= SIMILARITY_THRESHOLD:
        # Look up the stored answer for the matched question.
        cache_question = doc.page_content
        response = supabase.table("QA_database").select("Question, Answer").eq("Question", cache_question).execute()
        # qa_df = pd.DataFrame(response.data)
        # print(response.data[0])
        answer = response.data[0]["Answer"]
        # video_cache = response.data[0]["video_cache"]
        return cache_question, answer
    else:
        return None, None
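
A minimal usage sketch (not part of the original file): it assumes the Supabase URL and key live in the same .env file under the names SUPABASE_URL and SUPABASE_KEY, and uses a made-up carbon question; both the variable names and the question are assumptions, not confirmed by the code above.

# Usage sketch, assuming SUPABASE_URL / SUPABASE_KEY env vars (hypothetical names).
if __name__ == "__main__":
    supabase_url = os.getenv("SUPABASE_URL")  # assumed env var name
    supabase_key = os.getenv("SUPABASE_KEY")  # assumed env var name
    supabase = create_client(supabase_url, supabase_key)

    # Hypothetical query; any user question would be passed here.
    cached_question, cached_answer = semantic_cache(supabase, "What is a carbon footprint?")
    if cached_answer is not None:
        print(f"cache hit on: {cached_question}")
        print(cached_answer)
    else:
        print("cache miss - fall back to the full QA pipeline")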