# semantic_search.py (2.2 KB)
  1. ### Python = 3.9
  2. import os
  3. from dotenv import load_dotenv
  4. load_dotenv('.env')
  5. import openai
  6. openai_api_key = os.getenv("OPENAI_API_KEY")
  7. openai.api_key = openai_api_key
  8. from langchain_openai import OpenAIEmbeddings
  9. embeddings_model = OpenAIEmbeddings()
  10. from langchain_community.document_loaders.csv_loader import CSVLoader
  11. from langchain_community.vectorstores import Chroma
  12. import pandas as pd
  13. import re
  14. from langchain_community.embeddings.openai import OpenAIEmbeddings
  15. from langchain_community.vectorstores import SupabaseVectorStore
  16. from supabase.client import create_client
  17. def create_qa_vectordb(supabase, vectordb_directory="./chroma_db"):
  18. if os.path.isdir(vectordb_directory):
  19. vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
  20. vectorstore.delete_collection()
  21. response = supabase.table("INNOLUX_cache").select("question, answer").execute()
  22. questions = [row["question"] for row in response.data]
  23. vectorstore = Chroma.from_texts(
  24. texts=questions,
  25. embedding=embeddings_model,
  26. persist_directory=vectordb_directory
  27. )
  28. return vectorstore
  29. def semantic_cache(supabase, q, SIMILARITY_THRESHOLD=0.83, k=1, vectordb_directory="./chroma_db"):
  30. if os.path.isdir(vectordb_directory):
  31. vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
  32. else:
  33. print("create new vector db ...")
  34. vectorstore = create_qa_vectordb(supabase, vectordb_directory)
  35. docs_and_scores = vectorstore.similarity_search_with_relevance_scores(q, k=1)
  36. doc, score = docs_and_scores[0]
  37. print(score)
  38. if score >= SIMILARITY_THRESHOLD:
  39. cache_question = doc.page_content
  40. print(cache_question)
  41. # response = supabase.table("INNOLUX_cache").select("question, answer").eq("question", cache_question).execute()
  42. response = supabase.table("INNOLUX_cache").select("question, answer, video_url").eq("question", cache_question).execute()
  43. print(response.data)
  44. answer = response.data[0]["answer"]
  45. video_cache = response.data[0]["video_url"]
  46. return cache_question, answer, video_cache
  47. else:
  48. return None, None, None