semantic_search.py

# Python 3.9
import os

from dotenv import load_dotenv

# .env is expected to define OPENAI_API_KEY (and, for the usage sketch at the
# bottom of this file, the Supabase connection settings).
load_dotenv(".env")

import openai
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from supabase.client import create_client

# OpenAIEmbeddings picks up OPENAI_API_KEY from the environment; setting
# openai.api_key as well keeps any direct openai calls working.
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key

embeddings_model = OpenAIEmbeddings()
def grandson_vectordb(vectordb_directory="./chroma_grandson"):
    # Seed questions for the cache: "我的跨損啊" (colloquial "my grandson!")
    # and "我要看孫子" ("I want to see my grandson").
    questions = ["我的跨損啊", "我要看孫子"]
    # Embed the seed questions and persist them to a local Chroma store.
    vectorstore = Chroma.from_texts(
        texts=questions,
        embedding=embeddings_model,
        persist_directory=vectordb_directory,
    )
    return vectorstore
def grandson_semantic_cache(q, SIMILARITY_THRESHOLD=0.83, k=1, vectordb_directory="./chroma_grandson"):
    # Reuse the persisted vector DB if it exists; otherwise build it.
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
    else:
        print("create new vector db ...")
        vectorstore = grandson_vectordb(vectordb_directory)
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(q, k=k)
    doc, score = docs_and_scores[0]
    if score >= SIMILARITY_THRESHOLD:
        cache_question = doc.page_content
        # Canned reply: "You have three grandchildren; the boy is in elementary
        # school. Would you like to see his photo?"
        answer = "你有三個孫子,男生在國小念書,你要看他照片嗎"
        return cache_question, answer
    else:
        return None, None
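
# Usage sketch for the cache above (the query string is illustrative, and 0.83
# is simply the default threshold defined in the signature):
# hit, answer = grandson_semantic_cache("阿嬤想看孫子")  # "Grandma wants to see her grandson"
# A sufficiently similar query returns the matched cached question and the
# canned answer; anything below the threshold returns (None, None).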
def create_qa_vectordb(supabase, vectordb_directory="./chroma_db_carbon_questions"):
    # Rebuild from scratch: drop any existing collection before re-embedding.
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
        vectorstore.delete_collection()
    # Pull the cached questions from Supabase.
    response = supabase.table("QA_database").select("Question, Answer").execute()
    questions = [row["Question"] for row in response.data]
    # Embeddings could also be pre-computed and written back to a Supabase
    # table (see the sketch after this function):
    # embeddings = embeddings_model.embed_documents(questions)
    # Put the embedded questions into a vector store, persisted to disk.
    vectorstore = Chroma.from_texts(
        texts=questions,
        embedding=embeddings_model,
        persist_directory=vectordb_directory,
    )
    return vectorstore

# To load the persisted store directly:
# vectorstore = Chroma(persist_directory="./chroma_db_carbon_questions", embedding_function=embeddings_model)
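
# A minimal sketch of the embedding write-back hinted at in the comments
# above. It assumes a table with "id" and "embedding" columns; the table name
# "video_cache_rows_duplicate" comes from the original comment and may differ
# in your schema. Note that embed_documents already returns plain lists of
# floats, so no .tolist() conversion is needed.
def write_embeddings_to_supabase(supabase, ids, questions):
    embeddings = embeddings_model.embed_documents(questions)
    for row_id, emb in zip(ids, embeddings):
        supabase.table("video_cache_rows_duplicate").update(
            {"embedding": emb}
        ).eq("id", row_id).execute()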
def semantic_cache(supabase, q, SIMILARITY_THRESHOLD=0.83, k=1, vectordb_directory="./chroma_db_carbon_questions"):
    # Reuse the persisted vector DB if it exists; otherwise build it from Supabase.
    if os.path.isdir(vectordb_directory):
        vectorstore = Chroma(persist_directory=vectordb_directory, embedding_function=embeddings_model)
    else:
        print("create new vector db ...")
        vectorstore = create_qa_vectordb(supabase, vectordb_directory)
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(q, k=k)
    doc, score = docs_and_scores[0]
    if score >= SIMILARITY_THRESHOLD:
        # Close enough to a cached question: look up its stored answer.
        cache_question = doc.page_content
        response = supabase.table("QA_database").select("Question, Answer").eq("Question", cache_question).execute()
        answer = response.data[0]["Answer"]
        return cache_question, answer
    else:
        return None, None
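
# Minimal end-to-end sketch, assuming SUPABASE_URL and SUPABASE_KEY are set in
# .env (these variable names and the sample query are this example's
# assumptions, not requirements of the code above).
if __name__ == "__main__":
    supabase = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
    # "How do I calculate carbon emissions?" against the carbon-questions cache.
    question, answer = semantic_cache(supabase, "如何計算碳排放?")
    if answer is None:
        print("cache miss; fall back to the full QA pipeline")
    else:
        print(f"cache hit on {question!r}: {answer}")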