# 101_semantic_search.py
  1. ### Python = 3.9
  2. import os
  3. from dotenv import load_dotenv
  4. load_dotenv('environment.env')
  5. import openai
  6. openai_api_key = os.getenv("OPENAI_API_KEY")
  7. openai.api_key = openai_api_key
  8. from langchain_openai import OpenAIEmbeddings
  9. embeddings_model = OpenAIEmbeddings()
  10. from langchain_community.document_loaders.csv_loader import CSVLoader
  11. from langchain_chroma import Chroma
  12. # from supabase import create_client, Client
  13. # supabase_url = os.getenv("SUPABASE_URL")
  14. # supabase_key = os.getenv("SUPABASE_KEY")
  15. # supabase: Client = create_client(supabase_url, supabase_key)
  16. ############# Load data #############
  17. def extract_field(doc, field_name):
  18. for line in doc.page_content.split('\n'):
  19. if line.startswith(f"{field_name}:"):
  20. return line.split(':', 1)[1].strip()
  21. return None
  22. loader = CSVLoader(file_path="video_cache_rows.csv")
  23. data = loader.load()
  24. field_name = "question"
  25. question = [extract_field(doc, field_name) for doc in data]
  26. # ####### load data from supabase #######
  27. # embeddings_model = OpenAIEmbeddings()
  28. # response = supabase.table("video_cache_rows").select("question").execute()
  29. # data = response.data
  30. # created_at = []
  31. # question = []
  32. # ids = []
  33. # answer = []
  34. # video_url = []
  35. # for item in data:
  36. # ids.append(item['id'])
  37. # created_at.append(item['created_at'])
  38. # question.append(item['question'])
  39. # answer.append(item['answer'])
  40. # video_url.append(item['video_url'])
  41. ########## generate embedding ###########
  42. embedding = embeddings_model.embed_documents(question)
  43. ########## Write embedding to the supabase table #######
  44. # for id, new_embedding in zip(ids, embedding):
  45. # supabase.table("video_cache_rows_duplicate").insert({"embedding": embedding.tolist()}).eq("id", id).execute()
  46. ######### Vector Store ##########
  47. # Put pre-compute embeddings to vector store. ## save to disk
  48. vectorstore = Chroma.from_texts(
  49. texts=question,
  50. embedding=embeddings_model,
  51. persist_directory="./chroma_db"
  52. )
  53. ####### load from disk #######
  54. query = "101可以帶狗嗎"
  55. vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)
  56. docs = vectorstore.similarity_search(query)
  57. print(f"Query: {query} | 最接近文檔:{docs[0].page_content}")
  58. ####### Query it #########
  59. query = "101可以帶狗嗎"
  60. docs = vectorstore.similarity_search(query)
  61. print(f"Query: {query} | 最接近文檔:{docs[0].page_content}")