il y a 10 mois · 85e06ab358
--- a/101_semantic_search.py
+++ b/101_semantic_search.py
@@ -0,0 +1,76 @@
 
				+### Python = 3.9
			
 
				+import os
			
 
				+from dotenv import load_dotenv
			
 
				+load_dotenv('environment.env')
			
 
				+
			
 
				+import openai 
			
 
				+openai_api_key = os.getenv("OPENAI_API_KEY")
			
 
				+openai.api_key = openai_api_key
			
 
				+
			
 
				+from langchain_openai import OpenAIEmbeddings
			
 
				+embeddings_model = OpenAIEmbeddings()
			
 
				+
			
 
				+from langchain_community.document_loaders.csv_loader import CSVLoader
			
 
				+from langchain_chroma import Chroma
			
 
				+
			
 
				+# from supabase import create_client, Client 
			
 
				+# supabase_url = os.getenv("SUPABASE_URL")
			
 
				+# supabase_key = os.getenv("SUPABASE_KEY")
			
 
				+# supabase: Client = create_client(supabase_url, supabase_key)
			
 
				+
			
 
				+############# Load data #############
			
 
				+def extract_field(doc, field_name):
			
 
				+    for line in doc.page_content.split('\n'):
			
 
				+        if line.startswith(f"{field_name}:"):
			
 
				+            return line.split(':', 1)[1].strip()
			
 
				+    return None
			
 
				+
			
 
				+loader = CSVLoader(file_path="video_cache_rows.csv")
			
 
				+data = loader.load()
			
 
				+field_name = "question"
			
 
				+question = [extract_field(doc, field_name) for doc in data]
			
 
				+
			
 
				+# ####### load data from supabase #######
			
 
				+# embeddings_model = OpenAIEmbeddings()
			
 
				+# response = supabase.table("video_cache_rows").select("question").execute()
			
 
				+# data = response.data 
			
 
				+# created_at = []
			
 
				+# question = []
			
 
				+# ids = []
			
 
				+# answer = []
			
 
				+# video_url = []
			
 
				+
			
 
				+# for item in data:
			
 
				+#     ids.append(item['id'])
			
 
				+#     created_at.append(item['created_at'])
			
 
				+#     question.append(item['question'])
			
 
				+#     answer.append(item['answer'])
			
 
				+#     video_url.append(item['video_url'])
			
 
				+
			
 
				+
			
 
				+########## generate embedding ###########
			
 
				+embedding = embeddings_model.embed_documents(question)
			
 
				+
			
 
				+########## Write embedding to the supabase table  #######
			
 
				+# for id, new_embedding in zip(ids, embedding):
			
 
				+#     supabase.table("video_cache_rows_duplicate").insert({"embedding": embedding.tolist()}).eq("id", id).execute()
			
 
				+
			
 
				+######### Vector Store ##########
			
 
				+# Put pre-compute embeddings to vector store. ## save to disk
			
 
				+vectorstore = Chroma.from_texts(
			
 
				+    texts=question,
			
 
				+    embedding=embeddings_model,
			
 
				+    persist_directory="./chroma_db"
			
 
				+    )
			
 
				+
			
 
				+####### load from disk  #######
			
 
				+query = "101可以帶狗嗎"
			
 
				+vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)
			
 
				+docs = vectorstore.similarity_search(query)
			
 
				+print(f"Query: {query}  | 最接近文檔：{docs[0].page_content}")
			
 
				+
			
 
				+####### Query it #########
			
 
				+query = "101可以帶狗嗎"
			
 
				+docs = vectorstore.similarity_search(query)
			
 
				+print(f"Query: {query}  | 最接近文檔：{docs[0].page_content}")
			
 
				+
			
--- a/requirements.txt
+++ b/requirements.txt