|
@@ -0,0 +1,76 @@
|
|
|
+### Python = 3.9
|
|
|
import hashlib
import os

import openai
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import OpenAIEmbeddings

# Load variables (e.g. OPENAI_API_KEY) from the local env file before they are read.
load_dotenv('environment.env')

openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key

embeddings_model = OpenAIEmbeddings()
|
|
|
+
|
|
|
+# from supabase import create_client, Client
|
|
|
+# supabase_url = os.getenv("SUPABASE_URL")
|
|
|
+# supabase_key = os.getenv("SUPABASE_KEY")
|
|
|
+# supabase: Client = create_client(supabase_url, supabase_key)
|
|
|
+
|
|
|
+############# Load data #############
|
|
|
def extract_field(doc, field_name):
    """Return the value of the first ``field_name: value`` line in a document.

    Args:
        doc: Object exposing a ``page_content`` string made of
            newline-separated lines.
        field_name: Name that must appear at the very start of a line,
            immediately followed by a colon.

    Returns:
        The text after the first colon of the first matching line, with
        surrounding whitespace stripped, or ``None`` when no line matches.
    """
    prefix = f"{field_name}:"
    # partition(":") cuts at the first colon only, so colons inside the value
    # are preserved.
    values = (
        row.partition(":")[2].strip()
        for row in doc.page_content.split("\n")
        if row.startswith(prefix)
    )
    return next(values, None)
|
|
|
+
|
|
|
# Load every CSV row as a document and pull out its "question" value.
loader = CSVLoader(file_path="video_cache_rows.csv")
data = loader.load()
field_name = "question"
# extract_field() returns None when a row has no "question:" line. None is not
# valid text for the embedding / vector-store calls further down, and a blank
# question carries nothing to search on, so both are left out of the corpus.
question = [
    text
    for text in (extract_field(doc, field_name) for doc in data)
    if text
]
|
|
|
+
|
|
|
+# ####### load data from supabase #######
|
|
|
+# embeddings_model = OpenAIEmbeddings()
|
|
|
+# response = supabase.table("video_cache_rows").select("question").execute()
|
|
|
+# data = response.data
|
|
|
+# created_at = []
|
|
|
+# question = []
|
|
|
+# ids = []
|
|
|
+# answer = []
|
|
|
+# video_url = []
|
|
|
+
|
|
|
+# for item in data:
|
|
|
+# ids.append(item['id'])
|
|
|
+# created_at.append(item['created_at'])
|
|
|
+# question.append(item['question'])
|
|
|
+# answer.append(item['answer'])
|
|
|
+# video_url.append(item['video_url'])
|
|
|
+
|
|
|
+
|
|
|
########## generate embedding ###########
# Chroma.from_texts() in the "Vector Store" section embeds `question` itself
# through `embeddings_model`, and nothing else in this script reads a
# precomputed `embedding` list. Calling embed_documents() here as well only
# sent every text to the embeddings API a second time, so the call is
# disabled. Re-enable it together with the Supabase write below if the raw
# vectors are needed.
# embedding = embeddings_model.embed_documents(question)

########## Write embedding to the supabase table #######
# NOTE(review): the disabled loop below needs fixing before it is re-enabled:
# it never uses `new_embedding`, and it calls .tolist() on `embedding`, which
# embed_documents() returns as a plain list of float lists.
# for id, new_embedding in zip(ids, embedding):
#     supabase.table("video_cache_rows_duplicate").insert({"embedding": embedding.tolist()}).eq("id", id).execute()
|
|
|
+
|
|
|
######### Vector Store ##########
# Build (or refresh) the persisted Chroma collection. from_texts() embeds the
# texts with `embeddings_model` and stores them under ./chroma_db.
#
# Each text gets a deterministic id derived from its content. Without explicit
# ids random ones are generated, so every run of this script would append
# another full copy of the corpus to the persisted collection; with stable ids
# a re-run overwrites the existing entries instead. Ids must be unique within
# one call, hence the order-preserving de-duplication (None / blank entries
# are dropped because they can be neither hashed nor embedded).
unique_questions = list(dict.fromkeys(text for text in question if text))
question_ids = [
    hashlib.sha256(text.encode("utf-8")).hexdigest()
    for text in unique_questions
]
vectorstore = Chroma.from_texts(
    texts=unique_questions,
    embedding=embeddings_model,
    ids=question_ids,
    persist_directory="./chroma_db",
)
|
|
|
+
|
|
|
def _search_and_report(store, query):
    """Run a similarity search, print the closest stored text, return the hits.

    Args:
        store: Vector store exposing ``similarity_search(query)``.
        query: Text to look up.

    Returns:
        The list of documents returned by the search (possibly empty).
    """
    hits = store.similarity_search(query)
    if not hits:
        # With no hits there is no hits[0]; report it instead of raising
        # IndexError.
        print(f"Query: {query} | no matching document found")
        return hits
    print(f"Query: {query} | 最接近文檔:{hits[0].page_content}")
    return hits


####### load from disk #######
query = "101可以帶狗嗎"
# Re-open the collection from ./chroma_db instead of reusing the handle
# returned when the store was built.
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)
docs = _search_and_report(vectorstore, query)

####### Query it #########
query = "101可以帶狗嗎"
docs = _search_and_report(vectorstore, query)
|
|
|
+
|