# Python 3.9
import os

import openai
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import OpenAIEmbeddings

# Load API keys from the local env file before any client is created.
load_dotenv('../environment.env')
openai.api_key = os.getenv("OPENAI_API_KEY")

# OpenAIEmbeddings also picks up OPENAI_API_KEY from the environment.
embeddings_model = OpenAIEmbeddings()
# from supabase import create_client, Client
# supabase_url = os.getenv("SUPABASE_URL")
# supabase_key = os.getenv("SUPABASE_KEY")
# supabase: Client = create_client(supabase_url, supabase_key)
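# (Assumes SUPABASE_URL and SUPABASE_KEY are defined in environment.env
# alongside OPENAI_API_KEY.)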
# ############# Load data from a local CSV export #############
# def extract_field(doc, field_name):
#     for line in doc.page_content.split('\n'):
#         if line.startswith(f"{field_name}:"):
#             return line.split(':', 1)[1].strip()
#     return None
# loader = CSVLoader(file_path="../video_cache_rows.csv")
# data = loader.load()
# field_name = "question"
# question = [extract_field(doc, field_name) for doc in data]
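# CSVLoader emits one Document per CSV row, with page_content rendered as
# "column: value" lines, which is why extract_field scans for a line prefix.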
# ####### Load data from Supabase #######
# embeddings_model = OpenAIEmbeddings()
# # Select every column the loop below reads; selecting only "question"
# # would drop id/answer/video_url from the response and raise KeyError.
# response = supabase.table("video_cache_rows").select("id, created_at, question, answer, video_url").execute()
# data = response.data
# created_at = []
# question = []
# ids = []
# answer = []
# video_url = []
# for item in data:
#     ids.append(item['id'])
#     created_at.append(item['created_at'])
#     question.append(item['question'])
#     answer.append(item['answer'])
#     video_url.append(item['video_url'])
########## Generate embeddings ###########
# embeddings = embeddings_model.embed_documents(question)
########## Write embeddings back to the Supabase table #######
# for row_id, new_embedding in zip(ids, embeddings):
#     # update() targets the existing row; insert() cannot be chained with .eq(),
#     # and embed_documents() already returns plain lists (no .tolist() needed).
#     supabase.table("video_cache_rows_duplicate").update({"embedding": new_embedding}).eq("id", row_id).execute()
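# A hypothetical follow-up (not in the original script): once embeddings live
# in Supabase/pgvector, nearest-neighbour search can run server-side through a
# SQL function exposed over RPC. "match_questions" is an assumed function name
# that would have to be created in the database first.
# matches = supabase.rpc(
#     "match_questions",
#     {"query_embedding": embeddings_model.embed_query("101幾點關門"), "match_count": 1},
# ).execute()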
# ######### Vector store ##########
# # Embed the cached questions and persist the vector store to disk.
# vectorstore = Chroma.from_texts(
#     texts=question,
#     embedding=embeddings_model,
#     persist_directory="./chroma_db"
# )
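# A minimal extension (an assumption, not part of the original pipeline):
# storing each row's answer and video_url as metadata would let a cache hit
# return the stored reply directly instead of just the matched question.
# vectorstore = Chroma.from_texts(
#     texts=question,
#     embedding=embeddings_model,
#     metadatas=[{"answer": a, "video_url": v} for a, v in zip(answer, video_url)],
#     persist_directory="./chroma_db"
# )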
# ####### Load the persisted vector store from disk #######
vectorstore = Chroma(persist_directory="../chroma_db", embedding_function=embeddings_model)
# Quick sanity check (similarity_search expects a single string, not a list):
# for q in ["狗狗可以進101嗎", "哪裡有賣珍奶", "101幾點關門", "101星期天有開嗎", "球鞋哪裡有賣"]:
#     docs = vectorstore.similarity_search(q)
#     print(f"Query: {q} | closest doc: {docs[0].page_content}")
####### Query it #########
def search_similarity(query, threshold):
    """Print the best cached match for `query`, or report a cache miss."""
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(query, k=1)
    doc, score = docs_and_scores[0]
    if score >= threshold:
        print(f"Query: {query} | closest doc: {doc.page_content} | score: {round(score, 2)}")
    else:
        print(f"Query: {query} | no relevant match | score: {round(score, 2)}")

# Test queries: visitor FAQs about Taipei 101 (pets, opening hours, tickets,
# tax refunds, etc.), kept in Chinese to match the cached questions.
queries = ["狗狗可以進101嗎", "哪裡有賣珍奶", "遺失物品哪裡找", "嬰兒車可以進電梯嗎", "101幾點關門", "101星期天有開嗎", "球鞋哪裡有賣", "殘障人士租用輪椅", "停車多少錢", "觀景台導覽", "觀景台電梯速度", "我去哪裡買觀景台的票", "觀景台的票多少錢", "101有透明地板嗎", "如何辦退稅", "紀念品可以退稅嗎", "哪裡可以退稅", "101網路可以訂票嗎"]

# Define a similarity threshold below which a query counts as a cache miss.
SIMILARITY_THRESHOLD = 0.83
for q in queries:
    search_similarity(q, SIMILARITY_THRESHOLD)
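# Note: with Chroma, similarity_search_with_relevance_scores returns scores
# normalized to [0, 1] (1 = most similar), so 0.83 is a fairly strict cutoff;
# it is worth tuning against real query logs rather than treating it as fixed.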