# Python 3.9
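"""
Semantic FAQ cache over the Supabase `video_cache` table.

For each supported language this script embeds the cached questions with
OpenAI embeddings, keeps them in an on-disk Chroma vector store, refreshes
the stores on a fixed schedule, and serves lookups whose best match clears
a similarity threshold.
"""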
import os

from dotenv import load_dotenv

load_dotenv()

from openai import OpenAI

# langchain_openai requires openai>=1.0, so use the v1 client API throughout.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader  # only used by the CSV fallback below
from langchain_chroma import Chroma
from supabase import create_client, Client
from apscheduler.schedulers.background import BackgroundScheduler

embeddings_model = OpenAIEmbeddings()

supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

scheduler = BackgroundScheduler(timezone="Asia/Taipei")
############# Load data #############
# CSV fallback: load the same rows from a local export instead of Supabase.
# def extract_field(doc, field_name):
#     for line in doc.page_content.split('\n'):
#         if line.startswith(f"{field_name}:"):
#             return line.split(':', 1)[1].strip()
#     return None
#
# loader = CSVLoader(file_path="video_cache_rows.csv")
# data = loader.load()
# question = [extract_field(doc, "question") for doc in data]

####### Load data from Supabase #######
vectorstore_list = {}
question_id_map_list = {}

def generated(language: str = "ch"):
    """(Re)build the vector store for one language from the video_cache table."""
    response = (
        supabase.table("video_cache")
        .select("question", "id")
        .eq("language", language)
        .order("id")
        .execute()
    )
    data = response.data
    question_id_map = {item["question"]: item["id"] for item in data if "id" in item and "question" in item}
    question_id_map_list[language] = question_id_map
    questions = list(question_id_map)

    ########## Optional: write pre-computed embeddings back to Supabase ##########
    # embeddings = embeddings_model.embed_documents(questions)
    # for row_id, row_embedding in zip(question_id_map.values(), embeddings):
    #     supabase.table("video_cache_rows_duplicate").update(
    #         {"embedding": row_embedding}
    #     ).eq("id", row_id).execute()

    ######### Vector store ##########
    # Chroma.from_texts embeds the questions itself, so no separate
    # embed_documents() call is needed here. Stable string ids let the
    # scheduled rebuilds upsert into the persisted collection instead of
    # appending duplicate rows on every run.
    persist_directory = f"./chroma_db_{language}"
    vectorstore = Chroma.from_texts(
        texts=questions,
        embedding=embeddings_model,
        ids=[str(question_id_map[q]) for q in questions],
        persist_directory=persist_directory,
    )
    # To reopen an existing store without re-embedding:
    # vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings_model)
    vectorstore_list[language] = vectorstore
    print(f"generated {language} ({len(questions)} questions)")
- generated("ch")
- generated("en")
- generated("jp")
- generated("ko")
- scheduler.add_job(generated, 'cron' ,hour='*/2',kwargs={"language" : "ch"})
- scheduler.add_job(generated, 'cron' ,hour='*/2',kwargs={"language" : "en"})
- scheduler.add_job(generated, 'cron' ,hour='*/2',kwargs={"language" : "jp"})
- scheduler.add_job(generated, 'cron' ,hour='*/2',kwargs={"language" : "ko"})
- scheduler.start()
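# With a cron trigger, hour="*/2" fires at minute 0 of every second hour
# (00:00, 02:00, ... Asia/Taipei time). BackgroundScheduler runs its jobs in
# a daemon thread, so if this module is executed directly rather than
# imported by a long-running server, the process must be kept alive for the
# scheduled rebuilds to ever fire.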
def get_id_by_question(question, language):
    return question_id_map_list[language].get(question)

# Alternative: collect every column from the raw rows.
# created_at = []
# question = []
# ids = []
# answer = []
# video_url = []
# for item in data:
#     ids.append(item['id'])
#     created_at.append(item['created_at'])
#     question.append(item['question'])
#     answer.append(item['answer'])
#     video_url.append(item['video_url'])
def ask_question(question: str, SIMILARITY_THRESHOLD: float = 0.83, language: str = "ch"):
    """Return the cached rows for the closest stored question, or None."""
    vectorstore = vectorstore_list[language]
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(question, k=1)
    if not docs_and_scores:
        return None
    doc, score = docs_and_scores[0]
    print(f"[{language}] closest match: {doc.page_content!r} (score={score:.3f})")

    if score >= SIMILARITY_THRESHOLD:
        row_id = get_id_by_question(doc.page_content, language)
        response = supabase.table("video_cache").select("*").eq("id", row_id).execute()
        rows = response.data
        if not rows or rows[0]["answer"] is None:
            return None
        return rows
    return None
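# Usage sketch. Assuming the video_cache rows carry "answer" and "video_url"
# columns (as the commented column list above suggests), a hit looks like:
#
#   rows = ask_question("101可以帶狗嗎", language="ch")
#   if rows:
#       print(rows[0]["answer"], rows[0]["video_url"])
#   else:
#       print("no cached answer above the threshold")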

def ask_question_find_brand(question: str):
    # Use the OpenAI chat completion endpoint to extract search keywords.
    response = openai_client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Extract keywords from the following text for a database search: {question}"},
        ],
        max_tokens=50,
        temperature=0.5,
    )
    # Extract the comma-separated keywords returned by the model.
    keywords = response.choices[0].message.content.strip().split(", ")
    return keywords
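# Example (hypothetical input and output; the comma-space split assumes the
# model answers in the form "keyword1, keyword2, ..."):
#
#   keywords = ask_question_find_brand("Where is the Din Tai Fung in Taipei 101?")
#   # e.g. ["Din Tai Fung", "Taipei 101"]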
if __name__ == "__main__":
    ####### Query the on-disk stores #######
    for query in ["美食街在哪裡", "101可以帶狗嗎"]:
        docs = vectorstore_list["ch"].similarity_search(query)
        print(f"Query: {query} | closest document: {docs[0].page_content}")