import os
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from json import loads
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import faiss
import pickle

# Get the current script's directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
env_path = os.path.join(parent_dir, 'environment.env')
load_dotenv(env_path)

URI = os.getenv("SUPABASE_URI")
openai_api_key = os.getenv("OPENAI_API_KEY")

EMBEDDINGS_FILE = 'qa_embeddings.pkl'
FAISS_INDEX_FILE = 'qa_faiss_index.bin'
CSV_FILE = 'log_record_rows.csv'


def gen_doc_from_database():
    engine = create_engine(URI, echo=True)
    df = pd.read_sql_table("log_record", engine.connect())
    # Round-trip through JSON to keep only the question/answer columns
    # and normalize the index ('index' is the valid orient here, not 'id')
    result = df[['question', 'answer']].to_json(orient='index', force_ascii=False)
    result = loads(result)
    df = pd.DataFrame(result).T
    df.drop_duplicates(subset=['question', 'answer'], keep='first', inplace=True)
    print(f"Number of records after removing duplicates: {len(df)}")
    qa_doc = []
    for i in range(len(df)):
        question = df.iloc[i]['question']
        answer = df.iloc[i]['answer']
        context = f'question: {question}\nanswer: {answer}'
        qa_doc.append(Document(page_content=context))
    return qa_doc, df


def gen_doc_from_csv(csv_filename=CSV_FILE):
    csv_path = os.path.join(current_dir, csv_filename)
    df = pd.read_csv(csv_path)
    df.drop_duplicates(subset=['question', 'answer'], keep='first', inplace=True)
    print(f"Number of records after removing duplicates: {len(df)}")
    qa_doc = []
    for _, row in df.iterrows():
        question = row['question']
        answer = row['answer']
        context = f'question: {question}\nanswer: {answer}'
        qa_doc.append(Document(page_content=context))
    return qa_doc, df


def create_embeddings(docs):
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    print("Creating embeddings...")
    doc_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])
    print(f"Created {len(doc_embeddings)} embeddings.")
    print(f"Each embedding is a vector of length {len(doc_embeddings[0])}.")
    return doc_embeddings


def create_faiss_index(embeddings):
    dimension = len(embeddings[0])
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings).astype('float32'))
    return index


def save_embeddings(embeddings, docs, df):
    with open(os.path.join(current_dir, EMBEDDINGS_FILE), 'wb') as f:
        pickle.dump({'embeddings': embeddings, 'docs': docs, 'df': df}, f)
    index = create_faiss_index(embeddings)
    faiss.write_index(index, os.path.join(current_dir, FAISS_INDEX_FILE))
    print(f"Saved embeddings to {EMBEDDINGS_FILE} and FAISS index to {FAISS_INDEX_FILE}")


def load_embeddings():
    embeddings_path = os.path.join(current_dir, EMBEDDINGS_FILE)
    faiss_path = os.path.join(current_dir, FAISS_INDEX_FILE)
    if os.path.exists(embeddings_path) and os.path.exists(faiss_path):
        with open(embeddings_path, 'rb') as f:
            data = pickle.load(f)
        index = faiss.read_index(faiss_path)
        print("Loaded existing embeddings and FAISS index from files")
        return data['embeddings'], data['docs'], data['df'], index
    else:
        raise FileNotFoundError(
            "Embeddings or FAISS index file not found. "
            "Please run embeddings.py first."
        )
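# Note on scoring in similarity_search below: faiss.IndexFlatL2 returns
# squared L2 distances, so each method maps a distance d >= 0 onto a
# similarity score in (0, 1] that can be compared against `threshold`:
#   'logistic':    1 / (1 + d)
#   'exponential': exp(-d)
#   'gaussian':    exp(-d**2 / (2 * sigma**2))
# The 'exponential' and 'gaussian' mappings decay faster than 'logistic',
# so the default threshold of 0.83 would likely need retuning for them.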
def similarity_search(query, index, docs, k=3, threshold=0.83, method='logistic', sigma=1.0):
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    query_vector = embeddings.embed_query(query)
    # Search for more than k results to improve the odds of finding k unique documents
    D, I = index.search(np.array([query_vector]).astype('float32'), k * 2)
    results = []
    seen_docs = set()  # Tracks documents we have already returned
    for dist, idx in zip(D[0], I[0]):
        if method == 'logistic':
            similarity = 1 / (1 + dist)
        elif method == 'exponential':
            similarity = np.exp(-dist)
        elif method == 'gaussian':
            similarity = np.exp(-dist**2 / (2 * sigma**2))
        else:
            raise ValueError("Unknown similarity method")
        if similarity >= threshold:
            doc_content = docs[idx].page_content
            if doc_content not in seen_docs:  # Skip documents we have already added
                results.append((docs[idx], similarity))
                seen_docs.add(doc_content)
                if len(results) == k:  # Stop once k unique documents have been found
                    break
    return results


def main():
    # Choose which method to use for generating documents
    use_csv = True  # Set this to False to read from the database instead
    if use_csv:
        docs, df = gen_doc_from_csv()
    else:
        docs, df = gen_doc_from_database()
    embeddings = create_embeddings(docs)
    save_embeddings(embeddings, docs, df)


if __name__ == "__main__":
    main()
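

# --- Usage sketch (illustrative only, not called by main) -------------------
# A minimal example of the retrieval side, assuming main() has already been
# run so the pickle and FAISS files exist. The default query is a placeholder.
def demo_search(query="example question"):
    _, docs, _, index = load_embeddings()
    for doc, score in similarity_search(query, index, docs, k=3):
        print(f"{score:.3f}  {doc.page_content[:80]}")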