@@ -0,0 +1,140 @@
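+"""Build OpenAI embeddings for question/answer records and persist them together with a FAISS index for similarity search."""
+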
+import os
+from dotenv import load_dotenv
+from langchain_core.documents import Document
+from langchain_openai import OpenAIEmbeddings
+from json import loads
+from sqlalchemy import create_engine
+import pandas as pd
+import numpy as np
+import faiss
+import pickle
+
+# Resolve paths relative to this script and load environment variables from environment.env
+current_dir = os.path.dirname(os.path.abspath(__file__))
+parent_dir = os.path.dirname(current_dir)
+env_path = os.path.join(parent_dir, 'environment.env')
+load_dotenv(env_path)
+
+URI = os.getenv("SUPABASE_URI")
+openai_api_key = os.getenv("OPENAI_API_KEY")
+
+EMBEDDINGS_FILE = 'qa_embeddings.pkl'
+FAISS_INDEX_FILE = 'qa_faiss_index.bin'
+CSV_FILE = 'log_record_rows.csv'
+
+def gen_doc_from_database():
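+    """Load question/answer rows from the log_record database table and wrap each pair in a Document."""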
+    engine = create_engine(URI, echo=True)
+    df = pd.read_sql_table("log_record", engine.connect())
+    result = df[['question', 'answer']].to_json(orient='index', force_ascii=False)
+    result = loads(result)
+    df = pd.DataFrame(result).T
+    df.drop_duplicates(subset=['question', 'answer'], keep='first', inplace=True)
+    print(f"Number of records after removing duplicates: {len(df)}")
+
+    qa_doc = []
+    for i in range(len(df)):
+        question = df.iloc[i]['question']
+        answer = df.iloc[i]['answer']
+        context = f'question: {question}\nanswer: {answer}'
+        doc = Document(page_content=context)
+        qa_doc.append(doc)
+    return qa_doc, df
+
+def gen_doc_from_csv(csv_filename=CSV_FILE):
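+    """Load question/answer rows from a local CSV export and wrap each pair in a Document."""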
+    csv_path = os.path.join(current_dir, csv_filename)
+    df = pd.read_csv(csv_path)
+    df.drop_duplicates(subset=['question', 'answer'], keep='first', inplace=True)
+    print(f"Number of records after removing duplicates: {len(df)}")
+
+    qa_doc = []
+    for _, row in df.iterrows():
+        question = row['question']
+        answer = row['answer']
+        context = f'question: {question}\nanswer: {answer}'
+        doc = Document(page_content=context)
+        qa_doc.append(doc)
+    return qa_doc, df
+
+def create_embeddings(docs):
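+    """Embed each document's text with OpenAI embeddings and return the list of vectors."""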
+    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
+    print("Creating embeddings...")
+    doc_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])
+    print(f"Created {len(doc_embeddings)} embeddings.")
+    print(f"Each embedding is a vector of length {len(doc_embeddings[0])}.")
+    return doc_embeddings
+
+def create_faiss_index(embeddings):
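+    """Build a flat L2 FAISS index over the given embedding vectors."""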
+    dimension = len(embeddings[0])
+    index = faiss.IndexFlatL2(dimension)
+    index.add(np.array(embeddings).astype('float32'))
+    return index
+
+def save_embeddings(embeddings, docs, df):
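+    """Pickle the embeddings, documents and dataframe, then write the FAISS index next to this script."""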
+    with open(os.path.join(current_dir, EMBEDDINGS_FILE), 'wb') as f:
+        pickle.dump({'embeddings': embeddings, 'docs': docs, 'df': df}, f)
+
+    index = create_faiss_index(embeddings)
+    faiss.write_index(index, os.path.join(current_dir, FAISS_INDEX_FILE))
+
+    print(f"Saved embeddings to {EMBEDDINGS_FILE} and FAISS index to {FAISS_INDEX_FILE}")
+
+def load_embeddings():
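+    """Load the pickled embeddings, documents and dataframe plus the FAISS index, or raise if they have not been built yet."""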
+    embeddings_path = os.path.join(current_dir, EMBEDDINGS_FILE)
+    faiss_path = os.path.join(current_dir, FAISS_INDEX_FILE)
+
+    if os.path.exists(embeddings_path) and os.path.exists(faiss_path):
+        with open(embeddings_path, 'rb') as f:
+            data = pickle.load(f)
+
+        index = faiss.read_index(faiss_path)
+
+        print("Loaded existing embeddings and FAISS index from files")
+        return data['embeddings'], data['docs'], data['df'], index
+    else:
+        raise FileNotFoundError("Embeddings or FAISS index file not found. Please run embeddings.py first.")
+
+def similarity_search(query, index, docs, k=3, threshold=0.83, method='logistic', sigma=1.0):
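+    """Embed the query, search the FAISS index, and return up to k unique (Document, similarity) pairs whose converted L2 distance meets the threshold."""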
+    embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
+    query_vector = embeddings.embed_query(query)
+
+    # Fetch more than k results to improve the chance of finding enough unique documents
+    D, I = index.search(np.array([query_vector]).astype('float32'), k * 2)
+
+    results = []
+    seen_docs = set()  # Tracks documents that have already been seen
+
+    for dist, idx in zip(D[0], I[0]):
+        if method == 'logistic':
+            similarity = 1 / (1 + dist)
+        elif method == 'exponential':
+            similarity = np.exp(-dist)
+        elif method == 'gaussian':
+            similarity = np.exp(-dist**2 / (2 * sigma**2))
+        else:
+            raise ValueError("Unknown similarity method")
+
+        if similarity >= threshold:
+            doc_content = docs[idx].page_content
+            if doc_content not in seen_docs:  # Skip documents that have already been added
+                results.append((docs[idx], similarity))
+                seen_docs.add(doc_content)
+
+            if len(results) == k:  # Stop searching once k unique documents have been found
+                break
+
+    return results
+
+def main():
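+    """Generate documents from the CSV (or the database), embed them, and persist the embeddings and index."""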
+    # Choose which method to use for generating documents
+    use_csv = True  # Set this to False if you want to use the database instead
+
+    if use_csv:
+        docs, df = gen_doc_from_csv()
+    else:
+        docs, df = gen_doc_from_database()
+
+    embeddings = create_embeddings(docs)
+    save_embeddings(embeddings, docs, df)
+
+if __name__ == "__main__":
+    main()