import os
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from json import loads
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import faiss
import pickle

# Get the current script's directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
env_path = os.path.join(parent_dir, 'environment.env')
load_dotenv(env_path)

URI = os.getenv("SUPABASE_URI")
openai_api_key = os.getenv("OPENAI_API_KEY")
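
# The environment.env file one directory above this script is expected to define
# the two variables read above. A rough sketch (the values shown are placeholders,
# not real credentials; the exact URI format depends on the deployment):
#
#   SUPABASE_URI=postgresql://user:password@host:5432/dbname
#   OPENAI_API_KEY=sk-...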

EMBEDDINGS_FILE = 'qa_embeddings.pkl'
FAISS_INDEX_FILE = 'qa_faiss_index.bin'
CSV_FILE = 'log_record_rows.csv'
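
# gen_doc_from_database() assumes the database reachable via SUPABASE_URI has a
# "log_record" table with at least "question" and "answer" text columns. A minimal
# illustrative schema (not taken from the actual migration):
#
#   CREATE TABLE log_record (
#       id       SERIAL PRIMARY KEY,
#       question TEXT,
#       answer   TEXT
#   );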
def gen_doc_from_database():
    engine = create_engine(URI, echo=True)
    df = pd.read_sql_table("log_record", engine.connect())
    # Round-trip through index-keyed JSON, then rebuild the DataFrame
    result = df[['question', 'answer']].to_json(orient='index', force_ascii=False)
    result = loads(result)
    df = pd.DataFrame(result).T
    df.drop_duplicates(subset=['question', 'answer'], keep='first', inplace=True)
    print(f"Number of records after removing duplicates: {len(df)}")

    qa_doc = []
    for i in range(len(df)):
        Question = df.iloc[i]['question']
        Answer = df.iloc[i]['answer']
        context = f'question: {Question}\nanswer: {Answer}'
        doc = Document(page_content=context)
        qa_doc.append(doc)
    return qa_doc, df
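
# gen_doc_from_csv() expects log_record_rows.csv to sit next to this script and to
# contain at least "question" and "answer" columns, e.g. (illustrative rows only):
#
#   question,answer
#   "How do I reset my password?","Use the reset link on the login page."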
def gen_doc_from_csv(csv_filename=CSV_FILE):
    csv_path = os.path.join(current_dir, csv_filename)
    df = pd.read_csv(csv_path)
    df.drop_duplicates(subset=['question', 'answer'], keep='first', inplace=True)
    print(f"Number of records after removing duplicates: {len(df)}")

    qa_doc = []
    for _, row in df.iterrows():
        Question = row['question']
        Answer = row['answer']
        context = f'question: {Question}\nanswer: {Answer}'
        doc = Document(page_content=context)
        qa_doc.append(doc)
    return qa_doc, df

def create_embeddings(docs):
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    print("Creating embeddings...")
    doc_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])
    print(f"Created {len(doc_embeddings)} embeddings.")
    print(f"Each embedding is a vector of length {len(doc_embeddings[0])}.")
    return doc_embeddings

def create_faiss_index(embeddings):
    dimension = len(embeddings[0])
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings).astype('float32'))
    return index
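
# Note: IndexFlatL2 performs exact (brute-force) nearest-neighbour search, and the
# distances returned by index.search() are squared L2 distances. similarity_search()
# below converts those distances into a similarity score in (0, 1] before thresholding.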

def save_embeddings(embeddings, docs, df):
    with open(os.path.join(current_dir, EMBEDDINGS_FILE), 'wb') as f:
        pickle.dump({'embeddings': embeddings, 'docs': docs, 'df': df}, f)

    index = create_faiss_index(embeddings)
    faiss.write_index(index, os.path.join(current_dir, FAISS_INDEX_FILE))

    print(f"Saved embeddings to {EMBEDDINGS_FILE} and FAISS index to {FAISS_INDEX_FILE}")

def load_embeddings():
    embeddings_path = os.path.join(current_dir, EMBEDDINGS_FILE)
    faiss_path = os.path.join(current_dir, FAISS_INDEX_FILE)

    if os.path.exists(embeddings_path) and os.path.exists(faiss_path):
        with open(embeddings_path, 'rb') as f:
            data = pickle.load(f)

        index = faiss.read_index(faiss_path)

        print("Loaded existing embeddings and FAISS index from files")
        return data['embeddings'], data['docs'], data['df'], index
    else:
        raise FileNotFoundError("Embeddings or FAISS index file not found. Please run embeddings.py first.")

def similarity_search(query, index, docs, k=3, threshold=0.83, method='logistic', sigma=1.0):
    embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
    query_vector = embeddings.embed_query(query)

    # Search for extra neighbours to improve the chance of finding enough unique documents
    D, I = index.search(np.array([query_vector]).astype('float32'), k * 2)

    results = []
    seen_docs = set()  # Tracks documents that have already been added

    for dist, idx in zip(D[0], I[0]):
        # Convert the FAISS distance into a similarity score in (0, 1]
        if method == 'logistic':
            similarity = 1 / (1 + dist)
        elif method == 'exponential':
            similarity = np.exp(-dist)
        elif method == 'gaussian':
            similarity = np.exp(-dist**2 / (2 * sigma**2))
        else:
            raise ValueError("Unknown similarity method")

        if similarity >= threshold:
            doc_content = docs[idx].page_content
            if doc_content not in seen_docs:  # Skip documents that were already added
                results.append((docs[idx], similarity))
                seen_docs.add(doc_content)

                if len(results) == k:  # Stop once k unique documents have been found
                    break

    return results
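
# Usage sketch (not wired into main()); the function name and example query below
# are illustrative only. It loads the previously saved index and queries it.
def example_search(query="How do I reset my password?"):
    _, docs, _, index = load_embeddings()
    for doc, score in similarity_search(query, index, docs, k=3):
        print(f"{score:.3f}  {doc.page_content[:80]}")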

def main():
    # Choose which method to use for generating documents
    use_csv = True  # Set this to False if you want to use the database instead

    if use_csv:
        docs, df = gen_doc_from_csv()
    else:
        docs, df = gen_doc_from_database()

    embeddings = create_embeddings(docs)
    save_embeddings(embeddings, docs, df)

if __name__ == "__main__":
    main()