import os
import pickle
from json import loads

import faiss
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from sqlalchemy import create_engine

# Get the current script's directory
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
env_path = os.path.join(parent_dir, 'environment.env')
load_dotenv(env_path)

URI = os.getenv("SUPABASE_URI")
openai_api_key = os.getenv("OPENAI_API_KEY")

EMBEDDINGS_FILE = 'qa_embeddings.pkl'
FAISS_INDEX_FILE = 'qa_faiss_index.bin'
CSV_FILE = 'log_record_rows.csv'

def gen_doc_from_database():
    engine = create_engine(URI, echo=True)
    df = pd.read_sql_table("log_record", engine.connect())
    # Keep only the question/answer columns (round-trips through JSON, then rebuilds the frame)
    result = df[['question', 'answer']].to_json(orient='index', force_ascii=False)
    result = loads(result)
    df = pd.DataFrame(result).T
    df.drop_duplicates(subset=['question', 'answer'], keep='first', inplace=True)
    print(f"Number of records after removing duplicates: {len(df)}")

    qa_doc = []
    for i in range(len(df)):
        Question = df.iloc[i]['question']
        Answer = df.iloc[i]['answer']
        context = f'question: {Question}\nanswer: {Answer}'
        doc = Document(page_content=context)
        qa_doc.append(doc)
    return qa_doc, df

def gen_doc_from_csv(csv_filename=CSV_FILE):
    csv_path = os.path.join(current_dir, csv_filename)
    df = pd.read_csv(csv_path)
    df.drop_duplicates(subset=['question', 'answer'], keep='first', inplace=True)
    print(f"Number of records after removing duplicates: {len(df)}")

    qa_doc = []
    for _, row in df.iterrows():
        Question = row['question']
        Answer = row['answer']
        context = f'question: {Question}\nanswer: {Answer}'
        doc = Document(page_content=context)
        qa_doc.append(doc)
    return qa_doc, df

def create_embeddings(docs):
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    print("Creating embeddings...")
    doc_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])
    print(f"Created {len(doc_embeddings)} embeddings.")
    print(f"Each embedding is a vector of length {len(doc_embeddings[0])}.")
    return doc_embeddings

def create_faiss_index(embeddings):
    dimension = len(embeddings[0])
    # IndexFlatL2 performs exact (brute-force) search and returns squared L2 distances
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings).astype('float32'))
    return index

def save_embeddings(embeddings, docs, df):
    with open(os.path.join(current_dir, EMBEDDINGS_FILE), 'wb') as f:
        pickle.dump({'embeddings': embeddings, 'docs': docs, 'df': df}, f)

    index = create_faiss_index(embeddings)
    faiss.write_index(index, os.path.join(current_dir, FAISS_INDEX_FILE))

    print(f"Saved embeddings to {EMBEDDINGS_FILE} and FAISS index to {FAISS_INDEX_FILE}")

def load_embeddings():
    embeddings_path = os.path.join(current_dir, EMBEDDINGS_FILE)
    faiss_path = os.path.join(current_dir, FAISS_INDEX_FILE)

    if os.path.exists(embeddings_path) and os.path.exists(faiss_path):
        with open(embeddings_path, 'rb') as f:
            data = pickle.load(f)

        index = faiss.read_index(faiss_path)

        print("Loaded existing embeddings and FAISS index from files")
        return data['embeddings'], data['docs'], data['df'], index
    else:
        raise FileNotFoundError("Embeddings or FAISS index file not found. Please run embeddings.py first.")

def similarity_search(query, index, docs, k=3, threshold=0.8, method='logistic', sigma=1.0):
    embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
    query_vector = embeddings.embed_query(query)

    # Search for more than k neighbours to improve the chance of finding enough unique documents
    D, I = index.search(np.array([query_vector]).astype('float32'), k * 2)

    results = []
    seen_docs = set()  # Tracks documents that have already been returned

    for dist, idx in zip(D[0], I[0]):
        # Convert the (squared) L2 distance returned by FAISS into a similarity score
        if method == 'logistic':
            similarity = 1 / (1 + dist)
        elif method == 'exponential':
            similarity = np.exp(-dist)
        elif method == 'gaussian':
            similarity = np.exp(-dist**2 / (2 * sigma**2))
        else:
            raise ValueError("Unknown similarity method")

        if similarity >= threshold:
            doc_content = docs[idx].page_content
            if doc_content not in seen_docs:  # Skip documents that were already added
                results.append((docs[idx], similarity))
                seen_docs.add(doc_content)

        if len(results) == k:  # Stop once k unique documents have been found
            break

    return results
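
# A minimal retrieval sketch (not part of the build pipeline below): it assumes this
# script has already been run once so the pickle and FAISS files exist, and simply
# wires load_embeddings() and similarity_search() together. The query string is a
# made-up example. Uncomment to try it from another script or a REPL.
#
#   _, docs, _, index = load_embeddings()
#   for doc, score in similarity_search("example question text", index, docs, k=3):
#       print(f"{score:.3f}\t{doc.page_content[:80]}")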

def main():
    # Choose which method to use for generating documents
    use_csv = True  # Set this to False if you want to use the database instead

    if use_csv:
        docs, df = gen_doc_from_csv()
    else:
        docs, df = gen_doc_from_database()

    embeddings = create_embeddings(docs)
    save_embeddings(embeddings, docs, df)


if __name__ == "__main__":
    main()