
remove unnecessary files

SherryLiu 7 months ago
commit a7beb3efe6
5 changed files with 0 additions and 1049 deletions
  1. Indexing_Split.py  + 0 - 177
  2. RAG_app_copy.py  + 0 - 162
  3. RAG_strategy.py  + 0 - 296
  4. add_vectordb.py  + 0 - 192
  5. faiss_index.py  + 0 - 222

Indexing_Split.py  + 0 - 177

@@ -1,177 +0,0 @@
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import OllamaEmbeddings
-from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import TextLoader
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_core.documents import Document
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.document_loaders import Docx2txtLoader
-from langchain_community.document_loaders import WebBaseLoader
-from PyPDF2 import PdfReader
-from langchain.docstore.document import Document
-from json import loads
-import pandas as pd
-from sqlalchemy import create_engine
-
-from langchain.prompts import ChatPromptTemplate
-from langchain_openai import ChatOpenAI
-from langchain_core.output_parsers import StrOutputParser
-from langchain import hub
-from tqdm import tqdm
-
-# __import__('pysqlite3')
-# import sys
-# sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
-
-from datasets import Dataset 
-from ragas import evaluate
-from ragas.metrics import (
-    answer_relevancy,
-    faithfulness,
-    context_recall,
-    context_precision,
-)
-import pandas as pd
-import os
-import glob
-import openai
-
-URI = os.getenv("SUPABASE_URI")
-openai_api_key = os.getenv("OPENAI_API_KEY")
-openai.api_key = openai_api_key
-
-from RAG_strategy import multi_query, naive_rag
-
-
-def create_retriever(path='Documents', extension="pdf"):
-    txt_files = glob.glob(os.path.join(path, f"*.{extension}"))
-    
-    doc = []
-    for file_path in txt_files:
-        doc.append(file_path)
-    
-    def load_and_split(file_list):
-        chunks = []
-        for file in file_list:
-            if file.endswith(".txt"):
-                loader = TextLoader(file, encoding='utf-8')
-            elif file.endswith(".pdf"):
-                loader = PyPDFLoader(file)
-            elif file.endswith(".docx"):
-                loader = Docx2txtLoader(file)
-            else:
-                raise ValueError(f"Unsupported file extension: {file}")
-            
-
-            docs = loader.load()
-
-            # Split
-            if file.endswith(".docx"):
-                # separators = ["\n\n\u25cb", "\n\n\u25cf"]
-                # text_splitter = RecursiveCharacterTextSplitter(separators=separators, chunk_size=500, chunk_overlap=0)
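-                # Split .docx files on Chinese statute article markers (○/● followed by 第…條) so each article becomes its own chunk.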
-                separators = [r'\u25cb\s*第.*?條', r'\u25cf\s*第.*?條']
-                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=300, chunk_overlap=0)
-            else:
-                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)
-            
-            splits = text_splitter.split_documents(docs)
-
-            chunks.extend(splits)
-
-        return chunks
-
-    # Index
-    docs = load_and_split(doc)
-    qa_history_doc = gen_doc_from_history()
-    docs.extend(qa_history_doc)
-    # web_doc = web_data(os.path.join(path, 'web_url.csv'))
-    # docs.extend(web_doc)
-
-    # vectorstore
-    # vectorstore = Chroma.from_texts(texts=docs, embedding=OpenAIEmbeddings())
-    # vectorstore = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings(openai_api_key=openai_api_key))
-    # vectorstore = Chroma.from_documents(documents=docs, embedding=OllamaEmbeddings(model="llama3", num_gpu=1))
-    vectorstore = Chroma.from_documents(documents=docs, embedding=OllamaEmbeddings(model="gemma2"))
-
-    vectorstore.persist()
-
-    retriever = vectorstore.as_retriever()
-
-    return retriever
-
-def web_data(url_file):
-    df = pd.read_csv(url_file, header = 0)
-    url_list = df['url'].to_list()
-
-    loader = WebBaseLoader(url_list)
-    docs = loader.load()
-
-    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                chunk_size=1000, chunk_overlap=0)
-    splits = text_splitter.split_documents(docs)
-    
-    return splits
-
-def gen_doc_from_history():
-    engine = create_engine(URI, echo=True)
-
-    df = pd.read_sql_table("systex_records", engine.connect())  
-    df.fillna('', inplace=True)
-    result = df.to_json(orient='index', force_ascii=False)
-    result = loads(result)
-
-
-    df = pd.DataFrame(result).T
-    qa_history_doc = []
-    for i in range(len(df)):
-        if df.iloc[i]['used_as_document'] is not True: continue
-        Question = df.iloc[i]['Question']
-        Answer = df.iloc[i]['Answer']
-        context = f'Question: {Question}\nAnswer: {Answer}'
-        
-        doc =  Document(page_content=context, metadata={"source": "History"})
-        qa_history_doc.append(doc)
-        # print(doc)
-
-    return qa_history_doc
-
-def gen_doc_from_database():
-    engine = create_engine(URI, echo=True)
-
-    df = pd.read_sql_table("QA_database", engine.connect())  
-    # df.fillna('', inplace=True)
-    result = df[['Question', 'Answer']].to_json(orient='index', force_ascii=False)
-    result = loads(result)
-
-
-    df = pd.DataFrame(result).T
-    qa_doc = []
-    for i in range(len(df)):
-        # if df.iloc[i]['used_as_document'] is not True: continue
-        Question = df.iloc[i]['Question']
-        Answer = df.iloc[i]['Answer']
-        context = f'Question: {Question}\nAnswer: {Answer}'
-        
-        doc = Document(page_content=context, metadata={"source": "History"})
-        qa_doc.append(doc)
-        # print(doc)
-
-    return qa_doc
-
-if __name__ == "__main__":
-
-    retriever = create_retriever(path='./Documents', extension="pdf")
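-    # Smoke-test questions (zh-TW, roughly): "To what extent can the CEV system support a GHG inventory?" and "Which standards does the CEV system follow?"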
-    question = 'CEV系統可以支援盤查到什麼程度'
-    final_answer, reference_docs = multi_query(question, retriever)
-    print(question, final_answer)
-    question = 'CEV系統依循標準為何'
-    final_answer, reference_docs = multi_query(question, retriever)
-    print(question, final_answer)
-
-
-
-

RAG_app_copy.py  + 0 - 162

@@ -1,162 +0,0 @@
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from fastapi import FastAPI, HTTPException, status, Body, Depends
-from fastapi.middleware.cors import CORSMiddleware
-from contextlib import asynccontextmanager
-from pydantic import BaseModel
-from typing import List, Optional
-import uvicorn
-
-from sqlalchemy import create_engine
-import pandas as pd
-import datetime
-import json
-from json import loads
-import time
-from langchain.callbacks import get_openai_callback
-
-from langchain_openai import OpenAIEmbeddings
-from RAG_strategy import multi_query, naive_rag, naive_rag_for_qapairs
-
-import os
-from supabase.client import Client, create_client
-from add_vectordb import GetVectorStore
-import openai
-
-# Get API log
-import logging
-logger = logging.getLogger("uvicorn.error")
-
-openai_api_key = os.getenv("OPENAI_API_KEY")
-URI = os.getenv("SUPABASE_URI")
-openai.api_key = openai_api_key
-
-global_retriever = None
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    global global_retriever
-    global vector_store
-    
-    start = time.time()
-
-    supabase_url = os.getenv("SUPABASE_URL")
-    supabase_key = os.getenv("SUPABASE_KEY")
-    document_table = "documents"
-    supabase: Client = create_client(supabase_url, supabase_key)
-
-    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
-    vector_store = GetVectorStore(embeddings, supabase, document_table)
-    global_retriever = vector_store.as_retriever(search_kwargs={"k": 4})
-
-    print(f"Initialization time: {time.time() - start}")
-    yield
-
-def get_retriever():
-    return global_retriever
-
-def get_vector_store():
-    return vector_store
-
-app = FastAPI(lifespan=lifespan)
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-@app.get("/answer2")
-def multi_query_answer(question, retriever=Depends(get_retriever)):
-    try:
-        start = time.time()
-
-        with get_openai_callback() as cb:
-            final_answer, reference_docs = multi_query(question, retriever, chat_history=[])
-
-        processing_time = time.time() - start
-        print(f"Processing time: {processing_time}")
-        save_history(question, final_answer, reference_docs, cb, processing_time)
-
-        return {"Answer": final_answer}
-    except Exception as e:
-        logger.error(f"Error in /answer2 endpoint: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-class ChatHistoryItem(BaseModel):
-    q: str
-    a: str
-
-@app.post("/answer_with_history")
-def multi_query_answer_with_history(question: Optional[str] = '', chat_history: List[ChatHistoryItem] = Body(...), retriever=Depends(get_retriever)):
-    start = time.time()
-    
-    chat_history = [(item.q, item.a) for item in chat_history if item.a != ""]
-    print(f"Chat history: {chat_history}")
-    
-    with get_openai_callback() as cb:
-        final_answer, reference_docs = multi_query(question, retriever, chat_history)
-    processing_time = time.time() - start
-    print(f"Processing time: {processing_time}")
-    save_history(question, final_answer, reference_docs, cb, processing_time)
-
-    return {"Answer": final_answer}
-
-@app.post("/answer_with_history2")
-def multi_query_answer_with_history2(question: Optional[str] = '', extension: Optional[str] = 'pdf', chat_history: List[ChatHistoryItem] = Body(...), retriever=Depends(get_retriever)):
-    start = time.time()
-
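-    # Replace the injected retriever with one that filters results by file-extension metadata.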
-    retriever = vector_store.as_retriever(search_kwargs={"k": 4, 'filter': {'extension':extension}})
-    
-    chat_history = [(item.q, item.a) for item in chat_history if item.a != ""]
-    print(f"Chat history: {chat_history}")
-    
-    with get_openai_callback() as cb:
-        final_answer, reference_docs = multi_query(question, retriever, chat_history)
-    processing_time = time.time() - start
-    print(f"Processing time: {processing_time}")
-    save_history(question, final_answer, reference_docs, cb, processing_time)
-
-    return {"Answer": final_answer}
-
-def save_history(question, answer, reference, cb, processing_time):
-    record = {
-        'Question': [question],
-        'Answer': [answer],
-        'Total_Tokens': [cb.total_tokens],
-        'Total_Cost': [cb.total_cost],
-        'Processing_time': [processing_time],
-        'Contexts': [str(reference)]
-    }
-    df = pd.DataFrame(record)
-    engine = create_engine(URI)
-    df.to_sql(name='systex_records', con=engine, index=False, if_exists='append')
-
-class history_output(BaseModel):
-    Question: str
-    Answer: str
-    Contexts: str
-    Total_Tokens: int
-    Total_Cost: float
-    Processing_time: float
-    Time: datetime.datetime
-
-@app.get('/history', response_model=List[history_output])
-async def get_history():
-    engine = create_engine(URI, echo=True)
-
-    df = pd.read_sql_table("systex_records", engine.connect())  
-    df.fillna('', inplace=True)
-    result = df.to_json(orient='index', force_ascii=False)
-    result = loads(result)
-    return list(result.values())
-
-@app.get("/")
-def read_root():
-    return {"message": "Welcome to the Carbon Chatbot API"}
-
-if __name__ == "__main__":
-    uvicorn.run("RAG_app_copy:app", host='127.0.0.1', port=8081, reload=True)

RAG_strategy.py  + 0 - 296

@@ -1,296 +0,0 @@
-from langchain.prompts import ChatPromptTemplate
-from langchain.load import dumps, loads
-from langchain_core.output_parsers import StrOutputParser
-from langchain_openai import ChatOpenAI
-from langchain_community.llms import Ollama
-from langchain_community.chat_models import ChatOllama
-from operator import itemgetter
-from langchain_core.runnables import RunnablePassthrough
-from langchain import hub
-from langchain.globals import set_llm_cache
-from langchain import PromptTemplate
-import subprocess
-import json
-from typing import Any, List, Optional, Dict
-from langchain_core.callbacks import CallbackManagerForLLMRun
-from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import BaseMessage, AIMessage, HumanMessage, SystemMessage
-from langchain_core.outputs import ChatResult, ChatGeneration
-from pydantic import Field
-
-from langchain_core.runnables import (
-    RunnableBranch,
-    RunnableLambda,
-    RunnableParallel,
-    RunnablePassthrough,
-)
-
-from datasets import Dataset 
-from ragas import evaluate
-from ragas.metrics import (
-    answer_relevancy,
-    faithfulness,
-    context_recall,
-    context_precision,
-)
-import os
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from langchain.cache import SQLiteCache
-from langchain_openai import OpenAIEmbeddings
-from langchain.globals import set_llm_cache
-
-import requests
-import openai
-openai_api_key = os.getenv("OPENAI_API_KEY")
-openai.api_key = openai_api_key
-URI = os.getenv("SUPABASE_URI")
-
-# Cache LLM responses in a local SQLite database to avoid repeated API requests.
-set_llm_cache(SQLiteCache(database_path=".langchain.db"))
-
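-# System prompt (zh-TW, roughly): "You are an AI assistant from Taiwan named TAIDE; you help users from a Taiwanese perspective and answer in Traditional Chinese."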
-system_prompt: str = "你是一個來自台灣的AI助理,你的名字是 TAIDE,樂於以台灣人的立場幫助使用者,會用繁體中文回答問題。"
-
-class OllamaChatModel(BaseChatModel):
-    model_name: str = Field(default="taide-local")
-
-    def _generate(
-            self,
-            messages: List[BaseMessage],
-            stop: Optional[List[str]] = None,
-            run_manager: Optional[CallbackManagerForLLMRun] = None,
-            **kwargs: Any,
-    ) -> ChatResult:
-        formatted_messages = []
-        for msg in messages:
-            if isinstance(msg, HumanMessage):
-                formatted_messages.append({"role": "user", "content": msg.content})
-            elif isinstance(msg, AIMessage):
-                formatted_messages.append({"role": "assistant", "content": msg.content})
-            elif isinstance(msg, SystemMessage):
-                 formatted_messages.append({"role": "system", "content": msg.content})
-
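-        # Build a Llama-2 style chat prompt: the system prompt goes inside <<SYS>> tags and each user turn is wrapped in [INST] ... [/INST].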
-        prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
-        for msg in formatted_messages:
-            if msg['role'] == 'user':
-                prompt += f"{msg['content']} [/INST]"
-            elif msg['role'] == "assistant":
-                prompt += f"{msg['content']} </s><s>[INST]"
-
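-        # Shell out to the local Ollama CLI (`ollama run`) rather than an HTTP client; stdout is treated as the model's reply.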
-        command = ["ollama", "run", self.model_name, prompt]
-        result = subprocess.run(command, capture_output=True, text=True)
-
-        if result.returncode != 0:
-            raise Exception(f"Ollama command failed: {result.stderr}")
-        
-        content = result.stdout.strip()
-
-        message = AIMessage(content=content)
-        generation = ChatGeneration(message=message)
-        return ChatResult(generations=[generation])
-    
-    @property
-    def _llm_type(self) -> str:
-        return "ollama-chat-model"
-    
-taide_llm = OllamaChatModel(model_name="taide-local")
-
-def multi_query(question, retriever, chat_history):
-    def multi_query_chain():
-        template = """You are an AI language model assistant. Your task is to generate three 
-        different versions of the given user question to retrieve relevant documents from a vector 
-        database. By generating multiple perspectives on the user question, your goal is to help
-        the user overcome some of the limitations of the distance-based similarity search. 
-        Provide these alternative questions separated by newlines. 
-
-        You must also return the original question, i.e. 1 original version + 3 alternative versions = 4 questions in total.
-        
-        Original question: {question}"""
-        prompt_perspectives = ChatPromptTemplate.from_template(template)
-
-        generate_queries = (
-            prompt_perspectives 
-            | taide_llm
-            | StrOutputParser() 
-            | (lambda x: x.split("\n"))
-        )
-
-        return generate_queries
-
-    def get_unique_union(documents: List[list]):
-        flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
-        unique_docs = list(set(flattened_docs))
-        return [loads(doc) for doc in unique_docs]
-
-    _search_query = get_search_query()
-    modified_question = _search_query.invoke({"question":question, "chat_history": chat_history})
-    print(modified_question)
-
-    generate_queries = multi_query_chain()
-
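-    # Fan out: generate query variants, retrieve documents for each variant, then deduplicate the union.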
-    retrieval_chain = generate_queries | retriever.map() | get_unique_union
-    docs = retrieval_chain.invoke({"question":modified_question})
-
-    answer = multi_query_rag_prompt(retrieval_chain, modified_question)
-
-    return answer, docs
-
-def multi_query_rag_prompt(retrieval_chain, question):
-    template = """Answer the following question based on this context:
-
-    {context}
-
-    Question: {question}
-    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw. If the question is in English, then the output will be in English.
-    You should not mention anything about "根據提供的文件內容" or other similar terms.
-    If you don't know the answer, just say that "很抱歉,目前我無法回答您的問題,請將您的詢問發送至 test@email.com 以便獲得更進一步的幫助,謝謝。I'm sorry I cannot answer your question. Please send your question to test@email.com for further assistance. Thank you."
-    """
-
-    prompt = ChatPromptTemplate.from_template(template)
-    context = retrieval_chain.invoke({"question": question})
-    print(f"Retrieved context: {context[:200]}...")  # Print first 200 chars of context
-
-    final_rag_chain = (
-        {"context": retrieval_chain, 
-        "question": itemgetter("question")} 
-        | prompt
-        | taide_llm
-        | StrOutputParser()
-    )
-
-    print(f"Sending question to model: {question}")
-    try:
-        answer = final_rag_chain.invoke({"question": question})
-        print(f"Received answer: {answer}")
-        return answer
-    except Exception as e:
-        print(f"Error invoking rag_chain: {e}")
-        return "Error occurred while processing the question."
-
-def get_search_query():
-    _template = """Rewrite the following query by incorporating relevant context from the conversation history.
-    The rewritten query should:
-    
-    - Preserve the core intent and meaning of the original query
-    - Expand and clarify the query to make it more specific and informative for retrieving relevant context
-    - Avoid introducing new topics or queries that deviate from the original query
-    - DONT EVER ANSWER the Original query, but instead focus on rephrasing and expanding it into a new query
-    - The rewritten query should be in its original language.
-    
-    Return ONLY the rewritten query text, without any additional formatting or explanations.
-    
-    Conversation History:
-    {chat_history}
-    
-    Original query: [{question}]
-    
-    Rewritten query: 
-    """
-    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
-
-    def _format_chat_history(chat_history: List[tuple[str, str]]) -> List:
-        buffer = []
-        for human, ai in chat_history:
-            buffer.append(HumanMessage(content=human))
-            buffer.append(AIMessage(content=ai))
-        return buffer
-
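-    # If chat history is present, condense history + question into a standalone query via ChatOpenAI; otherwise pass the question through unchanged.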
-    _search_query = RunnableBranch(
-        (
-            RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
-                run_name="HasChatHistoryCheck"
-            ),
-            RunnablePassthrough.assign(
-                chat_history=lambda x: _format_chat_history(x["chat_history"])
-            )
-            | CONDENSE_QUESTION_PROMPT
-            | ChatOpenAI()
-            | StrOutputParser(),
-        ),
-        RunnableLambda(lambda x : x["question"]),
-    )
-
-    return _search_query
-
-def naive_rag(question, retriever):
-    prompt = hub.pull("rlm/rag-prompt")
-
-    def format_docs(docs):
-        return "\n\n".join(doc.page_content for doc in docs)
-
-    reference = retriever.get_relevant_documents(question)
-    
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | taide_llm
-        | StrOutputParser()
-    )
-
-    answer = rag_chain.invoke(question)
-
-    return answer, reference
-
-def naive_rag_for_qapairs(question, retriever):
-    template = """You are an assistant for question-answering tasks. 
-    Use the following pieces of retrieved context to answer the question. 
-    The retrieved context below consists of historical question-answer pairs; find the most suitable answer among them.
-    If you cannot find a suitable answer, just return "False". 
-    Use three sentences maximum and do not make up an answer.
-
-    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw.
-
-    {context}
-
-    Question: {question}
-    """
-    prompt = PromptTemplate.from_template(template)
-
-    llm = ChatOpenAI(model_name="gpt-4-0125-preview")
-
-    def format_docs(docs):
-        return "\n\n".join(doc.page_content for doc in docs)
-
-    reference = retriever.get_relevant_documents(question)
-    
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | llm
-        | StrOutputParser()
-    )
-
-    answer = rag_chain.invoke(question)
-
-    return answer, reference
-
-def rag_score(question, ground_truth, answer, reference_docs):
-    datasets = {
-              "question": [question],
-              "answer": [answer],
-              "contexts": [reference_docs],
-              "ground_truths": [[ground_truth]]
-            }
-    evalsets = Dataset.from_dict(datasets)
-
-    result = evaluate(
-        evalsets,
-        metrics=[
-            context_precision,
-            faithfulness,
-            answer_relevancy,
-            context_recall,
-        ],
-    )
-
-    result_df = result.to_pandas()
-    print(result_df.head())
-    result_df.to_csv('ragas_rag.csv')
-    return result
-
-def print_current_model(llm):
-    if isinstance(llm, OllamaChatModel):
-        print(f"Currently using model: {llm.model_name}")
-    else:
-        pass

add_vectordb.py  + 0 - 192

@@ -1,192 +0,0 @@
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import TextLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.document_loaders import Docx2txtLoader
-
-import os
-import glob
-import openai
-
-from langchain_community.vectorstores import SupabaseVectorStore
-from langchain_openai import OpenAIEmbeddings
-from supabase.client import Client, create_client
-
-
-def get_data_list(data_list=None, path=None, extension=None, update=False):
-    files = data_list or glob.glob(os.path.join(path, f"*.{extension}"))
-    if update:    
-        doc = files.copy()
-    else:
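-        # Note: check_existed_data uses the module-level `supabase` client, which is only created in the __main__ block below.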
-        existed_data = check_existed_data(supabase)
-        doc = []
-        for file_path in files:
-            filename = os.path.basename(file_path)
-            if filename not in existed_data:
-                doc.append(file_path)
-
-    return doc
-
-
-def read_and_split_files(data_list=None, path=None, extension=None, update=False):
-
-    def load_and_split(file_list):
-        chunks = []
-        for file in file_list:
-            if file.endswith(".txt"):
-                loader = TextLoader(file, encoding='utf-8')
-            elif file.endswith(".pdf"):
-                loader = PyPDFLoader(file)
-            elif file.endswith(".docx"):
-                loader = Docx2txtLoader(file)
-            else:
-                print(f"Unsupported file extension: {file}")
-                continue
-
-            docs = loader.load()
-
-            # Split
-            if file.endswith(".docx"):
-                separators = [r'\u25cb\s*第.*?條', r'\u25cf\s*第.*?條']
-                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=300, chunk_overlap=0)
-            else:
-                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)
-            splits = text_splitter.split_documents(docs)
-
-            chunks.extend(splits)
-
-        return chunks
-
-
-    doc = get_data_list(data_list=data_list, path=path, extension=extension, update=update)
-    # Index
-    docs = load_and_split(doc)
-
-    return docs
-
-def create_ids(docs):
-    # Create a dictionary to count occurrences of each page in each document
-    page_counter = {}
-
-    # List to store the resulting IDs
-    document_ids = []
-
-    # Generate IDs
-    for doc in [docs[i].metadata for i in range(len(docs))]:
-        source = doc['source']
-        file_name = os.path.basename(source).split('.')[0]
-
-        if "page" in doc.keys():
-            page = doc['page']
-            key = f"{source}_{page}"
-        else:
-            key = f"{source}"
-
-        if key not in page_counter:
-            page_counter[key] = 1
-        else:
-            page_counter[key] += 1
-        
-        if "page" in doc.keys():
-            doc_id = f"{file_name} | page {page} | chunk {page_counter[key]}"
-        else:
-            doc_id = f"{file_name} | chunk {page_counter[key]}"
-
-        
-        document_ids.append(doc_id)
-
-    return document_ids
-
-def get_document(data_list=None, path=None, extension=None, update=False):
-    docs = read_and_split_files(data_list=data_list, path=path, extension=extension, update=update)
-    document_ids = create_ids(docs)
-
-    for doc in docs:
-        doc.metadata['source'] = os.path.basename(doc.metadata['source'])
-        # print(doc.metadata)
-
-    # document_metadatas = [{'source': doc.metadata['source'], 'page': doc.metadata['page'], 'chunk': int(id.split("chunk ")[-1])} for doc, id in zip(docs, document_ids)]
-    document_metadatas = []
-
-    for doc, id in zip(docs, document_ids):
-        chunk_number = int(id.split("chunk ")[-1])
-        doc.metadata['chunk'] = chunk_number
-        doc.metadata['extension'] = os.path.basename(doc.metadata['source']).split(".")[-1]
-        document_metadatas.append(doc.metadata)
-
-    documents = [doc.metadata['source'].split(".")[0] + doc.page_content for doc in docs]
-
-    return document_ids, documents, document_metadatas
-
-def check_existed_data(supabase):
-    response = supabase.table('documents').select("id, metadata").execute()
-    existed_data = list(set([data['metadata']['source'] for data in response.data]))
-    # existed_data = [(data['id'], data['metadata']['source']) for data in response.data]
-    return existed_data
-
-class GetVectorStore(SupabaseVectorStore):
-    def __init__(self, embeddings, supabase, table_name):
-        super().__init__(embedding=embeddings, client=supabase, table_name=table_name, query_name="match_documents")
-
-    def insert(self, documents, document_metadatas):
-        self.add_texts(
-            texts=documents,
-            metadatas=document_metadatas,
-        )
-
-    def delete(self, file_list):
-        for file_name in file_list:
-            self._client.table(self.table_name).delete().eq('metadata->>source', file_name).execute()
-
-    def update(self, documents, document_metadatas, update_existing_data=False):
-        if not document_metadatas:  # no new data
-            return
-
-        if update_existing_data:
-            file_list = list(set(metadata['source'] for metadata in document_metadatas))
-            self.delete(file_list)
-
-        self.insert(documents, document_metadatas)
-
-if __name__ == "__main__":
-
-    load_dotenv()
-    supabase_url = os.environ.get("SUPABASE_URL")
-    supabase_key = os.environ.get("SUPABASE_KEY")
-    openai_api_key = os.getenv("OPENAI_API_KEY")
-    openai.api_key = openai_api_key
-    document_table = "documents"
-    supabase: Client = create_client(supabase_url, supabase_key)
-
-    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
-
-    # get vector store
-    vector_store = GetVectorStore(embeddings, supabase, document_table)
-
-    # update data (old + new / all new / all old)
-    path = "/Documents"
-    extension = "pdf"
-    # file = None
-
-    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
-    # file = [os.path.join(path, file) for file in file_list]
-    file_list = glob.glob(os.path.join(path, "*"))
-    print(file_list)
-    
-    update = True
-    document_ids, documents, document_metadatas = get_document(data_list=file_list, path=path, extension=extension, update=update)
-    vector_store.update(documents, document_metadatas, update_existing_data=update)
-
-    # insert new data (all new)
-    # vector_store.insert(documents, document_metadatas)
-
-    # delete data
-    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
-    # vector_store.delete(file_list)
-
-    # get retriver
-    # retriever = vector_store.as_retriever(search_kwargs={"k": 6})

faiss_index.py  + 0 - 222

@@ -1,222 +0,0 @@
-import faiss
-import numpy as np
-import json
-from tqdm import tqdm
-from time import time
-from RAG_strategy import multi_query
-from ragas import evaluate
-from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
-from datasets import Dataset
-from typing import List, Callable
-
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-import os
-import pickle
-from supabase.client import Client, create_client
-from langchain_openai import OpenAIEmbeddings
-from add_vectordb import GetVectorStore
-from sqlalchemy import create_engine
-import pandas as pd
-from langchain_core.documents import Document
-
-# Load environment variables
-supabase_url = os.getenv("SUPABASE_URL")
-supabase_key = os.getenv("SUPABASE_KEY")
-openai_api_key = os.getenv("OPENAI_API_KEY")
-document_table = "documents"
-URI = os.getenv("SUPABASE_URI")
-
-# Initialize Supabase client
-supabase: Client = create_client(supabase_url, supabase_key)
-
-# Initialize embeddings
-embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
-
-# Initialize original vector store
-original_vector_store = GetVectorStore(embeddings, supabase, document_table)
-original_retriever = original_vector_store.as_retriever(search_kwargs={"k": 4})
-
-
-
-def download_embeddings():
-    response = supabase.table(document_table).select("id, embedding, metadata, content").execute()
-    embeddings = []
-    ids = []
-    metadatas = []
-    contents = []
-    for item in response.data:
-        # Parse the embedding string into a list of floats
-        embedding = json.loads(item['embedding'])
-        embeddings.append(embedding)
-        ids.append(item['id'])
-        metadatas.append(item['metadata'])
-        contents.append(item['content'])
-    return np.array(embeddings, dtype=np.float32), ids, metadatas, contents
-
-def create_faiss_index(embeddings):
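-    # IndexFlatL2 does exact (brute-force) L2 search, so no training step is required before adding vectors.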
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
-    index.add(embeddings)
-    return index
-
-def save_faiss_index(index, file_path):
-    faiss.write_index(index, file_path)
-    print(f"FAISS index saved to {file_path}")
-
-def load_faiss_index(file_path):
-    if os.path.exists(file_path):
-        index = faiss.read_index(file_path)
-        print(f"FAISS index loaded from {file_path}")
-        return index
-    return None
-
-def save_metadata(ids, metadatas, contents, file_path):
-    with open(file_path, 'wb') as f:
-        pickle.dump((ids, metadatas, contents), f)
-    print(f"Metadata saved to {file_path}")
-
-def load_metadata(file_path):
-    if os.path.exists(file_path):
-        with open(file_path, 'rb') as f:
-            ids, metadatas, contents = pickle.load(f)
-        print(f"Metadata loaded from {file_path}")
-        return ids, metadatas, contents
-    return None, None, None
-
-def search_faiss(index, query_vector, k=4):
-    # Convert query_vector to a numpy array if it's not already
-    if not isinstance(query_vector, np.ndarray):
-        query_vector = np.array(query_vector)
-    
-    # Ensure the query_vector is 2D
-    if query_vector.ndim == 1:
-        query_vector = query_vector.reshape(1, -1)
-    
-    distances, indices = index.search(query_vector, k)
-    return distances[0], indices[0]
-
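-# Minimal stand-in for a LangChain retriever: it exposes get_relevant_documents(), as_retriever() and map(), which is all multi_query needs.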
-class FAISSRetriever:
-    def __init__(self, index, ids, metadatas, contents):
-        self.index = index
-        self.ids = ids
-        self.metadatas = metadatas
-        self.contents = contents
-        self.embeddings_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
-
-    def get_relevant_documents(self, query: str) -> List[Document]:
-        query_vector = self.embeddings_model.embed_query(query)
-        _, indices = search_faiss(self.index, query_vector)
-        return [
-            Document(page_content=self.contents[i], metadata=self.metadatas[i])
-            for i in indices
-        ]
-
-    def as_retriever(self, search_kwargs=None):
-        return self
-
-    def map(self) -> Callable[[List[str]], List[List[Document]]]:
-        def _map(queries: List[str]) -> List[List[Document]]:
-            return [self.get_relevant_documents(query) for query in queries]
-        return _map
-
-def load_qa_pairs():
-    df = pd.read_csv("QA_database_rows.csv")
-    return df['Question'].tolist(), df['Answer'].tolist()
-
-if __name__ == "__main__":
-    faiss_index_path = "faiss_index.bin"
-    metadata_path = "faiss_metadata.pkl"
-
-    # Try to load existing FAISS index and metadata
-    index = load_faiss_index(faiss_index_path)
-    ids, metadatas, contents = load_metadata(metadata_path)
-
-    if index is None or ids is None:
-        print("FAISS index or metadata not found. Creating new index...")
-        print("Downloading embeddings from Supabase...")
-        embeddings, ids, metadatas, contents = download_embeddings()
-
-        print("Creating FAISS index...")
-        index = create_faiss_index(embeddings)
-
-        # Save the index and metadata
-        save_faiss_index(index, faiss_index_path)
-        save_metadata(ids, metadatas, contents, metadata_path)
-    else:
-        print("Using existing FAISS index and metadata.")
-
-    print("Creating FAISS retriever...")
-    faiss_retriever = FAISSRetriever(index, ids, metadatas, contents)
-
-    # Load QA pairs from database
-    questions, ground_truths = load_qa_pairs()
-
-    # Compare performance
-    for question, ground_truth in zip(questions, ground_truths):
-        print(f"\nQuestion: {question}")
-
-        # Measure time for FAISS retrieval
-        start_time = time()
-        faiss_answer, faiss_docs = multi_query(question, faiss_retriever, chat_history=[])
-        faiss_time = time() - start_time
-        print(f"FAISS Answer: {faiss_answer}")
-        print(f"FAISS Time: {faiss_time:.4f} seconds")
-
-        # Measure time for original retrieval
-        start_time = time()
-        original_answer, original_docs = multi_query(question, original_retriever, chat_history=[])
-        original_time = time() - start_time
-        print(f"Original Answer: {original_answer}")
-        print(f"Original Time: {original_time:.4f} seconds")
-
-        # RAGAS evaluation for FAISS
-        faiss_datasets = {
-            "question": [question],
-            "answer": [faiss_answer],
-            "contexts": [[doc.page_content for doc in faiss_docs]],
-            "ground_truths": [[ground_truth]]
-        }
-        faiss_evalsets = Dataset.from_dict(faiss_datasets)
-
-        faiss_result = evaluate(
-            faiss_evalsets,
-            metrics=[
-                context_precision,
-                faithfulness,
-                answer_relevancy,
-                context_recall,
-            ],
-        )
-
-        print("FAISS RAGAS Evaluation:")
-        print(faiss_result.to_pandas())
-
-        # RAGAS evaluation for Original retriever
-        original_datasets = {
-            "question": [question],
-            "answer": [original_answer],
-            "contexts": [[doc.page_content for doc in original_docs]],
-            "ground_truths": [[ground_truth]]
-        }
-        original_evalsets = Dataset.from_dict(original_datasets)
-
-        original_result = evaluate(
-            original_evalsets,
-            metrics=[
-                context_precision,
-                faithfulness,
-                answer_relevancy,
-                context_recall,
-            ],
-        )
-
-        print("Original RAGAS Evaluation:")
-        print(original_result.to_pandas())
-
-    print("\nPerformance comparison complete.")
-
-# Key points:
-
-# If both the index and metadata (ids, metadatas, contents) are found, we don't need to download embeddings or recreate the index. The FAISS index already contains the embeddings.
-# The FAISSRetriever class doesn't need the raw embeddings. It uses the FAISS index for similarity search and the metadata (ids, metadatas, contents) for returning document information.