@@ -1,296 +0,0 @@
-from langchain.prompts import ChatPromptTemplate
-from langchain.load import dumps, loads
-from langchain_core.output_parsers import StrOutputParser
-from langchain_openai import ChatOpenAI
-from langchain_community.llms import Ollama
-from langchain_community.chat_models import ChatOllama
-from operator import itemgetter
-from langchain_core.runnables import RunnablePassthrough
-from langchain import hub
-from langchain.globals import set_llm_cache
-from langchain import PromptTemplate
-import subprocess
-import json
-from typing import Any, List, Optional, Dict
-from langchain_core.callbacks import CallbackManagerForLLMRun
-from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import BaseMessage, AIMessage, HumanMessage, SystemMessage
-from langchain_core.outputs import ChatResult, ChatGeneration
-from pydantic import Field
-
-from langchain_core.runnables import (
-    RunnableBranch,
-    RunnableLambda,
-    RunnableParallel,
-    RunnablePassthrough,
-)
-
-from datasets import Dataset
-from ragas import evaluate
-from ragas.metrics import (
-    answer_relevancy,
-    faithfulness,
-    context_recall,
-    context_precision,
-)
-import os
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from langchain.cache import SQLiteCache
-from langchain_openai import OpenAIEmbeddings
-from langchain.globals import set_llm_cache
-
-import requests
-import openai
-openai_api_key = os.getenv("OPENAI_API_KEY")
-openai.api_key = openai_api_key
-URI = os.getenv("SUPABASE_URI")
-
-# 設置緩存,以減少對API的重複請求。使用SQLite
-set_llm_cache(SQLiteCache(database_path=".langchain.db"))
-
-system_prompt: str = "你是一個來自台灣的AI助理,你的名字是 TAIDE,樂於以台灣人的立場幫助使用者,會用繁體中文回答問題。"
-
-class OllamaChatModel(BaseChatModel):
-    model_name: str = Field(default="taide-local")
-
-    def _generate(
-        self,
-        messages: List[BaseMessage],
-        stop: Optional[List[str]] = None,
-        run_manager: Optional[CallbackManagerForLLMRun] = None,
-        **kwargs: Any,
-    ) -> ChatResult:
-        formatted_messages = []
-        for msg in messages:
-            if isinstance(msg, HumanMessage):
-                formatted_messages.append({"role": "user", "content": msg.content})
-            elif isinstance(msg, AIMessage):
-                formatted_messages.append({"role": "assistant", "content": msg.content})
-            elif isinstance(msg, SystemMessage):
-                formatted_messages.append({"role": "system", "content": msg.content})
-
-        prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
-        for msg in formatted_messages:
-            if msg['role'] == 'user':
-                prompt += f"{msg['content']} [/INST]"
-            elif msg['role'] == "assistant":
-                prompt += f"{msg['content']} </s><s>[INST]"
-
-        command = ["ollama", "run", self.model_name, prompt]
-        result = subprocess.run(command, capture_output=True, text=True)
-
-        if result.returncode != 0:
-            raise Exception(f"Ollama command failed: {result.stderr}")
-
-        content = result.stdout.strip()
-
-        message = AIMessage(content=content)
-        generation = ChatGeneration(message=message)
-        return ChatResult(generations=[generation])
-
-    @property
-    def _llm_type(self) -> str:
-        return "ollama-chat-model"
-
-taide_llm = OllamaChatModel(model_name="taide-local")
-
-def multi_query(question, retriever, chat_history):
-    def multi_query_chain():
-        template = """You are an AI language model assistant. Your task is to generate three
-        different versions of the given user question to retrieve relevant documents from a vector
-        database. By generating multiple perspectives on the user question, your goal is to help
-        the user overcome some of the limitations of the distance-based similarity search.
-        Provide these alternative questions separated by newlines.
-
-        You must return original question also, which means that you return 1 original version + 3 different versions = 4 questions.
-
-        Original question: {question}"""
-        prompt_perspectives = ChatPromptTemplate.from_template(template)
-
-        generate_queries = (
-            prompt_perspectives
-            | taide_llm
-            | StrOutputParser()
-            | (lambda x: x.split("\n"))
-        )
-
-        return generate_queries
-
-    def get_unique_union(documents: List[list]):
-        flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
-        unique_docs = list(set(flattened_docs))
-        return [loads(doc) for doc in unique_docs]
-
-    _search_query = get_search_query()
-    modified_question = _search_query.invoke({"question":question, "chat_history": chat_history})
-    print(modified_question)
-
-    generate_queries = multi_query_chain()
-
-    retrieval_chain = generate_queries | retriever.map() | get_unique_union
-    docs = retrieval_chain.invoke({"question":modified_question})
-
-    answer = multi_query_rag_prompt(retrieval_chain, modified_question)
-
-    return answer, docs
-
-def multi_query_rag_prompt(retrieval_chain, question):
-    template = """Answer the following question based on this context:
-
-    {context}
-
-    Question: {question}
-    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw. If the question is in English, then the output will be in English.
-    You should not mention anything about "根據提供的文件內容" or other similar terms.
-    If you don't know the answer, just say that "很抱歉,目前我無法回答您的問題,請將您的詢問發送至 test@email.com 以便獲得更進一步的幫助,謝謝。I'm sorry I cannot answer your question. Please send your question to test@email.com for further assistance. Thank you."
-    """
-
-    prompt = ChatPromptTemplate.from_template(template)
-    context = retrieval_chain.invoke({"question": question})
-    print(f"Retrieved context: {context[:200]}...") # Print first 200 chars of context
-
-    final_rag_chain = (
-        {"context": retrieval_chain,
-         "question": itemgetter("question")}
-        | prompt
-        | taide_llm
-        | StrOutputParser()
-    )
-
-    print(f"Sending question to model: {question}")
-    try:
-        answer = final_rag_chain.invoke({"question": question})
-        print(f"Received answer: {answer}")
-        return answer
-    except Exception as e:
-        print(f"Error invoking rag_chain: {e}")
-        return "Error occurred while processing the question."
-
-def get_search_query():
-    _template = """Rewrite the following query by incorporating relevant context from the conversation history.
-    The rewritten query should:
-
-    - Preserve the core intent and meaning of the original query
-    - Expand and clarify the query to make it more specific and informative for retrieving relevant context
-    - Avoid introducing new topics or queries that deviate from the original query
-    - DONT EVER ANSWER the Original query, but instead focus on rephrasing and expanding it into a new query
-    - The rewritten query should be in its original language.
-
-    Return ONLY the rewritten query text, without any additional formatting or explanations.
-
-    Conversation History:
-    {chat_history}
-
-    Original query: [{question}]
-
-    Rewritten query:
-    """
-    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
-
-    def _format_chat_history(chat_history: List[tuple[str, str]]) -> List:
-        buffer = []
-        for human, ai in chat_history:
-            buffer.append(HumanMessage(content=human))
-            buffer.append(AIMessage(content=ai))
-        return buffer
-
-    _search_query = RunnableBranch(
-        (
-            RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
-                run_name="HasChatHistoryCheck"
-            ),
-            RunnablePassthrough.assign(
-                chat_history=lambda x: _format_chat_history(x["chat_history"])
-            )
-            | CONDENSE_QUESTION_PROMPT
-            | ChatOpenAI()
-            | StrOutputParser(),
-        ),
-        RunnableLambda(lambda x : x["question"]),
-    )
-
-    return _search_query
-
-def naive_rag(question, retriever):
-    prompt = hub.pull("rlm/rag-prompt")
-
-    def format_docs(docs):
-        return "\n\n".join(doc.page_content for doc in docs)
-
-    reference = retriever.get_relevant_documents(question)
-
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | taide_llm
-        | StrOutputParser()
-    )
-
-    answer = rag_chain.invoke(question)
-
-    return answer, reference
-
-def naive_rag_for_qapairs(question, retriever):
-    template = """You are an assistant for question-answering tasks.
-    Use the following pieces of retrieved context to answer the question.
-    Following retrieved context is question-answer pairs of historical QA, Find the suitable answer from the qa pairs
-    If you can not find the suitable answer, just return "False".
-    Use three sentences maximum and Do not make up the answer.
-
-    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw.
-
-    {context}
-
-    Question: {question}
-    """
-    prompt = PromptTemplate.from_template(template)
-
-    llm = ChatOpenAI(model_name="gpt-4-0125-preview")
-
-    def format_docs(docs):
-        return "\n\n".join(doc.page_content for doc in docs)
-
-    reference = retriever.get_relevant_documents(question)
-
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | llm
-        | StrOutputParser()
-    )
-
-    answer = rag_chain.invoke(question)
-
-    return answer, reference
-
-def rag_score(question, ground_truth, answer, reference_docs):
-    datasets = {
-        "question": [question],
-        "answer": [answer],
-        "contexts": [reference_docs],
-        "ground_truths": [[ground_truth]]
-    }
-    evalsets = Dataset.from_dict(datasets)
-
-    result = evaluate(
-        evalsets,
-        metrics=[
-            context_precision,
-            faithfulness,
-            answer_relevancy,
-            context_recall,
-        ],
-    )
-
-    result_df = result.to_pandas()
-    print(result_df.head())
-    result_df.to_csv('ragas_rag.csv')
-    return result
-
-def print_current_model(llm):
-    if isinstance(llm, OllamaChatModel):
-        print(f"Currently using model: {llm.model_name}")
-    else:
-        pass