
remove unnecessary files

SherryLiu 7 months ago
commit a7beb3efe6
5 changed files with 0 additions and 1049 deletions
  1. Indexing_Split.py  + 0 - 177
  2. RAG_app_copy.py  + 0 - 162
  3. RAG_strategy.py  + 0 - 296
  4. add_vectordb.py  + 0 - 192
  5. faiss_index.py  + 0 - 222

Indexing_Split.py  + 0 - 177

@@ -1,177 +0,0 @@
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.embeddings import OllamaEmbeddings
-from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import TextLoader
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_core.documents import Document
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.document_loaders import Docx2txtLoader
-from langchain_community.document_loaders import WebBaseLoader
-from PyPDF2 import PdfReader
-from langchain.docstore.document import Document
-from json import loads
-import pandas as pd
-from sqlalchemy import create_engine
-
-from langchain.prompts import ChatPromptTemplate
-from langchain_openai import ChatOpenAI
-from langchain_core.output_parsers import StrOutputParser
-from langchain import hub
-from tqdm import tqdm
-
-# __import__('pysqlite3')
-# import sys
-# sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
-
-from datasets import Dataset 
-from ragas import evaluate
-from ragas.metrics import (
-    answer_relevancy,
-    faithfulness,
-    context_recall,
-    context_precision,
-)
-import pandas as pd
-import os
-import glob
-import openai
-
-URI = os.getenv("SUPABASE_URI")
-openai_api_key = os.getenv("OPENAI_API_KEY")
-openai.api_key = openai_api_key
-
-from RAG_strategy import multi_query, naive_rag
-
-
-def create_retriever(path='Documents', extension="pdf"):
-    txt_files = glob.glob(os.path.join(path, f"*.{extension}"))
-    
-    doc = []
-    for file_path in txt_files:
-        doc.append(file_path)
-    
-    def load_and_split(file_list):
-        chunks = []
-        for file in file_list:
-            if file.endswith(".txt"):
-                loader = TextLoader(file, encoding='utf-8')
-            elif file.endswith(".pdf"):
-                loader = PyPDFLoader(file)
-            elif file.endswith(".docx"):
-                loader = Docx2txtLoader(file)
-            else:
-                raise ValueError(f"Unsupported file extension: {file}")
-            
-
-            docs = loader.load()
-
-            # Split
-            if file.endswith(".docx"):
-                # separators = ["\n\n\u25cb", "\n\n\u25cf"]
-                # text_splitter = RecursiveCharacterTextSplitter(separators=separators, chunk_size=500, chunk_overlap=0)
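-                # Split .docx files on Chinese statute article markers (○/● followed by 第…條) so each article becomes its own chunk.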
-                separators = [r'\u25cb\s*第.*?條', r'\u25cf\s*第.*?條']
-                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=300, chunk_overlap=0)
-            else:
-                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)
-            
-            splits = text_splitter.split_documents(docs)
-
-            chunks.extend(splits)
-
-        return chunks
-
-    # Index
-    docs = load_and_split(doc)
-    qa_history_doc = gen_doc_from_history()
-    docs.extend(qa_history_doc)
-    # web_doc = web_data(os.path.join(path, 'web_url.csv'))
-    # docs.extend(web_doc)
-
-    # vectorstore
-    # vectorstore = Chroma.from_texts(texts=docs, embedding=OpenAIEmbeddings())
-    # vectorstore = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings(openai_api_key=openai_api_key))
-    # vectorstore = Chroma.from_documents(documents=docs, embedding=OllamaEmbeddings(model="llama3", num_gpu=1))
-    vectorstore = Chroma.from_documents(documents=docs, embedding=OllamaEmbeddings(model="gemma2"))
-
-    vectorstore.persist()
-
-    retriever = vectorstore.as_retriever()
-
-    return retriever
-
-def web_data(url_file):
-    df = pd.read_csv(url_file, header = 0)
-    url_list = df['url'].to_list()
-
-    loader = WebBaseLoader(url_list)
-    docs = loader.load()
-
-    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                chunk_size=1000, chunk_overlap=0)
-    splits = text_splitter.split_documents(docs)
-    
-    return splits
-
-def gen_doc_from_history():
-    engine = create_engine(URI, echo=True)
-
-    df = pd.read_sql_table("systex_records", engine.connect())  
-    df.fillna('', inplace=True)
-    result = df.to_json(orient='index', force_ascii=False)
-    result = loads(result)
-
-
-    df = pd.DataFrame(result).T
-    qa_history_doc = []
-    for i in range(len(df)):
-        if df.iloc[i]['used_as_document'] is not True: continue
-        Question = df.iloc[i]['Question']
-        Answer = df.iloc[i]['Answer']
-        context = f'Question: {Question}\nAnswer: {Answer}'
-        
-        doc =  Document(page_content=context, metadata={"source": "History"})
-        qa_history_doc.append(doc)
-        # print(doc)
-
-    return qa_history_doc
-
-def gen_doc_from_database():
-    engine = create_engine(URI, echo=True)
-
-    df = pd.read_sql_table("QA_database", engine.connect())  
-    # df.fillna('', inplace=True)
-    result = df[['Question', 'Answer']].to_json(orient='index', force_ascii=False)
-    result = loads(result)
-
-
-    df = pd.DataFrame(result).T
-    qa_doc = []
-    for i in range(len(df)):
-        # if df.iloc[i]['used_as_document'] is not True: continue
-        Question = df.iloc[i]['Question']
-        Answer = df.iloc[i]['Answer']
-        context = f'Question: {Question}\nAnswer: {Answer}'
-        
-        doc = Document(page_content=context, metadata={"source": "History"})
-        qa_doc.append(doc)
-        # print(doc)
-
-    return qa_doc
-
-if __name__ == "__main__":
-
-    retriever = create_retriever(path='./Documents', extension="pdf")
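-    # Smoke-test questions (zh-TW, roughly): "To what extent can the CEV system support a GHG inventory?" and "Which standards does the CEV system follow?"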
-    question = 'CEV系統可以支援盤查到什麼程度'
-    final_answer, reference_docs = multi_query(question, retriever)
-    print(question, final_answer)
-    question = 'CEV系統依循標準為何'
-    final_answer, reference_docs = multi_query(question, retriever)
-    print(question, final_answer)
-
-
-
-

RAG_app_copy.py  + 0 - 162

@@ -1,162 +0,0 @@
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from fastapi import FastAPI, HTTPException, status, Body, Depends
-from fastapi.middleware.cors import CORSMiddleware
-from contextlib import asynccontextmanager
-from pydantic import BaseModel
-from typing import List, Optional
-import uvicorn
-
-from sqlalchemy import create_engine
-import pandas as pd
-import datetime
-import json
-from json import loads
-import time
-from langchain.callbacks import get_openai_callback
-
-from langchain_openai import OpenAIEmbeddings
-from RAG_strategy import multi_query, naive_rag, naive_rag_for_qapairs
-
-import os
-from supabase.client import Client, create_client
-from add_vectordb import GetVectorStore
-import openai
-
-# Get API log
-import logging
-logger = logging.getLogger("uvicorn.error")
-
-openai_api_key = os.getenv("OPENAI_API_KEY")
-URI = os.getenv("SUPABASE_URI")
-openai.api_key = openai_api_key
-
-global_retriever = None
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    global global_retriever
-    global vector_store
-    
-    start = time.time()
-
-    supabase_url = os.getenv("SUPABASE_URL")
-    supabase_key = os.getenv("SUPABASE_KEY")
-    document_table = "documents"
-    supabase: Client = create_client(supabase_url, supabase_key)
-
-    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
-    vector_store = GetVectorStore(embeddings, supabase, document_table)
-    global_retriever = vector_store.as_retriever(search_kwargs={"k": 4})
-
-    print(f"Initialization time: {time.time() - start}")
-    yield
-
-def get_retriever():
-    return global_retriever
-
-def get_vector_store():
-    return vector_store
-
-app = FastAPI(lifespan=lifespan)
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-@app.get("/answer2")
-def multi_query_answer(question, retriever=Depends(get_retriever)):
-    try:
-        start = time.time()
-
-        with get_openai_callback() as cb:
-            final_answer, reference_docs = multi_query(question, retriever, chat_history=[])
-
-        processing_time = time.time() - start
-        print(f"Processing time: {processing_time}")
-        save_history(question, final_answer, reference_docs, cb, processing_time)
-
-        return {"Answer": final_answer}
-    except Exception as e:
-        logger.error(f"Error in /answer2 endpoint: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-class ChatHistoryItem(BaseModel):
-    q: str
-    a: str
-
-@app.post("/answer_with_history")
-def multi_query_answer_with_history(question: Optional[str] = '', chat_history: List[ChatHistoryItem] = Body(...), retriever=Depends(get_retriever)):
-    start = time.time()
-    
-    chat_history = [(item.q, item.a) for item in chat_history if item.a != ""]
-    print(f"Chat history: {chat_history}")
-    
-    with get_openai_callback() as cb:
-        final_answer, reference_docs = multi_query(question, retriever, chat_history)
-    processing_time = time.time() - start
-    print(f"Processing time: {processing_time}")
-    save_history(question, final_answer, reference_docs, cb, processing_time)
-
-    return {"Answer": final_answer}
-
-@app.post("/answer_with_history2")
-def multi_query_answer_with_history2(question: Optional[str] = '', extension: Optional[str] = 'pdf', chat_history: List[ChatHistoryItem] = Body(...), retriever=Depends(get_retriever)):
-    start = time.time()
-
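-    # Replace the injected retriever with one that filters results by file-extension metadata.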
-    retriever = vector_store.as_retriever(search_kwargs={"k": 4, 'filter': {'extension':extension}})
-    
-    chat_history = [(item.q, item.a) for item in chat_history if item.a != ""]
-    print(f"Chat history: {chat_history}")
-    
-    with get_openai_callback() as cb:
-        final_answer, reference_docs = multi_query(question, retriever, chat_history)
-    processing_time = time.time() - start
-    print(f"Processing time: {processing_time}")
-    save_history(question, final_answer, reference_docs, cb, processing_time)
-
-    return {"Answer": final_answer}
-
-def save_history(question, answer, reference, cb, processing_time):
-    record = {
-        'Question': [question],
-        'Answer': [answer],
-        'Total_Tokens': [cb.total_tokens],
-        'Total_Cost': [cb.total_cost],
-        'Processing_time': [processing_time],
-        'Contexts': [str(reference)]
-    }
-    df = pd.DataFrame(record)
-    engine = create_engine(URI)
-    df.to_sql(name='systex_records', con=engine, index=False, if_exists='append')
-
-class history_output(BaseModel):
-    Question: str
-    Answer: str
-    Contexts: str
-    Total_Tokens: int
-    Total_Cost: float
-    Processing_time: float
-    Time: datetime.datetime
-
-@app.get('/history', response_model=List[history_output])
-async def get_history():
-    engine = create_engine(URI, echo=True)
-
-    df = pd.read_sql_table("systex_records", engine.connect())  
-    df.fillna('', inplace=True)
-    result = df.to_json(orient='index', force_ascii=False)
-    result = loads(result)
-    return list(result.values())
-
-@app.get("/")
-def read_root():
-    return {"message": "Welcome to the Carbon Chatbot API"}
-
-if __name__ == "__main__":
-    uvicorn.run("RAG_app_copy:app", host='127.0.0.1', port=8081, reload=True)

RAG_strategy.py  + 0 - 296

@@ -1,296 +0,0 @@
-from langchain.prompts import ChatPromptTemplate
-from langchain.load import dumps, loads
-from langchain_core.output_parsers import StrOutputParser
-from langchain_openai import ChatOpenAI
-from langchain_community.llms import Ollama
-from langchain_community.chat_models import ChatOllama
-from operator import itemgetter
-from langchain_core.runnables import RunnablePassthrough
-from langchain import hub
-from langchain.globals import set_llm_cache
-from langchain import PromptTemplate
-import subprocess
-import json
-from typing import Any, List, Optional, Dict
-from langchain_core.callbacks import CallbackManagerForLLMRun
-from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import BaseMessage, AIMessage, HumanMessage, SystemMessage
-from langchain_core.outputs import ChatResult, ChatGeneration
-from pydantic import Field
-
-from langchain_core.runnables import (
-    RunnableBranch,
-    RunnableLambda,
-    RunnableParallel,
-    RunnablePassthrough,
-)
-
-from datasets import Dataset 
-from ragas import evaluate
-from ragas.metrics import (
-    answer_relevancy,
-    faithfulness,
-    context_recall,
-    context_precision,
-)
-import os
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from langchain.cache import SQLiteCache
-from langchain_openai import OpenAIEmbeddings
-from langchain.globals import set_llm_cache
-
-import requests
-import openai
-openai_api_key = os.getenv("OPENAI_API_KEY")
-openai.api_key = openai_api_key
-URI = os.getenv("SUPABASE_URI")
-
-# Cache LLM responses in a local SQLite database to avoid repeated API requests.
-set_llm_cache(SQLiteCache(database_path=".langchain.db"))
-
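-# System prompt (zh-TW, roughly): "You are an AI assistant from Taiwan named TAIDE; you help users from a Taiwanese perspective and answer in Traditional Chinese."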
-system_prompt: str = "你是一個來自台灣的AI助理,你的名字是 TAIDE,樂於以台灣人的立場幫助使用者,會用繁體中文回答問題。"
-
-class OllamaChatModel(BaseChatModel):
-    model_name: str = Field(default="taide-local")
-
-    def _generate(
-            self,
-            messages: List[BaseMessage],
-            stop: Optional[List[str]] = None,
-            run_manager: Optional[CallbackManagerForLLMRun] = None,
-            **kwargs: Any,
-    ) -> ChatResult:
-        formatted_messages = []
-        for msg in messages:
-            if isinstance(msg, HumanMessage):
-                formatted_messages.append({"role": "user", "content": msg.content})
-            elif isinstance(msg, AIMessage):
-                formatted_messages.append({"role": "assistant", "content": msg.content})
-            elif isinstance(msg, SystemMessage):
-                 formatted_messages.append({"role": "system", "content": msg.content})
-
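-        # Build a Llama-2 style chat prompt: the system prompt goes inside <<SYS>> tags and each user turn is wrapped in [INST] ... [/INST].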
-        prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
-        for msg in formatted_messages:
-            if msg['role'] == 'user':
-                prompt += f"{msg['content']} [/INST]"
-            elif msg['role'] == "assistant":
-                prompt += f"{msg['content']} </s><s>[INST]"
-
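-        # Shell out to the local Ollama CLI (`ollama run`) rather than an HTTP client; stdout is treated as the model's reply.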
-        command = ["ollama", "run", self.model_name, prompt]
-        result = subprocess.run(command, capture_output=True, text=True)
-
-        if result.returncode != 0:
-            raise Exception(f"Ollama command failed: {result.stderr}")
-        
-        content = result.stdout.strip()
-
-        message = AIMessage(content=content)
-        generation = ChatGeneration(message=message)
-        return ChatResult(generations=[generation])
-    
-    @property
-    def _llm_type(self) -> str:
-        return "ollama-chat-model"
-    
-taide_llm = OllamaChatModel(model_name="taide-local")
-
-def multi_query(question, retriever, chat_history):
-    def multi_query_chain():
-        template = """You are an AI language model assistant. Your task is to generate three 
-        different versions of the given user question to retrieve relevant documents from a vector 
-        database. By generating multiple perspectives on the user question, your goal is to help
-        the user overcome some of the limitations of the distance-based similarity search. 
-        Provide these alternative questions separated by newlines. 
-
-        You must also return the original question, i.e. 1 original version + 3 alternative versions = 4 questions in total.
-        
-        Original question: {question}"""
-        prompt_perspectives = ChatPromptTemplate.from_template(template)
-
-        generate_queries = (
-            prompt_perspectives 
-            | taide_llm
-            | StrOutputParser() 
-            | (lambda x: x.split("\n"))
-        )
-
-        return generate_queries
-
-    def get_unique_union(documents: List[list]):
-        flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
-        unique_docs = list(set(flattened_docs))
-        return [loads(doc) for doc in unique_docs]
-
-    _search_query = get_search_query()
-    modified_question = _search_query.invoke({"question":question, "chat_history": chat_history})
-    print(modified_question)
-
-    generate_queries = multi_query_chain()
-
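-    # Fan out: generate query variants, retrieve documents for each variant, then deduplicate the union.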
-    retrieval_chain = generate_queries | retriever.map() | get_unique_union
-    docs = retrieval_chain.invoke({"question":modified_question})
-
-    answer = multi_query_rag_prompt(retrieval_chain, modified_question)
-
-    return answer, docs
-
-def multi_query_rag_prompt(retrieval_chain, question):
-    template = """Answer the following question based on this context:
-
-    {context}
-
-    Question: {question}
-    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw. If the question is in English, then the output will be in English.
-    You should not mention anything about "根據提供的文件內容" or other similar terms.
-    If you don't know the answer, just say that "很抱歉,目前我無法回答您的問題,請將您的詢問發送至 test@email.com 以便獲得更進一步的幫助,謝謝。I'm sorry I cannot answer your question. Please send your question to test@email.com for further assistance. Thank you."
-    """
-
-    prompt = ChatPromptTemplate.from_template(template)
-    context = retrieval_chain.invoke({"question": question})
-    print(f"Retrieved context: {context[:200]}...")  # Print first 200 chars of context
-
-    final_rag_chain = (
-        {"context": retrieval_chain, 
-        "question": itemgetter("question")} 
-        | prompt
-        | taide_llm
-        | StrOutputParser()
-    )
-
-    print(f"Sending question to model: {question}")
-    try:
-        answer = final_rag_chain.invoke({"question": question})
-        print(f"Received answer: {answer}")
-        return answer
-    except Exception as e:
-        print(f"Error invoking rag_chain: {e}")
-        return "Error occurred while processing the question."
-
-def get_search_query():
-    _template = """Rewrite the following query by incorporating relevant context from the conversation history.
-    The rewritten query should:
-    
-    - Preserve the core intent and meaning of the original query
-    - Expand and clarify the query to make it more specific and informative for retrieving relevant context
-    - Avoid introducing new topics or queries that deviate from the original query
-    - DONT EVER ANSWER the Original query, but instead focus on rephrasing and expanding it into a new query
-    - The rewritten query should be in its original language.
-    
-    Return ONLY the rewritten query text, without any additional formatting or explanations.
-    
-    Conversation History:
-    {chat_history}
-    
-    Original query: [{question}]
-    
-    Rewritten query: 
-    """
-    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
-
-    def _format_chat_history(chat_history: List[tuple[str, str]]) -> List:
-        buffer = []
-        for human, ai in chat_history:
-            buffer.append(HumanMessage(content=human))
-            buffer.append(AIMessage(content=ai))
-        return buffer
-
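-    # If chat history is present, condense history + question into a standalone query via ChatOpenAI; otherwise pass the question through unchanged.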
-    _search_query = RunnableBranch(
-        (
-            RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
-                run_name="HasChatHistoryCheck"
-            ),
-            RunnablePassthrough.assign(
-                chat_history=lambda x: _format_chat_history(x["chat_history"])
-            )
-            | CONDENSE_QUESTION_PROMPT
-            | ChatOpenAI()
-            | StrOutputParser(),
-        ),
-        RunnableLambda(lambda x : x["question"]),
-    )
-
-    return _search_query
-
-def naive_rag(question, retriever):
-    prompt = hub.pull("rlm/rag-prompt")
-
-    def format_docs(docs):
-        return "\n\n".join(doc.page_content for doc in docs)
-
-    reference = retriever.get_relevant_documents(question)
-    
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | taide_llm
-        | StrOutputParser()
-    )
-
-    answer = rag_chain.invoke(question)
-
-    return answer, reference
-
-def naive_rag_for_qapairs(question, retriever):
-    template = """You are an assistant for question-answering tasks. 
-    Use the following pieces of retrieved context to answer the question. 
-    The retrieved context below consists of historical question-answer pairs; find the most suitable answer among them.
-    If you cannot find a suitable answer, just return "False". 
-    Use three sentences maximum and do not make up an answer.
-
-    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw.
-
-    {context}
-
-    Question: {question}
-    """
-    prompt = PromptTemplate.from_template(template)
-
-    llm = ChatOpenAI(model_name="gpt-4-0125-preview")
-
-    def format_docs(docs):
-        return "\n\n".join(doc.page_content for doc in docs)
-
-    reference = retriever.get_relevant_documents(question)
-    
-    rag_chain = (
-        {"context": retriever | format_docs, "question": RunnablePassthrough()}
-        | prompt
-        | llm
-        | StrOutputParser()
-    )
-
-    answer = rag_chain.invoke(question)
-
-    return answer, reference
-
-def rag_score(question, ground_truth, answer, reference_docs):
-    datasets = {
-              "question": [question],
-              "answer": [answer],
-              "contexts": [reference_docs],
-              "ground_truths": [[ground_truth]]
-            }
-    evalsets = Dataset.from_dict(datasets)
-
-    result = evaluate(
-        evalsets,
-        metrics=[
-            context_precision,
-            faithfulness,
-            answer_relevancy,
-            context_recall,
-        ],
-    )
-
-    result_df = result.to_pandas()
-    print(result_df.head())
-    result_df.to_csv('ragas_rag.csv')
-    return result
-
-def print_current_model(llm):
-    if isinstance(llm, OllamaChatModel):
-        print(f"Currently using model: {llm.model_name}")
-    else:
-        pass

add_vectordb.py  + 0 - 192

@@ -1,192 +0,0 @@
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import TextLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.document_loaders import Docx2txtLoader
-
-import os
-import glob
-import openai
-
-from langchain_community.vectorstores import SupabaseVectorStore
-from langchain_openai import OpenAIEmbeddings
-from supabase.client import Client, create_client
-
-
-def get_data_list(data_list=None, path=None, extension=None, update=False):
-    files = data_list or glob.glob(os.path.join(path, f"*.{extension}"))
-    if update:    
-        doc = files.copy()
-    else:
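-        # Note: check_existed_data uses the module-level `supabase` client, which is only created in the __main__ block below.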
-        existed_data = check_existed_data(supabase)
-        doc = []
-        for file_path in files:
-            filename = os.path.basename(file_path)
-            if filename not in existed_data:
-                doc.append(file_path)
-
-    return doc
-
-
-def read_and_split_files(data_list=None, path=None, extension=None, update=False):
-
-    def load_and_split(file_list):
-        chunks = []
-        for file in file_list:
-            if file.endswith(".txt"):
-                loader = TextLoader(file, encoding='utf-8')
-            elif file.endswith(".pdf"):
-                loader = PyPDFLoader(file)
-            elif file.endswith(".docx"):
-                loader = Docx2txtLoader(file)
-            else:
-                print(f"Unsupported file extension: {file}")
-                continue
-
-            docs = loader.load()
-
-            # Split
-            if file.endswith(".docx"):
-                separators = [r'\u25cb\s*第.*?條', r'\u25cf\s*第.*?條']
-                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=300, chunk_overlap=0)
-            else:
-                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)
-            splits = text_splitter.split_documents(docs)
-
-            chunks.extend(splits)
-
-        return chunks
-
-
-    doc = get_data_list(data_list=data_list, path=path, extension=extension, update=update)
-    # Index
-    docs = load_and_split(doc)
-
-    return docs
-
-def create_ids(docs):
-    # Create a dictionary to count occurrences of each page in each document
-    page_counter = {}
-
-    # List to store the resulting IDs
-    document_ids = []
-
-    # Generate IDs
-    for doc in [docs[i].metadata for i in range(len(docs))]:
-        source = doc['source']
-        file_name = os.path.basename(source).split('.')[0]
-
-        if "page" in doc.keys():
-            page = doc['page']
-            key = f"{source}_{page}"
-        else:
-            key = f"{source}"
-
-        if key not in page_counter:
-            page_counter[key] = 1
-        else:
-            page_counter[key] += 1
-        
-        if "page" in doc.keys():
-            doc_id = f"{file_name} | page {page} | chunk {page_counter[key]}"
-        else:
-            doc_id = f"{file_name} | chunk {page_counter[key]}"
-
-        
-        document_ids.append(doc_id)
-
-    return document_ids
-
-def get_document(data_list=None, path=None, extension=None, update=False):
-    docs = read_and_split_files(data_list=data_list, path=path, extension=extension, update=update)
-    document_ids = create_ids(docs)
-
-    for doc in docs:
-        doc.metadata['source'] = os.path.basename(doc.metadata['source'])
-        # print(doc.metadata)
-
-    # document_metadatas = [{'source': doc.metadata['source'], 'page': doc.metadata['page'], 'chunk': int(id.split("chunk ")[-1])} for doc, id in zip(docs, document_ids)]
-    document_metadatas = []
-
-    for doc, id in zip(docs, document_ids):
-        chunk_number = int(id.split("chunk ")[-1])
-        doc.metadata['chunk'] = chunk_number
-        doc.metadata['extension'] = os.path.basename(doc.metadata['source']).split(".")[-1]
-        document_metadatas.append(doc.metadata)
-
-    documents = [doc.metadata['source'].split(".")[0] + doc.page_content for doc in docs]
-
-    return document_ids, documents, document_metadatas
-
-def check_existed_data(supabase):
-    response = supabase.table('documents').select("id, metadata").execute()
-    existed_data = list(set([data['metadata']['source'] for data in response.data]))
-    # existed_data = [(data['id'], data['metadata']['source']) for data in response.data]
-    return existed_data
-
-class GetVectorStore(SupabaseVectorStore):
-    def __init__(self, embeddings, supabase, table_name):
-        super().__init__(embedding=embeddings, client=supabase, table_name=table_name, query_name="match_documents")
-
-    def insert(self, documents, document_metadatas):
-        self.add_texts(
-            texts=documents,
-            metadatas=document_metadatas,
-        )
-
-    def delete(self, file_list):
-        for file_name in file_list:
-            self._client.table(self.table_name).delete().eq('metadata->>source', file_name).execute()
-
-    def update(self, documents, document_metadatas, update_existing_data=False):
-        if not document_metadatas:  # no new data
-            return
-
-        if update_existing_data:
-            file_list = list(set(metadata['source'] for metadata in document_metadatas))
-            self.delete(file_list)
-
-        self.insert(documents, document_metadatas)
-
-if __name__ == "__main__":
-
-    load_dotenv()
-    supabase_url = os.environ.get("SUPABASE_URL")
-    supabase_key = os.environ.get("SUPABASE_KEY")
-    openai_api_key = os.getenv("OPENAI_API_KEY")
-    openai.api_key = openai_api_key
-    document_table = "documents"
-    supabase: Client = create_client(supabase_url, supabase_key)
-
-    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
-
-    # get vector store
-    vector_store = GetVectorStore(embeddings, supabase, document_table)
-
-    # update data (old + new / all new / all old)
-    path = "/Documents"
-    extension = "pdf"
-    # file = None
-
-    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
-    # file = [os.path.join(path, file) for file in file_list]
-    file_list = glob.glob(os.path.join(path, "*"))
-    print(file_list)
-    
-    update = True
-    document_ids, documents, document_metadatas = get_document(data_list=file_list, path=path, extension=extension, update=update)
-    vector_store.update(documents, document_metadatas, update_existing_data=update)
-
-    # insert new data (all new)
-    # vector_store.insert(documents, document_metadatas)
-
-    # delete data
-    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
-    # vector_store.delete(file_list)
-
-    # get retriver
-    # retriever = vector_store.as_retriever(search_kwargs={"k": 6})

faiss_index.py  + 0 - 222

@@ -1,222 +0,0 @@
-import faiss
-import numpy as np
-import json
-from tqdm import tqdm
-from time import time
-from RAG_strategy import multi_query
-from ragas import evaluate
-from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
-from datasets import Dataset
-from typing import List, Callable
-
-from dotenv import load_dotenv
-load_dotenv('environment.env')
-import os
-import pickle
-from supabase.client import Client, create_client
-from langchain_openai import OpenAIEmbeddings
-from add_vectordb import GetVectorStore
-from sqlalchemy import create_engine
-import pandas as pd
-from langchain_core.documents import Document
-
-# Load environment variables
-supabase_url = os.getenv("SUPABASE_URL")
-supabase_key = os.getenv("SUPABASE_KEY")
-openai_api_key = os.getenv("OPENAI_API_KEY")
-document_table = "documents"
-URI = os.getenv("SUPABASE_URI")
-
-# Initialize Supabase client
-supabase: Client = create_client(supabase_url, supabase_key)
-
-# Initialize embeddings
-embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
-
-# Initialize original vector store
-original_vector_store = GetVectorStore(embeddings, supabase, document_table)
-original_retriever = original_vector_store.as_retriever(search_kwargs={"k": 4})
-
-
-
-def download_embeddings():
-    response = supabase.table(document_table).select("id, embedding, metadata, content").execute()
-    embeddings = []
-    ids = []
-    metadatas = []
-    contents = []
-    for item in response.data:
-        # Parse the embedding string into a list of floats
-        embedding = json.loads(item['embedding'])
-        embeddings.append(embedding)
-        ids.append(item['id'])
-        metadatas.append(item['metadata'])
-        contents.append(item['content'])
-    return np.array(embeddings, dtype=np.float32), ids, metadatas, contents
-
-def create_faiss_index(embeddings):
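-    # IndexFlatL2 does exact (brute-force) L2 search, so no training step is required before adding vectors.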
-    dimension = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dimension)
-    index.add(embeddings)
-    return index
-
-def save_faiss_index(index, file_path):
-    faiss.write_index(index, file_path)
-    print(f"FAISS index saved to {file_path}")
-
-def load_faiss_index(file_path):
-    if os.path.exists(file_path):
-        index = faiss.read_index(file_path)
-        print(f"FAISS index loaded from {file_path}")
-        return index
-    return None
-
-def save_metadata(ids, metadatas, contents, file_path):
-    with open(file_path, 'wb') as f:
-        pickle.dump((ids, metadatas, contents), f)
-    print(f"Metadata saved to {file_path}")
-
-def load_metadata(file_path):
-    if os.path.exists(file_path):
-        with open(file_path, 'rb') as f:
-            ids, metadatas, contents = pickle.load(f)
-        print(f"Metadata loaded from {file_path}")
-        return ids, metadatas, contents
-    return None, None, None
-
-def search_faiss(index, query_vector, k=4):
-    # Convert query_vector to a numpy array if it's not already
-    if not isinstance(query_vector, np.ndarray):
-        query_vector = np.array(query_vector)
-    
-    # Ensure the query_vector is 2D
-    if query_vector.ndim == 1:
-        query_vector = query_vector.reshape(1, -1)
-    
-    distances, indices = index.search(query_vector, k)
-    return distances[0], indices[0]
-
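-# Minimal stand-in for a LangChain retriever: it exposes get_relevant_documents(), as_retriever() and map(), which is all multi_query needs.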
-class FAISSRetriever:
-    def __init__(self, index, ids, metadatas, contents):
-        self.index = index
-        self.ids = ids
-        self.metadatas = metadatas
-        self.contents = contents
-        self.embeddings_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
-
-    def get_relevant_documents(self, query: str) -> List[Document]:
-        query_vector = self.embeddings_model.embed_query(query)
-        _, indices = search_faiss(self.index, query_vector)
-        return [
-            Document(page_content=self.contents[i], metadata=self.metadatas[i])
-            for i in indices
-        ]
-
-    def as_retriever(self, search_kwargs=None):
-        return self
-
-    def map(self) -> Callable[[List[str]], List[List[Document]]]:
-        def _map(queries: List[str]) -> List[List[Document]]:
-            return [self.get_relevant_documents(query) for query in queries]
-        return _map
-
-def load_qa_pairs():
-    df = pd.read_csv("QA_database_rows.csv")
-    return df['Question'].tolist(), df['Answer'].tolist()
-
-if __name__ == "__main__":
-    faiss_index_path = "faiss_index.bin"
-    metadata_path = "faiss_metadata.pkl"
-
-    # Try to load existing FAISS index and metadata
-    index = load_faiss_index(faiss_index_path)
-    ids, metadatas, contents = load_metadata(metadata_path)
-
-    if index is None or ids is None:
-        print("FAISS index or metadata not found. Creating new index...")
-        print("Downloading embeddings from Supabase...")
-        embeddings, ids, metadatas, contents = download_embeddings()
-
-        print("Creating FAISS index...")
-        index = create_faiss_index(embeddings)
-
-        # Save the index and metadata
-        save_faiss_index(index, faiss_index_path)
-        save_metadata(ids, metadatas, contents, metadata_path)
-    else:
-        print("Using existing FAISS index and metadata.")
-
-    print("Creating FAISS retriever...")
-    faiss_retriever = FAISSRetriever(index, ids, metadatas, contents)
-
-    # Load QA pairs from database
-    questions, ground_truths = load_qa_pairs()
-
-    # Compare performance
-    for question, ground_truth in zip(questions, ground_truths):
-        print(f"\nQuestion: {question}")
-
-        # Measure time for FAISS retrieval
-        start_time = time()
-        faiss_answer, faiss_docs = multi_query(question, faiss_retriever, chat_history=[])
-        faiss_time = time() - start_time
-        print(f"FAISS Answer: {faiss_answer}")
-        print(f"FAISS Time: {faiss_time:.4f} seconds")
-
-        # Measure time for original retrieval
-        start_time = time()
-        original_answer, original_docs = multi_query(question, original_retriever, chat_history=[])
-        original_time = time() - start_time
-        print(f"Original Answer: {original_answer}")
-        print(f"Original Time: {original_time:.4f} seconds")
-
-        # RAGAS evaluation for FAISS
-        faiss_datasets = {
-            "question": [question],
-            "answer": [faiss_answer],
-            "contexts": [[doc.page_content for doc in faiss_docs]],
-            "ground_truths": [[ground_truth]]
-        }
-        faiss_evalsets = Dataset.from_dict(faiss_datasets)
-
-        faiss_result = evaluate(
-            faiss_evalsets,
-            metrics=[
-                context_precision,
-                faithfulness,
-                answer_relevancy,
-                context_recall,
-            ],
-        )
-
-        print("FAISS RAGAS Evaluation:")
-        print(faiss_result.to_pandas())
-
-        # RAGAS evaluation for Original retriever
-        original_datasets = {
-            "question": [question],
-            "answer": [original_answer],
-            "contexts": [[doc.page_content for doc in original_docs]],
-            "ground_truths": [[ground_truth]]
-        }
-        original_evalsets = Dataset.from_dict(original_datasets)
-
-        original_result = evaluate(
-            original_evalsets,
-            metrics=[
-                context_precision,
-                faithfulness,
-                answer_relevancy,
-                context_recall,
-            ],
-        )
-
-        print("Original RAGAS Evaluation:")
-        print(original_result.to_pandas())
-
-    print("\nPerformance comparison complete.")
-
-# Key points:
-
-# If both the index and metadata (ids, metadatas, contents) are found, we don't need to download embeddings or recreate the index. The FAISS index already contains the embeddings.
-# The FAISSRetriever class doesn't need the raw embeddings. It uses the FAISS index for similarity search and the metadata (ids, metadatas, contents) for returning document information.