@@ -0,0 +1,395 @@
+from langchain_community.vectorstores import Chroma
+
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import umap
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from sklearn.mixture import GaussianMixture
+
+from dotenv import load_dotenv
+load_dotenv()
+import os
+# Windows-specific setup: add the local poppler and Tesseract-OCR installs to PATH
+# and point TESSDATA_PREFIX at the Tesseract language data.
+os.environ["PATH"] += os.pathsep + "C:/Users/lzl/anaconda3/Lib/site-packages/poppler-24.02.0/Library/bin"
+os.environ["PATH"] += os.pathsep + r"C:\Program Files\Tesseract-OCR\tessdata"
+os.environ["PATH"] += os.pathsep + r"C:\Program Files\Tesseract-OCR"
+os.environ["TESSDATA_PREFIX"] = r"C:\Program Files\Tesseract-OCR\tessdata"
+
+from langchain_openai import OpenAIEmbeddings
+
+embd = OpenAIEmbeddings()
+
+from langchain_openai import ChatOpenAI
+
+model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
+
+RANDOM_SEED = 224  # Fixed seed for reproducibility
+
+### --- Code from citations referenced above (added comments and docstrings) --- ###
+
+
+def global_cluster_embeddings(
+    embeddings: np.ndarray,
+    dim: int,
+    n_neighbors: Optional[int] = None,
+    metric: str = "cosine",
+) -> np.ndarray:
+    """
+    Perform global dimensionality reduction on the embeddings using UMAP.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - dim: The target dimensionality for the reduced space.
+    - n_neighbors: Optional; the number of neighbors to consider for each point.
+      If not provided, it defaults to the square root of the number of embeddings.
+    - metric: The distance metric to use for UMAP.
+
+    Returns:
+    - A numpy array of the embeddings reduced to the specified dimensionality.
+    """
+    if n_neighbors is None:
+        n_neighbors = int((len(embeddings) - 1) ** 0.5)
+    return umap.UMAP(
+        n_neighbors=n_neighbors, n_components=dim, metric=metric
+    ).fit_transform(embeddings)
+
+
+def local_cluster_embeddings(
+    embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine"
+) -> np.ndarray:
+    """
+    Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - dim: The target dimensionality for the reduced space.
+    - num_neighbors: The number of neighbors to consider for each point.
+    - metric: The distance metric to use for UMAP.
+
+    Returns:
+    - A numpy array of the embeddings reduced to the specified dimensionality.
+    """
+    return umap.UMAP(
+        n_neighbors=num_neighbors, n_components=dim, metric=metric
+    ).fit_transform(embeddings)
+
+
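+### Illustrative usage sketch (added here, not from the cited code): the two UMAP helpers ###
+# A minimal, hedged example on synthetic data. `_demo_umap_reduction` is a hypothetical helper
+# and is never called by the pipeline; it only shows that both reducers return an
+# (n_samples, dim) array, with the "global" helper inferring n_neighbors from the dataset size.
+def _demo_umap_reduction() -> None:
+    rng = np.random.default_rng(RANDOM_SEED)
+    demo_embeddings = rng.normal(size=(200, 64))  # stand-in for real text embeddings
+    reduced_global = global_cluster_embeddings(demo_embeddings, dim=10)
+    reduced_local = local_cluster_embeddings(demo_embeddings, dim=10, num_neighbors=10)
+    print(reduced_global.shape, reduced_local.shape)  # (200, 10) (200, 10)
+
+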
+def get_optimal_clusters(
+    embeddings: np.ndarray, max_clusters: int = 50, random_state: int = RANDOM_SEED
+) -> int:
+    """
+    Determine the optimal number of clusters using the Bayesian Information Criterion (BIC) with a Gaussian Mixture Model.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - max_clusters: The maximum number of clusters to consider.
+    - random_state: Seed for reproducibility.
+
+    Returns:
+    - An integer representing the optimal number of clusters found.
+    """
+    max_clusters = min(max_clusters, len(embeddings))
+    n_clusters = np.arange(1, max_clusters)
+    bics = []
+    for n in n_clusters:
+        gm = GaussianMixture(n_components=n, random_state=random_state)
+        gm.fit(embeddings)
+        bics.append(gm.bic(embeddings))
+    return n_clusters[np.argmin(bics)]
+
+
+def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):
+    """
+    Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - threshold: The probability threshold for assigning an embedding to a cluster.
+    - random_state: Seed for reproducibility.
+
+    Returns:
+    - A tuple containing the cluster labels and the number of clusters determined.
+    """
+    n_clusters = get_optimal_clusters(embeddings)
+    gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
+    gm.fit(embeddings)
+    probs = gm.predict_proba(embeddings)
+    labels = [np.where(prob > threshold)[0] for prob in probs]
+    return labels, n_clusters
+
+
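+### Illustrative usage sketch (added here, not from the cited code): BIC selection + soft assignment ###
+# Hedged example on three synthetic Gaussian blobs; `_demo_gmm_cluster` is a hypothetical helper.
+# Because assignment is threshold-based on the posterior probabilities, a point can belong to
+# more than one cluster, which is what later lets a chunk feed into several summaries.
+def _demo_gmm_cluster() -> None:
+    rng = np.random.default_rng(RANDOM_SEED)
+    blobs = np.vstack(
+        [rng.normal(loc=center, scale=0.3, size=(50, 5)) for center in (-3.0, 0.0, 3.0)]
+    )
+    print(get_optimal_clusters(blobs))  # typically 3 for data like this
+    labels, n_clusters = GMM_cluster(blobs, threshold=0.1)
+    print(n_clusters, labels[0])  # e.g. 3 and the array of cluster ids for the first point
+
+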
+def perform_clustering(
+    embeddings: np.ndarray,
+    dim: int,
+    threshold: float,
+) -> List[np.ndarray]:
+    """
+    Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering
+    using a Gaussian Mixture Model, and finally performing local clustering within each global cluster.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - dim: The target dimensionality for UMAP reduction.
+    - threshold: The probability threshold for assigning an embedding to a cluster in GMM.
+
+    Returns:
+    - A list of numpy arrays, where each array contains the cluster IDs for each embedding.
+    """
+    if len(embeddings) <= dim + 1:
+        # Avoid clustering when there's insufficient data
+        return [np.array([0]) for _ in range(len(embeddings))]
+
+    # Global dimensionality reduction
+    reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)
+    # Global clustering
+    global_clusters, n_global_clusters = GMM_cluster(
+        reduced_embeddings_global, threshold
+    )
+
+    all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
+    total_clusters = 0
+
+    # Iterate through each global cluster to perform local clustering
+    for i in range(n_global_clusters):
+        # Extract embeddings belonging to the current global cluster
+        global_cluster_embeddings_ = embeddings[
+            np.array([i in gc for gc in global_clusters])
+        ]
+
+        if len(global_cluster_embeddings_) == 0:
+            continue
+        if len(global_cluster_embeddings_) <= dim + 1:
+            # Handle small clusters with direct assignment
+            local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
+            n_local_clusters = 1
+        else:
+            # Local dimensionality reduction and clustering
+            reduced_embeddings_local = local_cluster_embeddings(
+                global_cluster_embeddings_, dim
+            )
+            local_clusters, n_local_clusters = GMM_cluster(
+                reduced_embeddings_local, threshold
+            )
+
+        # Assign local cluster IDs, adjusting for total clusters already processed
+        for j in range(n_local_clusters):
+            local_cluster_embeddings_ = global_cluster_embeddings_[
+                np.array([j in lc for lc in local_clusters])
+            ]
+            indices = np.where(
+                (embeddings == local_cluster_embeddings_[:, None]).all(-1)
+            )[1]
+            for idx in indices:
+                all_local_clusters[idx] = np.append(
+                    all_local_clusters[idx], j + total_clusters
+                )
+
+        total_clusters += n_local_clusters
+
+    return all_local_clusters
+
+
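+### Illustrative usage sketch (added here, not from the cited code): end-to-end clustering ###
+# Hedged example on synthetic embeddings; `_demo_perform_clustering` is a hypothetical helper.
+# It only shows the output shape: one numpy array of cluster ids per input row, with ids that
+# are global across all local clusters.
+def _demo_perform_clustering() -> None:
+    rng = np.random.default_rng(RANDOM_SEED)
+    demo_embeddings = rng.normal(size=(120, 64))  # stand-in for real text embeddings
+    cluster_ids = perform_clustering(demo_embeddings, dim=10, threshold=0.1)
+    print(len(cluster_ids))  # 120, one entry per input embedding
+    print(cluster_ids[0])  # e.g. array([2.]); possibly several ids, possibly empty
+
+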
+### --- Our code below --- ###
+
+
+def embed(texts):
+    """
+    Generate embeddings for a list of text documents.
+
+    This function assumes the existence of an `embd` object with a method `embed_documents`
+    that takes a list of texts and returns their embeddings.
+
+    Parameters:
+    - texts: List[str], a list of text documents to be embedded.
+
+    Returns:
+    - numpy.ndarray: An array of embeddings for the given text documents.
+    """
+    text_embeddings = embd.embed_documents(texts)
+    text_embeddings_np = np.array(text_embeddings)
+    return text_embeddings_np
+
+
+def embed_cluster_texts(texts):
+    """
+    Embeds a list of texts and clusters them, returning a DataFrame with texts, their embeddings, and cluster labels.
+
+    This function combines embedding generation and clustering into a single step. It assumes the existence
+    of a previously defined `perform_clustering` function that performs clustering on the embeddings.
+
+    Parameters:
+    - texts: List[str], a list of text documents to be processed.
+
+    Returns:
+    - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels.
+    """
+    text_embeddings_np = embed(texts)  # Generate embeddings
+    cluster_labels = perform_clustering(
+        text_embeddings_np, 10, 0.1
+    )  # Perform clustering on the embeddings
+    df = pd.DataFrame()  # Initialize a DataFrame to store the results
+    df["text"] = texts  # Store original texts
+    df["embd"] = list(text_embeddings_np)  # Store embeddings as a list in the DataFrame
+    df["cluster"] = cluster_labels  # Store cluster labels
+    return df
+
+
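+### Illustrative usage sketch (added here, not from the original code) ###
+# Hedged example; `_demo_embed_cluster_texts` is a hypothetical helper. It requires a valid
+# OPENAI_API_KEY (loaded via dotenv above) because `embed` calls the OpenAI embedding model,
+# and enough texts that `perform_clustering` goes beyond its small-input fallback.
+def _demo_embed_cluster_texts(texts: List[str]) -> pd.DataFrame:
+    df = embed_cluster_texts(texts)
+    # Columns: 'text' (original chunk), 'embd' (embedding vector), 'cluster' (array of ids).
+    print(df.shape)
+    print(df.iloc[0]["cluster"])
+    return df
+
+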
+def fmt_txt(df: pd.DataFrame) -> str:
+    """
+    Formats the text documents in a DataFrame into a single string.
+
+    Parameters:
+    - df: DataFrame containing the 'text' column with text documents to format.
+
+    Returns:
+    - A single string where all text documents are joined by a specific delimiter.
+    """
+    unique_txt = df["text"].tolist()
+    return "--- --- \n --- --- ".join(unique_txt)
+
+
+def embed_cluster_summarize_texts(
+    texts: List[str], level: int
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Embeds, clusters, and summarizes a list of texts. This function first generates embeddings for the texts,
+    clusters them based on similarity, expands the cluster assignments for easier processing, and then summarizes
+    the content within each cluster.
+
+    Parameters:
+    - texts: A list of text documents to be processed.
+    - level: The current level in the recursive summarization hierarchy; it is recorded alongside each cluster summary.
+
+    Returns:
+    - Tuple containing two DataFrames:
+      1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments.
+      2. The second DataFrame (`df_summary`) contains summaries for each cluster, the recursion level,
+         and the cluster identifiers.
+    """
+
+    # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns
+    df_clusters = embed_cluster_texts(texts)
+
+    # Prepare to expand the DataFrame for easier manipulation of clusters
+    expanded_list = []
+
+    # Expand DataFrame entries to document-cluster pairings for straightforward processing
+    for index, row in df_clusters.iterrows():
+        for cluster in row["cluster"]:
+            expanded_list.append(
+                {"text": row["text"], "embd": row["embd"], "cluster": cluster}
+            )
+
+    # Create a new DataFrame from the expanded list
+    expanded_df = pd.DataFrame(expanded_list)
+
+    # Retrieve unique cluster identifiers for processing
+    all_clusters = expanded_df["cluster"].unique()
+
+    print(f"--Generated {len(all_clusters)} clusters--")
+
+    # Summarization
+    # NOTE: this prompt assumes the clustered documents are LangChain Expression Language docs.
+    template = """Here is a sub-set of the LangChain Expression Language docs.
+
+    LangChain Expression Language provides a way to compose chains in LangChain.
+
+    Give a detailed summary of the documentation provided.
+
+    Documentation:
+    {context}
+    """
+    prompt = ChatPromptTemplate.from_template(template)
+    chain = prompt | model | StrOutputParser()
+
+    # Format text within each cluster for summarization
+    summaries = []
+    for i in all_clusters:
+        df_cluster = expanded_df[expanded_df["cluster"] == i]
+        formatted_txt = fmt_txt(df_cluster)
+        summaries.append(chain.invoke({"context": formatted_txt}))
+
+    # Create a DataFrame to store summaries with their corresponding cluster and level
+    df_summary = pd.DataFrame(
+        {
+            "summaries": summaries,
+            "level": [level] * len(summaries),
+            "cluster": list(all_clusters),
+        }
+    )
+
+    return df_clusters, df_summary
+
+
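+### Illustrative sketch (added here, not from the original code): expanding soft assignments ###
+# Hedged, self-contained example with made-up rows; `_demo_cluster_expansion` is a hypothetical
+# helper. A text assigned to two clusters becomes two document-cluster rows, so it contributes
+# to both cluster summaries.
+def _demo_cluster_expansion() -> pd.DataFrame:
+    df_clusters = pd.DataFrame(
+        {
+            "text": ["chunk A", "chunk B"],
+            "embd": [np.zeros(3), np.ones(3)],
+            "cluster": [np.array([0.0]), np.array([0.0, 1.0])],
+        }
+    )
+    expanded_rows = [
+        {"text": row["text"], "embd": row["embd"], "cluster": cluster}
+        for _, row in df_clusters.iterrows()
+        for cluster in row["cluster"]
+    ]
+    expanded_df = pd.DataFrame(expanded_rows)
+    print(expanded_df[["text", "cluster"]])  # "chunk B" appears under both cluster 0.0 and 1.0
+    return expanded_df
+
+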
+def recursive_embed_cluster_summarize(
+    texts: List[str], level: int = 1, n_levels: int = 3
+) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
+    """
+    Recursively embeds, clusters, and summarizes texts up to a specified level or until
+    the number of unique clusters becomes 1, storing the results at each level.
+
+    Parameters:
+    - texts: List[str], texts to be processed.
+    - level: int, current recursion level (starts at 1).
+    - n_levels: int, maximum depth of recursion.
+
+    Returns:
+    - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion
+      levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level.
+    """
+    results = {}  # Dictionary to store results at each level
+
+    # Perform embedding, clustering, and summarization for the current level
+    df_clusters, df_summary = embed_cluster_summarize_texts(texts, level)
+
+    # Store the results of the current level
+    results[level] = (df_clusters, df_summary)
+
+    # Determine if further recursion is possible and meaningful
+    unique_clusters = df_summary["cluster"].nunique()
+    if level < n_levels and unique_clusters > 1:
+        # Use summaries as the input texts for the next level of recursion
+        new_texts = df_summary["summaries"].tolist()
+        next_level_results = recursive_embed_cluster_summarize(
+            new_texts, level + 1, n_levels
+        )
+
+        # Merge the results from the next level into the current results dictionary
+        results.update(next_level_results)
+
+    return results
+
+
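+### Illustrative usage sketch (added here, not from the original code): inspecting the summary tree ###
+# Hedged example; `_demo_recursive_summaries` is a hypothetical helper. It calls the OpenAI
+# embedding and chat models, so it assumes a valid OPENAI_API_KEY and will incur API cost.
+def _demo_recursive_summaries(leaf_texts: List[str]) -> None:
+    results = recursive_embed_cluster_summarize(leaf_texts, level=1, n_levels=3)
+    for level, (df_clusters, df_summary) in sorted(results.items()):
+        print(f"level {level}: {len(df_clusters)} texts -> {len(df_summary)} summaries")
+
+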
+def create_retriever(path='Documents', extension="txt"):
+    """
+    Build a retriever over the documents in `path` plus their multi-level cluster summaries.
+
+    The documents are loaded as leaf texts, then recursively embedded, clustered, and
+    summarized (up to 3 levels). The leaf texts together with every level of summaries are
+    indexed in a Chroma vector store, so the returned retriever searches over both the
+    original texts and their summaries.
+    """
+    def preprocessing(path, extension):
+        from langchain_community.document_loaders import DirectoryLoader
+        loader = DirectoryLoader(path, glob=f'**/*.{extension}', show_progress=True)
+        docs = loader.load()
+
+        docs_texts = [d.page_content for d in docs]
+
+        return docs_texts
+
+    # Build tree
+    leaf_texts = preprocessing(path, extension)
+    results = recursive_embed_cluster_summarize(leaf_texts, level=1, n_levels=3)
+
+    # Initialize all_texts with leaf_texts
+    all_texts = leaf_texts.copy()
+
+    # Iterate through the results to extract summaries from each level and add them to all_texts
+    for level in sorted(results.keys()):
+        # Extract summaries from the current level's DataFrame
+        summaries = results[level][1]["summaries"].tolist()
+        # Extend all_texts with the summaries from the current level
+        all_texts.extend(summaries)
+
+    # Now, use all_texts to build the vectorstore with Chroma
+    embd = OpenAIEmbeddings()
+    vectorstore = Chroma.from_texts(texts=all_texts, embedding=embd)
+    retriever = vectorstore.as_retriever()
+
+    return retriever
+
+
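+### Illustrative usage sketch (added here, not from the original code): a minimal RAG chain ###
+# Hedged example: `answer_question` is a hypothetical helper, 'Documents' is simply the default
+# folder assumed by `create_retriever`, and the prompt wording is ours. It reuses the `model`
+# defined above and assumes a valid OPENAI_API_KEY.
+def answer_question(question: str, retriever=None) -> str:
+    if retriever is None:
+        retriever = create_retriever(path="Documents", extension="txt")
+    docs = retriever.invoke(question)  # similarity search over leaf chunks and all summary levels
+    context = "\n\n".join(d.page_content for d in docs)
+    qa_prompt = ChatPromptTemplate.from_template(
+        "Answer the question using only the context below.\n\nContext:\n{context}\n\nQuestion: {question}"
+    )
+    qa_chain = qa_prompt | model | StrOutputParser()
+    return qa_chain.invoke({"context": context, "question": question})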