conrad 8 months ago
commit
ae38039b0c
7 changed files with 1483 additions and 0 deletions
  1. Indexing_RAPTOR.py        +395 -0
  2. Indexing_Split.py         +174 -0
  3. Indexing_Unstructured.py  +188 -0
  4. RAG_app.py                +148 -0
  5. RAG_strategy.py           +357 -0
  6. add_vectordb.py            +79 -0
  7. requirements.txt          +142 -0

+ 395 - 0
Indexing_RAPTOR.py

@@ -0,0 +1,395 @@
+from langchain_community.vectorstores import Chroma
+
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import umap
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from sklearn.mixture import GaussianMixture
+
+from dotenv import load_dotenv
+load_dotenv()
+import os
+os.environ["PATH"] += os.pathsep + "C:/Users/lzl/anaconda3/Lib/site-packages/poppler-24.02.0/Library/bin"
+os.environ["PATH"] += os.pathsep + r"C:\Program Files\Tesseract-OCR\tessdata"
+os.environ["PATH"] += os.pathsep + r"C:\Program Files\Tesseract-OCR"
+os.environ["TESSDATA_PREFIX"] = r"C:\Program Files\Tesseract-OCR\tessdata"
+
+from langchain_openai import OpenAIEmbeddings
+
+embd = OpenAIEmbeddings()
+
+from langchain_openai import ChatOpenAI
+
+model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
+
+RANDOM_SEED = 224  # Fixed seed for reproducibility
+
+### --- Clustering code adapted from the LangChain RAPTOR cookbook (comments and docstrings added) --- ###
+
+
+def global_cluster_embeddings(
+    embeddings: np.ndarray,
+    dim: int,
+    n_neighbors: Optional[int] = None,
+    metric: str = "cosine",
+) -> np.ndarray:
+    """
+    Perform global dimensionality reduction on the embeddings using UMAP.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - dim: The target dimensionality for the reduced space.
+    - n_neighbors: Optional; the number of neighbors to consider for each point.
+                   If not provided, it defaults to the square root of the number of embeddings.
+    - metric: The distance metric to use for UMAP.
+
+    Returns:
+    - A numpy array of the embeddings reduced to the specified dimensionality.
+    """
+    if n_neighbors is None:
+        n_neighbors = int((len(embeddings) - 1) ** 0.5)
+    return umap.UMAP(
+        n_neighbors=n_neighbors, n_components=dim, metric=metric
+    ).fit_transform(embeddings)
+
+
+def local_cluster_embeddings(
+    embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine"
+) -> np.ndarray:
+    """
+    Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - dim: The target dimensionality for the reduced space.
+    - num_neighbors: The number of neighbors to consider for each point.
+    - metric: The distance metric to use for UMAP.
+
+    Returns:
+    - A numpy array of the embeddings reduced to the specified dimensionality.
+    """
+    return umap.UMAP(
+        n_neighbors=num_neighbors, n_components=dim, metric=metric
+    ).fit_transform(embeddings)
+
+
+def get_optimal_clusters(
+    embeddings: np.ndarray, max_clusters: int = 50, random_state: int = RANDOM_SEED
+) -> int:
+    """
+    Determine the optimal number of clusters using the Bayesian Information Criterion (BIC) with a Gaussian Mixture Model.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - max_clusters: The maximum number of clusters to consider.
+    - random_state: Seed for reproducibility.
+
+    Returns:
+    - An integer representing the optimal number of clusters found.
+    """
+    max_clusters = min(max_clusters, len(embeddings))
+    n_clusters = np.arange(1, max_clusters)
+    bics = []
+    for n in n_clusters:
+        gm = GaussianMixture(n_components=n, random_state=random_state)
+        gm.fit(embeddings)
+        bics.append(gm.bic(embeddings))
+    return n_clusters[np.argmin(bics)]
+
+
+def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):
+    """
+    Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - threshold: The probability threshold for assigning an embedding to a cluster.
+    - random_state: Seed for reproducibility.
+
+    Returns:
+    - A tuple containing the cluster labels and the number of clusters determined.
+    """
+    n_clusters = get_optimal_clusters(embeddings)
+    gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
+    gm.fit(embeddings)
+    probs = gm.predict_proba(embeddings)
+    labels = [np.where(prob > threshold)[0] for prob in probs]
+    return labels, n_clusters
+
+
+def perform_clustering(
+    embeddings: np.ndarray,
+    dim: int,
+    threshold: float,
+) -> List[np.ndarray]:
+    """
+    Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering
+    using a Gaussian Mixture Model, and finally performing local clustering within each global cluster.
+
+    Parameters:
+    - embeddings: The input embeddings as a numpy array.
+    - dim: The target dimensionality for UMAP reduction.
+    - threshold: The probability threshold for assigning an embedding to a cluster in GMM.
+
+    Returns:
+    - A list of numpy arrays, where each array contains the cluster IDs for each embedding.
+    """
+    if len(embeddings) <= dim + 1:
+        # Avoid clustering when there's insufficient data
+        return [np.array([0]) for _ in range(len(embeddings))]
+
+    # Global dimensionality reduction
+    reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)
+    # Global clustering
+    global_clusters, n_global_clusters = GMM_cluster(
+        reduced_embeddings_global, threshold
+    )
+
+    all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
+    total_clusters = 0
+
+    # Iterate through each global cluster to perform local clustering
+    for i in range(n_global_clusters):
+        # Extract embeddings belonging to the current global cluster
+        global_cluster_embeddings_ = embeddings[
+            np.array([i in gc for gc in global_clusters])
+        ]
+
+        if len(global_cluster_embeddings_) == 0:
+            continue
+        if len(global_cluster_embeddings_) <= dim + 1:
+            # Handle small clusters with direct assignment
+            local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
+            n_local_clusters = 1
+        else:
+            # Local dimensionality reduction and clustering
+            reduced_embeddings_local = local_cluster_embeddings(
+                global_cluster_embeddings_, dim
+            )
+            local_clusters, n_local_clusters = GMM_cluster(
+                reduced_embeddings_local, threshold
+            )
+
+        # Assign local cluster IDs, adjusting for total clusters already processed
+        for j in range(n_local_clusters):
+            local_cluster_embeddings_ = global_cluster_embeddings_[
+                np.array([j in lc for lc in local_clusters])
+            ]
+            indices = np.where(
+                (embeddings == local_cluster_embeddings_[:, None]).all(-1)
+            )[1]
+            for idx in indices:
+                all_local_clusters[idx] = np.append(
+                    all_local_clusters[idx], j + total_clusters
+                )
+
+        total_clusters += n_local_clusters
+
+    return all_local_clusters
+
+
+### --- Our code below --- ###
+
+
+def embed(texts):
+    """
+    Generate embeddings for a list of text documents.
+
+    This function assumes the existence of an `embd` object with a method `embed_documents`
+    that takes a list of texts and returns their embeddings.
+
+    Parameters:
+    - texts: List[str], a list of text documents to be embedded.
+
+    Returns:
+    - numpy.ndarray: An array of embeddings for the given text documents.
+    """
+    text_embeddings = embd.embed_documents(texts)
+    text_embeddings_np = np.array(text_embeddings)
+    return text_embeddings_np
+
+
+def embed_cluster_texts(texts):
+    """
+    Embeds a list of texts and clusters them, returning a DataFrame with texts, their embeddings, and cluster labels.
+
+    This function combines embedding generation and clustering into a single step. It assumes the existence
+    of a previously defined `perform_clustering` function that performs clustering on the embeddings.
+
+    Parameters:
+    - texts: List[str], a list of text documents to be processed.
+
+    Returns:
+    - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels.
+    """
+    text_embeddings_np = embed(texts)  # Generate embeddings
+    cluster_labels = perform_clustering(
+        text_embeddings_np, 10, 0.1
+    )  # Perform clustering on the embeddings
+    df = pd.DataFrame()  # Initialize a DataFrame to store the results
+    df["text"] = texts  # Store original texts
+    df["embd"] = list(text_embeddings_np)  # Store embeddings as a list in the DataFrame
+    df["cluster"] = cluster_labels  # Store cluster labels
+    return df
+
+
+def fmt_txt(df: pd.DataFrame) -> str:
+    """
+    Formats the text documents in a DataFrame into a single string.
+
+    Parameters:
+    - df: DataFrame containing the 'text' column with text documents to format.
+
+    Returns:
+    - A single string where all text documents are joined by a specific delimiter.
+    """
+    unique_txt = df["text"].tolist()
+    return "--- --- \n --- --- ".join(unique_txt)
+
+
+def embed_cluster_summarize_texts(
+    texts: List[str], level: int
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Embeds, clusters, and summarizes a list of texts. This function first generates embeddings for the texts,
+    clusters them based on similarity, expands the cluster assignments for easier processing, and then summarizes
+    the content within each cluster.
+
+    Parameters:
+    - texts: A list of text documents to be processed.
+    - level: The current recursion depth, recorded alongside each cluster summary.
+
+    Returns:
+    - Tuple containing two DataFrames:
+      1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments.
+      2. The second DataFrame (`df_summary`) contains summaries for each cluster, the specified level of detail,
+         and the cluster identifiers.
+    """
+
+    # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns
+    df_clusters = embed_cluster_texts(texts)
+
+    # Prepare to expand the DataFrame for easier manipulation of clusters
+    expanded_list = []
+
+    # Expand DataFrame entries to document-cluster pairings for straightforward processing
+    for index, row in df_clusters.iterrows():
+        for cluster in row["cluster"]:
+            expanded_list.append(
+                {"text": row["text"], "embd": row["embd"], "cluster": cluster}
+            )
+
+    # Create a new DataFrame from the expanded list
+    expanded_df = pd.DataFrame(expanded_list)
+
+    # Retrieve unique cluster identifiers for processing
+    all_clusters = expanded_df["cluster"].unique()
+
+    print(f"--Generated {len(all_clusters)} clusters--")
+
+    # Summarization
+    template = """Here is a sub-set of LangChain Expression Langauge doc. 
+    
+    LangChain Expression Langauge provides a way to compose chain in LangChain.
+    
+    Give a detailed summary of the documentation provided.
+    
+    Documentation:
+    {context}
+    """
+    prompt = ChatPromptTemplate.from_template(template)
+    chain = prompt | model | StrOutputParser()
+
+    # Format text within each cluster for summarization
+    summaries = []
+    for i in all_clusters:
+        df_cluster = expanded_df[expanded_df["cluster"] == i]
+        formatted_txt = fmt_txt(df_cluster)
+        summaries.append(chain.invoke({"context": formatted_txt}))
+
+    # Create a DataFrame to store summaries with their corresponding cluster and level
+    df_summary = pd.DataFrame(
+        {
+            "summaries": summaries,
+            "level": [level] * len(summaries),
+            "cluster": list(all_clusters),
+        }
+    )
+
+    return df_clusters, df_summary
+
+
+def recursive_embed_cluster_summarize(
+    texts: List[str], level: int = 1, n_levels: int = 3
+) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
+    """
+    Recursively embeds, clusters, and summarizes texts up to a specified level or until
+    the number of unique clusters becomes 1, storing the results at each level.
+
+    Parameters:
+    - texts: List[str], texts to be processed.
+    - level: int, current recursion level (starts at 1).
+    - n_levels: int, maximum depth of recursion.
+
+    Returns:
+    - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion
+      levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level.
+    """
+    results = {}  # Dictionary to store results at each level
+
+    # Perform embedding, clustering, and summarization for the current level
+    df_clusters, df_summary = embed_cluster_summarize_texts(texts, level)
+
+    # Store the results of the current level
+    results[level] = (df_clusters, df_summary)
+
+    # Determine if further recursion is possible and meaningful
+    unique_clusters = df_summary["cluster"].nunique()
+    if level < n_levels and unique_clusters > 1:
+        # Use summaries as the input texts for the next level of recursion
+        new_texts = df_summary["summaries"].tolist()
+        next_level_results = recursive_embed_cluster_summarize(
+            new_texts, level + 1, n_levels
+        )
+
+        # Merge the results from the next level into the current results dictionary
+        results.update(next_level_results)
+
+    return results
+
+def create_retriever(path='Documents', extension="txt"):
+    def preprocessing(path, extension):
+        from langchain_community.document_loaders import DirectoryLoader
+        loader = DirectoryLoader(path, glob=f'**/*.{extension}', show_progress=True)
+        docs = loader.load()
+
+        docs_texts = [d.page_content for d in docs]
+
+        return docs_texts
+
+    # Build tree
+    leaf_texts = preprocessing(path, extension)
+    results = recursive_embed_cluster_summarize(leaf_texts, level=1, n_levels=3)
+
+    # Initialize all_texts with leaf_texts
+    all_texts = leaf_texts.copy()
+
+    # Iterate through the results to extract summaries from each level and add them to all_texts
+    for level in sorted(results.keys()):
+        # Extract summaries from the current level's DataFrame
+        summaries = results[level][1]["summaries"].tolist()
+        # Extend all_texts with the summaries from the current level
+        all_texts.extend(summaries)
+
+    # Now, use all_texts to build the vectorstore with Chroma
+    embd = OpenAIEmbeddings()
+    vectorstore = Chroma.from_texts(texts=all_texts, embedding=embd)
+    retriever = vectorstore.as_retriever()
+
+    return retriever
+
+
+
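
A minimal usage sketch for the RAPTOR retriever defined above, assuming a Documents/ folder of .txt files and an OPENAI_API_KEY in the environment (the query string is illustrative):

    from Indexing_RAPTOR import create_retriever

    # Builds the RAPTOR tree (leaf chunks plus recursive cluster summaries)
    # and indexes all of them together in a Chroma vectorstore
    retriever = create_retriever(path="Documents", extension="txt")

    # Retrieval then searches leaf texts and summaries side by side
    docs = retriever.get_relevant_documents("How do I compose chains with LCEL?")
    for d in docs:
        print(d.page_content[:200])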

+ 174 - 0
Indexing_Split.py

@@ -0,0 +1,174 @@
+from dotenv import load_dotenv
+load_dotenv()
+
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.embeddings import OllamaEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain_community.document_loaders import TextLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import Docx2txtLoader
+from langchain_community.document_loaders import WebBaseLoader
+from PyPDF2 import PdfReader
+from langchain.docstore.document import Document
+from json import loads
+import pandas as pd
+from sqlalchemy import create_engine
+
+from langchain.prompts import ChatPromptTemplate
+from langchain_openai import ChatOpenAI
+from langchain_core.output_parsers import StrOutputParser
+from langchain import hub
+from tqdm import tqdm
+
+# __import__('pysqlite3')
+# import sys
+# sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
+
+from datasets import Dataset 
+from ragas import evaluate
+from ragas.metrics import (
+    answer_relevancy,
+    faithfulness,
+    context_recall,
+    context_precision,
+)
+import pandas as pd
+import os
+import glob
+
+from dotenv import load_dotenv
+import os
+load_dotenv()
+URI = os.getenv("SUPABASE_URI")
+
+from RAG_strategy import multi_query, naive_rag, individually_generate_final_answer
+
+def create_retriever(path='Documents', extension="pdf"):
+    txt_files = glob.glob(os.path.join(path, f"*.{extension}"))
+    
+    doc = []
+    for file_path in txt_files:
+        doc.append(file_path)
+    
+    def load_and_split(file_list):
+        chunks = []
+        for file in file_list:
+            if file.endswith(".txt"):
+                loader = TextLoader(file, encoding='utf-8')
+            elif file.endswith(".pdf"):
+                loader = PyPDFLoader(file)
+            elif file.endswith(".docx"):
+                loader = Docx2txtLoader(file)
+            else:
+                raise ValueError(f"Unsupported file extension: {file}")
+            
+
+            docs = loader.load()
+
+            # Split
+            if file.endswith(".docx"):
+                # separators = ["\n\n\u25cb", "\n\n\u25cf"]
+                # text_splitter = RecursiveCharacterTextSplitter(separators=separators, chunk_size=500, chunk_overlap=0)
+                separators = [r'\u25cb\s*第.*?條', r'\u25cf\s*第.*?條']  # regex separators: "○/● 第…條" clause headers
+                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=300, chunk_overlap=0)
+            else:
+                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)
+            
+            splits = text_splitter.split_documents(docs)
+
+            chunks.extend(splits)
+
+        return chunks
+
+    # Index
+    docs = load_and_split(doc)
+    qa_history_doc = gen_doc_from_history()
+    docs.extend(qa_history_doc)
+    # web_doc = web_data(os.path.join(path, 'web_url.csv'))
+    # docs.extend(web_doc)
+
+    # vectorstore
+    # vectorstore = Chroma.from_texts(texts=docs, embedding=OpenAIEmbeddings())
+    vectorstore = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())
+    # vectorstore = Chroma.from_documents(documents=docs, embedding=OllamaEmbeddings(model="llama3", num_gpu=1))
+    vectorstore.persist()
+
+    retriever = vectorstore.as_retriever()
+
+    return retriever
+
+def web_data(url_file):
+    df = pd.read_csv(url_file, header = 0)
+    url_list = df['url'].to_list()
+
+    loader = WebBaseLoader(url_list)
+    docs = loader.load()
+
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                chunk_size=1000, chunk_overlap=0)
+    splits = text_splitter.split_documents(docs)
+    
+    return splits
+
+def gen_doc_from_history():
+    engine = create_engine(URI, echo=True)
+
+    df = pd.read_sql_table("systex_records", engine.connect())  
+    df.fillna('', inplace=True)
+    result = df.to_json(orient='index', force_ascii=False)
+    result = loads(result)
+
+
+    df = pd.DataFrame(result).T
+    qa_history_doc = []
+    for i in range(len(df)):
+        if df.iloc[i]['used_as_document'] is not True: continue
+        Question = df.iloc[i]['Question']
+        Answer = df.iloc[i]['Answer']
+        context = f'Question: {Question}\nAnswer: {Answer}'
+        
+        doc =  Document(page_content=context, metadata={"source": "History"})
+        qa_history_doc.append(doc)
+        # print(doc)
+
+    return qa_history_doc
+
+def gen_doc_from_database():
+    engine = create_engine(URI, echo=True)
+
+    df = pd.read_sql_table("QA_database", engine.connect())  
+    # df.fillna('', inplace=True)
+    result = df[['Question', 'Answer']].to_json(orient='index', force_ascii=False)
+    result = loads(result)
+
+
+    df = pd.DataFrame(result).T
+    qa_doc = []
+    for i in range(len(df)):
+        # if df.iloc[i]['used_as_document'] is not True: continue
+        Question = df.iloc[i]['Question']
+        Answer = df.iloc[i]['Answer']
+        context = f'Question: {Question}\nAnswer: {Answer}'
+        
+        doc = Document(page_content=context, metadata={"source": "History"})
+        qa_doc.append(doc)
+        # print(doc)
+
+    return qa_doc
+
+if __name__ == "__main__":
+
+    retriever = create_retriever(path='./Documents', extension="pdf")
+    question = 'CEV系統可以支援盤查到什麼程度'  # "To what extent can the CEV system support a GHG inventory?"
+    final_answer, reference_docs = multi_query(question, retriever, [])
+    print(question, final_answer)
+    question = 'CEV系統依循標準為何'  # "Which standards does the CEV system follow?"
+    final_answer, reference_docs = multi_query(question, retriever, [])
+    print(question, final_answer)
+
+
+
+
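
A small sketch of the regex-based .docx splitting used in load_and_split above: with is_separator_regex=True, chunks break at "○ 第…條" / "● 第…條" clause headers rather than at generic newlines (the sample string is made up, and chunk_size is reduced so the short sample actually splits):

    from langchain_text_splitters import RecursiveCharacterTextSplitter

    separators = [r'\u25cb\s*第.*?條', r'\u25cf\s*第.*?條']  # ○ / ● clause markers
    splitter = RecursiveCharacterTextSplitter(
        is_separator_regex=True, separators=separators, chunk_size=20, chunk_overlap=0
    )

    sample = "\u25cb 第一條 本辦法依據……\n\u25cf 第二條 適用範圍……"  # hypothetical clause text
    for chunk in splitter.split_text(sample):
        print(repr(chunk))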

+ 188 - 0
Indexing_Unstructured.py

@@ -0,0 +1,188 @@
+import glob
+from typing import Any
+
+from pydantic import BaseModel
+from tqdm import tqdm
+from unstructured.partition.pdf import partition_pdf
+from dotenv import load_dotenv
+import os
+
+
+load_dotenv()
+os.environ["PATH"] += os.pathsep + "C:/Users/lzl/anaconda3/Lib/site-packages/poppler-24.02.0/Library/bin"
+os.environ["PATH"] += os.pathsep + r"C:\Program Files\Tesseract-OCR\tessdata"
+os.environ["PATH"] += os.pathsep + r"C:\Program Files\Tesseract-OCR"
+os.environ["TESSDATA_PREFIX"] = r"C:\Program Files\Tesseract-OCR\tessdata"
+
+def read_document(pdf):
+
+    # read unstructured documents
+    # Get elements
+    raw_pdf_elements = partition_pdf(
+        filename=pdf,
+        # Unstructured first finds embedded image blocks
+        extract_images_in_pdf=False,
+        # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
+        # Titles are any sub-section of the document
+        infer_table_structure=True,
+        # Post processing to aggregate text once we have the title
+        chunking_strategy="by_title",
+        # Chunking params to aggregate text blocks
+        # Hard cap of 1,000 characters per chunk; start a new chunk after 980 characters,
+        # and merge text blocks shorter than 500 characters into the previous chunk
+        max_characters=1000,
+        new_after_n_chars=980,
+        combine_text_under_n_chars=500,
+        # image_output_dir_path=path,
+    )
+
+    return raw_pdf_elements
+
+
+
+class Element(BaseModel):
+    type: str
+    text: Any
+
+def extract_different_type_elements(raw_pdf_elements):
+    # Categorize by type
+    categorized_elements = []
+    for element in raw_pdf_elements:
+        if "unstructured.documents.elements.Table" in str(type(element)):
+            categorized_elements.append(Element(type="table", text=str(element)))
+        elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
+            categorized_elements.append(Element(type="text", text=str(element)))
+
+    # Tables
+    table_elements = [e for e in categorized_elements if e.type == "table"]
+    # print(len(table_elements))
+
+    # Text
+    text_elements = [e for e in categorized_elements if e.type == "text"]
+    # print(len(text_elements))
+
+    return table_elements, text_elements
+
+
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import ChatOpenAI
+
+def summarize(table_elements, text_elements):
+    # Prompt
+    prompt_text = """You are an assistant tasked with summarizing tables and text. \ 
+    Give a concise summary of the table or text. Table or text chunk: {element} """
+    prompt = ChatPromptTemplate.from_template(prompt_text)
+
+    # Summary chain
+    model = ChatOpenAI(temperature=0, model="gpt-4")
+    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
+
+    # Apply to tables
+    tables = [i.text for i in table_elements]
+    table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
+
+    # Apply to texts
+    texts = [i.text for i in text_elements]
+    text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
+
+    return (tables, table_summaries), (texts, text_summaries)
+
+import uuid
+
+from langchain.retrievers.multi_vector import MultiVectorRetriever
+from langchain.storage import InMemoryStore
+from langchain_community.vectorstores import Chroma
+from langchain_core.documents import Document
+from langchain_openai import OpenAIEmbeddings
+
+def create_retriever(path='../Documents'):
+    txt_files = glob.glob(os.path.join(path, f"*.pdf"))
+    
+    pdfs = []
+    for file_path in txt_files:
+        pdfs.append(file_path)
+
+    def add_elements(docs, docs_summaries):
+        doc_ids = [str(uuid.uuid4()) for _ in docs]
+        summary_texts = [
+            Document(page_content=s, metadata={id_key: doc_ids[i]})
+            for i, s in enumerate(docs_summaries)
+        ]
+        try:
+            retriever.vectorstore.add_documents(summary_texts)
+            retriever.docstore.mset(list(zip(doc_ids, docs)))
+        except ValueError as e:
+            pass
+
+    tqdm_total = len(pdfs) * 3 + 1
+    with tqdm(total=tqdm_total, desc="Creating retriever...") as retriever_progress:
+        # The vectorstore to use to index the child chunks
+        vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())
+
+        # The storage layer for the parent documents
+        store = InMemoryStore()
+        id_key = "doc_id"
+
+        # The retriever (empty to start)
+        retriever = MultiVectorRetriever(
+            vectorstore=vectorstore,
+            docstore=store,
+            id_key=id_key,
+        )
+        retriever_progress.update(1)
+        # ----------------------------------------------------------------
+        
+        for pdf in pdfs:
+            # preprocessing
+            raw_pdf_elements = read_document(pdf)
+            retriever_progress.update(1)
+
+            table_elements, text_elements = extract_different_type_elements(raw_pdf_elements)
+            retriever_progress.update(1)
+
+            (tables, table_summaries), (texts, text_summaries) = summarize(table_elements, text_elements)
+            retriever_progress.update(1)
+
+            add_elements(texts, text_summaries)
+            add_elements(tables, table_summaries)
+
+    return retriever
+
+
+if __name__ == "__main__":
+    from langchain_core.runnables import RunnablePassthrough
+    from RAG_strategy import individually_generate_final_answer
+
+    # # Prompt template
+    # template = """Answer the question based only on the following context, which can include text and tables:
+    # {context}
+    # Question: {question}
+    # """
+    # prompt = ChatPromptTemplate.from_template(template)
+
+    # LLM
+    # model = ChatOpenAI(temperature=0, model="gpt-4")
+
+    # pdfs = ["國際鏈結之企業碳足跡指引.pdf"]
+    retriever = create_retriever()
+
+
+    # # RAG pipeline
+    # chain = (
+    #     {"context": retriever, "question": RunnablePassthrough()}
+    #     | prompt
+    #     | model
+    #     | StrOutputParser()
+    # )
+
+    question = ""
+    while question != "exit":
+        question = input("Question: ")
+        if question == "exit":
+            break
+        # answer = chain.invoke(question)
+        final_answer, reference_docs = individually_generate_final_answer(question, retriever)
+        print(final_answer)
+        print("\n")
+
+    
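
A minimal sketch of how the retriever built above is meant to be queried: similarity search runs over the child summary embeddings in Chroma, while the docstore hands back the parent table/text elements those summaries were generated from (assumes PDFs under ../Documents, working Poppler/Tesseract installs, and an OPENAI_API_KEY; the question is illustrative):

    from Indexing_Unstructured import create_retriever

    retriever = create_retriever(path="../Documents")

    # The parents stored via docstore.mset() are the raw table/text strings,
    # so the retriever returns strings rather than Document objects here
    parents = retriever.get_relevant_documents("企業碳足跡的計算邊界為何?")
    for p in parents:
        print(str(p)[:200])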

+ 148 - 0
RAG_app.py

@@ -0,0 +1,148 @@
+from fastapi import FastAPI, Request, HTTPException, status
+# from fastapi.templating import Jinja2Templates
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from fastapi import Depends
+from contextlib import asynccontextmanager
+from pydantic import BaseModel
+import uvicorn
+
+from typing import List, Optional
+import sqlparse
+from sqlalchemy import create_engine
+import pandas as pd
+#from retrying import retry
+import datetime
+import json
+from json import loads
+import pandas as pd
+import time
+from langchain.callbacks import get_openai_callback
+
+from langchain_community.vectorstores import Chroma
+from langchain_openai import OpenAIEmbeddings
+from RAG_strategy import multi_query, naive_rag, individually_generate_final_answer, naive_rag_for_qapairs
+from Indexing_Split import create_retriever as split_retriever
+# from Indexing_RAPTOR import create_retriever as raptor_retriever
+# from Indexing_Unstructured import create_retriever as unstructured_retriever
+from Indexing_Split import gen_doc_from_database, gen_doc_from_history
+
+from dotenv import load_dotenv
+import os
+load_dotenv()
+URI = os.getenv("SUPABASE_URI")
+
+global_retriever = None
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global global_retriever
+    start = time.time()
+    global_retriever = split_retriever(path='./Documents', extension="docx")
+    # global_retriever = raptor_retriever(path='../Documents', extension="txt")
+    # global_retriever = unstructured_retriever(path='../Documents')
+    print(time.time() - start)
+    yield
+
+def get_retriever():
+    return global_retriever
+
+app = FastAPI(lifespan=lifespan)
+
+# templates = Jinja2Templates(directory="temp")
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+#@app.get("/answer")
+def decomposition_individually_answer(question: Optional[str] = '', retriever=Depends(get_retriever)):
+    start = time.time()
+    with get_openai_callback() as cb:
+        final_answer, reference_docs = individually_generate_final_answer(question, retriever)
+    processing_time = time.time() - start
+    print(processing_time)
+    save_history(question, final_answer, reference_docs, cb, processing_time)
+    
+
+    return {"Answer": final_answer, "Reference": reference_docs}
+
+
+CHAT_HISTORY = []
+@app.get("/answer2")
+def multi_query_answer(question: Optional[str] = '', retriever=Depends(get_retriever)):
+    start = time.time()
+
+    with get_openai_callback() as cb:
+        # qa_doc = gen_doc_from_database()
+        # qa_history_doc = gen_doc_from_history()
+        # qa_doc.extend(qa_history_doc)
+        # vectorstore = Chroma.from_documents(documents=qa_doc, embedding=OpenAIEmbeddings(), collection_name="qa_pairs")
+        # retriever_qa = vectorstore.as_retriever(search_kwargs={"k": 3})
+        # final_answer, reference_docs = naive_rag_for_qapairs(question, retriever_qa)
+        final_answer = 'False'  # QA-pair lookup above is disabled, so always fall through to multi_query
+        if final_answer == 'False':
+            final_answer, reference_docs = multi_query(question, retriever, CHAT_HISTORY)
+
+    CHAT_HISTORY.append((question, final_answer))
+    # print(CHAT_HISTORY)
+    
+    # with get_openai_callback() as cb:
+    #     final_answer, reference_docs = multi_query(question, retriever)
+    processing_time = time.time() - start
+    print(processing_time)
+    save_history(question, final_answer, reference_docs, cb, processing_time)
+
+    return {"Answer": final_answer}
+
+#@app.get("/answer3")
+def naive_answer(question: Optional[str] = '', retriever=Depends(get_retriever)):
+    start = time.time()
+    with get_openai_callback() as cb:
+        final_answer, reference_docs = naive_rag(question, retriever)
+    processing_time = time.time() - start
+    print(processing_time)
+    save_history(question, final_answer, reference_docs, cb, processing_time)
+
+    return {"Answer": final_answer, "Reference": reference_docs}
+
+def save_history(question, answer, reference, cb, processing_time):
+    # reference = [doc.dict() for doc in reference]
+    record = {
+        'Question': [question],
+        'Answer': [answer],
+        'Total_Tokens': [cb.total_tokens],
+        'Total_Cost': [cb.total_cost],
+        'Processing_time': [processing_time],
+        'Contexts': [str(reference)]
+    }
+    df = pd.DataFrame(record)
+    engine = create_engine(URI)
+    df.to_sql(name='systex_records', con=engine, index=False, if_exists='append')
+
+class history_output(BaseModel):
+    Question: str
+    Answer: str
+    Contexts: str
+    Total_Tokens: int
+    Total_Cost: float
+    Processing_time: float
+    Time: datetime.datetime
+    
+@app.get('/history', response_model=List[history_output])
+async def get_history():
+    engine = create_engine(URI, echo=True)
+
+    df = pd.read_sql_table("systex_records", engine.connect())  
+    df.fillna('', inplace=True)
+    result = df.to_json(orient='index', force_ascii=False)
+    result = loads(result)
+    return result.values()
+
+if __name__ == "__main__":
+    uvicorn.run("RAG_app:app", host='cmm.ai', port=8081, ssl_keyfile="/etc/letsencrypt/live/cmm.ai/privkey.pem", 
+                ssl_certfile="/etc/letsencrypt/live/cmm.ai/fullchain.pem")
+
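
A minimal client-side sketch for the /answer2 and /history endpoints above, assuming the app is served locally on port 8081 without TLS (host, port, and question are illustrative):

    import requests

    # /answer2 takes the question as a query parameter and returns {"Answer": ...}
    resp = requests.get(
        "http://localhost:8081/answer2",
        params={"question": "CEV系統依循標準為何"},
        timeout=120,
    )
    print(resp.json()["Answer"])

    # Every call is logged by save_history(); the log is exposed at /history
    history = requests.get("http://localhost:8081/history", timeout=30).json()
    print(len(history), "records")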

+ 357 - 0
RAG_strategy.py

@@ -0,0 +1,357 @@
+from langchain.prompts import ChatPromptTemplate
+from langchain.load import dumps, loads
+from langchain_core.output_parsers import StrOutputParser
+from langchain_openai import ChatOpenAI
+from langchain_community.llms import Ollama
+from langchain_community.chat_models import ChatOllama
+from operator import itemgetter
+from langchain_core.runnables import RunnablePassthrough
+from langchain import hub
+from langchain.globals import set_llm_cache
+from langchain import PromptTemplate
+
+
+from langchain_core.runnables import (
+    RunnableBranch,
+    RunnableLambda,
+    RunnableParallel,
+    RunnablePassthrough,
+)
+from typing import Tuple, List, Optional
+from langchain_core.messages import AIMessage, HumanMessage
+
+
+from datasets import Dataset 
+from ragas import evaluate
+from ragas.metrics import (
+    answer_relevancy,
+    faithfulness,
+    context_recall,
+    context_precision,
+)
+from typing import List
+from dotenv import load_dotenv
+load_dotenv()
+
+########################################################################################################################
+########################################################################################################################
+from langchain.cache import SQLiteCache
+
+from langchain.cache import RedisSemanticCache
+from langchain_openai import OpenAIEmbeddings
+# set_llm_cache(SQLiteCache(database_path=".langchain.db"))
+set_llm_cache(RedisSemanticCache(redis_url="redis://localhost:6380", embedding=OpenAIEmbeddings(), score_threshold=0.0005))
+########################################################################################################################
+
+def get_search_query():
+    # Condense a chat history and follow-up question into a standalone question
+    # 
+    _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
+    in its original language.
+    Chat History:
+    {chat_history}
+    Follow Up Input: {question}
+    Standalone question:"""  # noqa: E501
+    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
+
+    def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
+        buffer = []
+        for human, ai in chat_history:
+            buffer.append(HumanMessage(content=human))
+            buffer.append(AIMessage(content=ai))
+        return buffer
+
+    _search_query = RunnableBranch(
+        # If input includes chat_history, we condense it with the follow-up question
+        (
+            RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
+                run_name="HasChatHistoryCheck"
+            ),  # Condense follow-up question and chat into a standalone_question
+            RunnablePassthrough.assign(
+                chat_history=lambda x: _format_chat_history(x["chat_history"])
+            )
+            | CONDENSE_QUESTION_PROMPT
+            | ChatOpenAI(temperature=0)
+            | StrOutputParser(),
+        ),
+        # Else, we have no chat history, so just pass through the question
+        RunnableLambda(lambda x : x["question"]),
+    )
+
+    return _search_query
+########################################################################################################################
+def multi_query_rag_prompt(retrieval_chain, question):
+    # RAG
+    template = """Answer the following question based on this context:
+
+    {context}
+
+    Question: {question}
+    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw. \n
+    You should not mention anything about "根據提供的文件內容" or other similar terms.
+    If you don't know the answer, just say that "很抱歉,目前我無法回答您的問題,請將您的詢問發送至 test@systex.com 以便獲得更進一步的幫助,謝謝。"
+    """
+
+    prompt = ChatPromptTemplate.from_template(template)
+
+    # llm = ChatOpenAI(temperature=0)
+    llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
+    # llm = ChatOllama(model="llama3", num_gpu=1, temperature=0)
+
+    final_rag_chain = (
+        {"context": retrieval_chain, 
+        "question": itemgetter("question")} 
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    answer = final_rag_chain.invoke({"question":question})
+
+    return answer
+
+def multi_query(question, retriever, chat_history):
+
+    def multi_query_chain():
+        # Multi Query: Different Perspectives
+        template = """You are an AI language model assistant. Your task is to generate three 
+        different versions of the given user question to retrieve relevant documents from a vector 
+        database. By generating multiple perspectives on the user question, your goal is to help
+        the user overcome some of the limitations of the distance-based similarity search. 
+        Provide these alternative questions separated by newlines. Original question: {question}"""
+        prompt_perspectives = ChatPromptTemplate.from_template(template)
+
+        
+        llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
+        # llm = ChatOllama(model="llama3", num_gpu=1, temperature=0)
+
+        generate_queries = (
+            prompt_perspectives 
+            | llm
+            | StrOutputParser() 
+            | (lambda x: x.split("\n"))
+        )
+
+        return generate_queries
+
+    def get_unique_union(documents: List[list]):
+        """ Unique union of retrieved docs """
+        # Flatten list of lists, and convert each Document to string
+        flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
+        # Get unique documents
+        unique_docs = list(set(flattened_docs))
+        # Return
+        return [loads(doc) for doc in unique_docs]
+    
+
+    _search_query = get_search_query()
+    modified_question = _search_query.invoke({"question":question, "chat_history": chat_history})
+    print(modified_question)
+
+    generate_queries = multi_query_chain()
+
+    retrieval_chain = generate_queries | retriever.map() | get_unique_union
+    docs = retrieval_chain.invoke({"question":modified_question})
+
+    answer = multi_query_rag_prompt(retrieval_chain, modified_question)
+
+    return answer, docs
+
+########################################################################################################################
+
+def naive_rag(question, retriever):
+    #### RETRIEVAL and GENERATION ####
+
+    # Prompt
+    prompt = hub.pull("rlm/rag-prompt")
+
+    # LLM
+    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
+
+    # Post-processing
+    def format_docs(docs):
+        return "\n\n".join(doc.page_content for doc in docs)
+
+    reference = retriever.get_relevant_documents(question)
+    
+    # Chain
+    rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    # Question
+    answer = rag_chain.invoke(question)
+
+    return answer, reference
+################################################################################################
+def naive_rag_for_qapairs(question, retriever):
+    #### RETRIEVAL and GENERATION ####
+
+    # Prompt
+    # prompt = hub.pull("rlm/rag-prompt")
+    template = """You are an assistant for question-answering tasks. 
+    Use the following pieces of retrieved context to answer the question. 
+    The retrieved context consists of historical question-answer pairs; find a suitable answer among these QA pairs.
+    If you cannot find a suitable answer, just return "False". 
+    Use three sentences maximum and do not make up an answer.
+
+    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw.
+
+    {context}
+
+    Question: {question}
+    """
+    prompt = PromptTemplate.from_template(template)
+
+    # LLM
+    llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)
+    # llm = ChatOllama(model="llama3", num_gpu=1, temperature=0)
+
+    # Post-processing
+    def format_docs(docs):
+        return "\n\n".join(doc.page_content for doc in docs)
+
+    reference = retriever.get_relevant_documents(question)
+    
+    # Chain
+    rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    # Question
+    answer = rag_chain.invoke(question)
+
+    return answer, reference
+########################################################################################################################
+def decomposition():
+    # Decomposition
+    template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
+    The goal is to break down the input into a set of sub-problems / sub-questions that can be answered in isolation. \n
+    Generate multiple search queries related to: {question} \n
+    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw. \n
+    Output (3 queries):"""
+    prompt_decomposition = ChatPromptTemplate.from_template(template)
+
+    # LLM
+    llm = ChatOpenAI(temperature=0.5)
+
+    # Chain
+    generate_queries_decomposition = (prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))
+
+    return generate_queries_decomposition
+
+# ----------------------------------------------------------------
+
+# Answer each sub-question individually 
+def retrieve_and_rag(question, sub_question_generator_chain, retriever):
+    """RAG on each sub-question"""
+
+    # LLM
+    # llm = ChatOpenAI(temperature=0.5)
+    llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
+
+    # Use our decomposition / 
+    sub_questions = sub_question_generator_chain.invoke({"question":question})
+    
+    # Initialize a list to hold RAG chain results
+    rag_results = []
+
+    # RAG prompt
+    prompt_rag = hub.pull("rlm/rag-prompt")
+
+    all_reference_docs = []
+
+    for sub_question in sub_questions:
+        
+        # Retrieve documents for each sub-question
+        retrieved_docs = retriever.get_relevant_documents(sub_question)
+        #print(f'\nreference docs: \n{retrieved_docs[0].page_content}\n')
+        #print(retrieved_docs)
+        for doc in retrieved_docs:
+            #print(doc.page_content)
+            # all_reference_docs.append(doc.page_content)
+            all_reference_docs.append(doc)
+        
+        # Use retrieved documents and sub-question in RAG chain
+        answer = (prompt_rag | llm | StrOutputParser()).invoke({"context": retrieved_docs, 
+                                                                "question": sub_question})
+        rag_results.append(answer)
+    # all_reference_docs = ""
+
+    return rag_results, sub_questions, all_reference_docs
+
+# ----------------------------------------------------------------
+
+def individually_generate_final_answer(question, retriever):
+
+    def format_qa_pairs(questions, answers):
+        """Format Q and A pairs"""
+        
+        formatted_string = ""
+        for i, (question, answer) in enumerate(zip(questions, answers), start=1):
+            formatted_string += f"Question {i}: {question}\nAnswer {i}: {answer}\n\n"
+        return formatted_string.strip()
+
+    # LLM
+    # llm = ChatOpenAI(temperature=0.5)
+    llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
+
+    # decomposition
+    generate_queries_decomposition = decomposition()
+
+    # Wrap the retrieval and RAG process in a RunnableLambda for integration into a chain
+    answers, questions, reference_docs = retrieve_and_rag(question, generate_queries_decomposition, retriever)
+    #print(answers, questions, reference_docs)
+    context = format_qa_pairs(questions, answers)
+
+    # Prompt
+    template = """Here is a set of Q+A pairs:
+
+    {context}
+
+    Use these to synthesize an answer to the question: {question}
+    Output in user's language. If the question is in zh-tw, then the output will be in zh-tw. \n
+    """
+
+    prompt = ChatPromptTemplate.from_template(template)
+
+    final_rag_chain = (
+        prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    final_answer = final_rag_chain.invoke({"context": context, "question": question})
+
+    return final_answer, reference_docs
+
+
+####################
+
+def rag_score(question, ground_truth, answer, reference_docs):
+    
+    datasets = {
+              "question": [question],       # question: list[str]
+              "answer": [answer],           # answer: list[str]
+              "contexts": [reference_docs], # contexts: list[list[str]]
+              "ground_truths": [[ground_truth]] # ground_truth: list[list[str]]
+            }
+    evalsets = Dataset.from_dict(datasets)
+
+    result = evaluate(
+        evalsets,
+        metrics=[
+            context_precision,
+            faithfulness,
+            answer_relevancy,
+            context_recall,
+        ],
+    )
+
+    return result
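
A minimal sketch showing how multi_query and rag_score above fit together, assuming a retriever from Indexing_Split, a running Redis instance for the semantic cache, and a hand-written reference answer (question and ground truth are placeholders):

    from Indexing_Split import create_retriever
    from RAG_strategy import multi_query, rag_score

    retriever = create_retriever(path='./Documents', extension="pdf")

    question = "CEV系統依循標準為何"
    answer, docs = multi_query(question, retriever, chat_history=[])

    # rag_score expects contexts as a list[str]; it wraps them into list[list[str]] itself
    contexts = [d.page_content for d in docs]
    ground_truth = "A hand-written reference answer goes here."  # placeholder
    print(rag_score(question, ground_truth, answer, contexts))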

+ 79 - 0
add_vectordb.py

@@ -0,0 +1,79 @@
+from dotenv import load_dotenv
+load_dotenv()
+
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
+
+import os
+import glob
+
+def read_and_split_files(path='Documents', extension="pdf"):
+    txt_files = glob.glob(os.path.join(path, f"*.{extension}"))
+        
+    doc = []
+    for file_path in txt_files:
+        doc.append(file_path)
+
+    def load_and_split(file_list):
+        chunks = []
+        for file in file_list:
+            if file.endswith(".txt"):
+                loader = TextLoader(file, encoding='utf-8')
+            elif file.endswith(".pdf"):
+                loader = PyPDFLoader(file)
+            else:
+                raise ValueError(f"Unsupported file extension: {file}")
+
+            docs = loader.load()
+
+            # Split
+            text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                chunk_size=1000, chunk_overlap=200)
+            splits = text_splitter.split_documents(docs)
+
+            chunks.extend(splits)
+
+        return chunks
+
+    # Index
+    docs = load_and_split(doc)
+    # qa_history_doc = gen_doc_from_history()
+    # docs.extend(qa_history_doc)
+    # web_doc = web_data(os.path.join(path, 'web_url.csv'))
+    # docs.extend(web_doc)
+
+    return docs
+
+
+def create_vectordb(docs=None):
+    if docs is None:
+        path = "../SYSTEX_精誠/RAG/Documents/"
+        docs = read_and_split_files(path)
+
+    persist_directory = 'db'
+
+    embedding = OpenAIEmbeddings()
+
+    vectordb = Chroma.from_documents(documents = docs,
+                                    embedding = embedding,
+                                    persist_directory = persist_directory)
+
+    # Calling persist() writes the vectordb data to disk
+    vectordb.persist()
+
+
+def use_vectordb(persist_directory):
+    # Use OpenAI embeddings as the embedding function
+    embedding = OpenAIEmbeddings()
+
+    # Load the persisted vectordb from disk
+    vectordb = Chroma(persist_directory=persist_directory,
+                      embedding_function=embedding)
+
+    return vectordb
+
+def use_retriever(vectordb):
+    retriever = vectordb.as_retriever(search_kwargs={'k': 2})
+    ans2 = retriever.invoke('溫室氣體種類')
+    print(ans2)
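
A minimal sketch tying the helpers above together: build and persist the Chroma index once, then reload it and query through a retriever (paths and the query are illustrative):

    from add_vectordb import read_and_split_files, create_vectordb, use_vectordb

    # Build and persist the index under ./db from local PDF documents
    docs = read_and_split_files(path='Documents', extension="pdf")
    create_vectordb(docs)

    # Later: reload the persisted index and query it
    vectordb = use_vectordb('db')
    retriever = vectordb.as_retriever(search_kwargs={'k': 2})
    print(retriever.invoke('溫室氣體種類'))  # query: "types of greenhouse gases"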

+ 142 - 0
requirements.txt

@@ -0,0 +1,142 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.3.0
+appdirs==1.4.4
+asgiref==3.8.1
+async-timeout==4.0.3
+attrs==23.2.0
+backoff==2.2.1
+bcrypt==4.1.2
+beautifulsoup4==4.12.3
+build==1.2.1
+cachetools==5.3.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+chroma-hnswlib==0.7.3
+chromadb==0.4.24
+click==8.1.7
+coloredlogs==15.0.1
+dataclasses-json==0.6.4
+datasets==2.18.0
+Deprecated==1.2.14
+dill==0.3.8
+distro==1.9.0
+exceptiongroup==1.2.0
+faiss-cpu==1.8.0
+fastapi==0.110.1
+filelock==3.13.4
+flatbuffers==24.3.25
+frozenlist==1.4.1
+fsspec==2024.2.0
+google-auth==2.29.0
+googleapis-common-protos==1.63.0
+gptcache==0.1.43
+graphlib_backport==1.1.0
+greenlet==3.0.3
+grpcio==1.62.1
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.22.2
+humanfriendly==10.0
+idna==3.7
+importlib-metadata==6.5.0
+importlib_resources==6.4.0
+jsonpatch==1.33
+jsonpointer==2.4
+kubernetes==29.0.0
+langchain==0.1.16
+langchain-community==0.0.33
+langchain-core==0.1.43
+langchain-openai==0.1.3
+langchain-text-splitters==0.0.1
+langchainhub==0.1.15
+langsmith==0.1.48
+markdown-it-py==3.0.0
+marshmallow==3.21.1
+mdurl==0.1.2
+mmh3==4.1.0
+monotonic==1.6
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+numpy==1.24.4
+oauthlib==3.2.2
+onnxruntime==1.16.3
+openai==1.20.0
+opentelemetry-api==1.24.0
+opentelemetry-exporter-otlp-proto-common==1.24.0
+opentelemetry-exporter-otlp-proto-grpc==1.24.0
+opentelemetry-instrumentation==0.45b0
+opentelemetry-instrumentation-asgi==0.45b0
+opentelemetry-instrumentation-fastapi==0.45b0
+opentelemetry-proto==1.24.0
+opentelemetry-sdk==1.24.0
+opentelemetry-semantic-conventions==0.45b0
+opentelemetry-util-http==0.45b0
+orjson==3.10.1
+overrides==7.7.0
+packaging==23.2
+pandas==2.0.3
+pkg_resources==0.0.0
+posthog==3.5.0
+protobuf==4.25.3
+psycopg2==2.9.9
+pulsar-client==3.5.0
+pyarrow==15.0.2
+pyarrow-hotfix==0.6
+pyasn1==0.6.0
+pyasn1_modules==0.4.0
+pydantic==2.7.0
+pydantic_core==2.18.1
+Pygments==2.17.2
+pypdf==4.2.0
+PyPDF2==3.0.1
+PyPika==0.48.9
+pyproject_hooks==1.0.0
+pysbd==0.3.4
+pysqlite3-binary==0.5.2.post3
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.1
+PyYAML==6.0.1
+ragas==0.1.7
+regex==2024.4.16
+requests==2.31.0
+requests-oauthlib==2.0.0
+rich==13.7.1
+rsa==4.9
+safetensors==0.4.3
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.27
+sqlparse==0.5.0
+starlette==0.37.2
+sympy==1.12
+tenacity==8.2.3
+tiktoken==0.6.0
+tokenizers==0.19.1
+tomli==2.0.1
+tqdm==4.66.2
+transformers==4.40.1
+typer==0.12.3
+types-requests==2.31.0.20240406
+typing-inspect==0.9.0
+typing_extensions==4.11.0
+tzdata==2024.1
+urllib3==2.2.1
+uvicorn==0.29.0
+uvloop==0.19.0
+watchfiles==0.21.0
+websocket-client==1.7.0
+websockets==12.0
+wrapt==1.16.0
+xxhash==3.4.1
+yarl==1.9.4
+zipp==3.18.1