2 Commits a9266b9268 ... cc65fa5fca

Author SHA1 Message Date
  ling cc65fa5fca update README 3 months ago
  ling 041ad256b6 add new_information loader 3 months ago
4 changed files with 310 additions and 228 deletions
  1. README.md (+14 -3)
  2. file_loader/add_vectordb.py (+267 -225)
  3. file_loader/new_information.docx (BIN)
  4. file_loader/new_information_loader.py (+29 -0)

+ 14 - 3
README.md

@@ -3,11 +3,12 @@
 This project builds a multi-agent AI chatbot with two main agents: a Text-to-SQL agent that handles the customer's own data, and an agent that uses retrieval-augmented generation (RAG) to handle domain expertise. The overall multi-agent architecture is built with `langgraph`.
 
 ## Table of Contents
-- [Project Overview](#專案概述)
+- [System Description](#系統說明)
 - [Usage](#使用方式)
 - [File Descriptions](#檔案說明)
+- [VectorDB](#VectorDB)
 
-## Project Overview
+## System Description
 This multi-agent system answers user questions by first classifying the question and then choosing between the customer's own database and an external domain-knowledge source. The system contains two main agents:
 1. Customer-data agent: uses **Text-to-SQL** to handle the customer's own structured data.
 2. External-knowledge agent: uses **RAG**, with the retriever implemented on **FAISS**, to retrieve from external unstructured knowledge and generate answers.
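
A minimal sketch of the routing idea above, assuming hypothetical `run_text_to_sql`, `run_rag`, and `route_question` placeholders (the project's real graph is defined in `ai_agent.py`):

```python
from typing import TypedDict

from langgraph.graph import END, START, StateGraph


class ChatState(TypedDict):
    question: str
    answer: str


def run_text_to_sql(state: ChatState) -> dict:
    # hypothetical: translate the question to SQL and query the customer DB
    return {"answer": f"[SQL] {state['question']}"}


def run_rag(state: ChatState) -> dict:
    # hypothetical: retrieve from the FAISS index and generate an answer
    return {"answer": f"[RAG] {state['question']}"}


def route_question(state: ChatState) -> str:
    # hypothetical router: choose the agent based on the user's question
    return "sql" if "customer" in state["question"].lower() else "rag"


graph = StateGraph(ChatState)
graph.add_node("text_to_sql", run_text_to_sql)
graph.add_node("rag", run_rag)
graph.add_conditional_edges(START, route_question, {"sql": "text_to_sql", "rag": "rag"})
graph.add_edge("text_to_sql", END)
graph.add_edge("rag", END)
app = graph.compile()
# app.invoke({"question": "..."})
```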
@@ -33,4 +34,14 @@ FastAPI Link: https://cmm.ai:8989/docs
 - **`ai_agent.py`**: defines the multi-agent architecture, including the Text-to-SQL and RAG agents; built with the `langgraph` framework.
 - **`ai_agent_llama.py`**: same as `ai_agent.py`, but uses a local LLM.
 - **`faiss_index.py`**: provides the retriever functionality needed for RAG and handles the text-vector data.
-- **`tex_to_sql_private.py`**: converts natural language into SQL queries for handling the customer's own data.
+- **`tex_to_sql_private.py`**: converts natural language into SQL queries for handling the customer's own data.
+
+## VectorDB
+
+- If new_information has been updated, refresh the vector DB with the following steps:
+```bash
+cd file_loader/
+python new_information_loader.py
+# shut down window 0 of the [systex] tmux session first
+python systex_app.py
+```

+ 267 - 225
file_loader/add_vectordb.py

@@ -1,226 +1,268 @@
-from dotenv import load_dotenv
-load_dotenv("../.env")
-
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import TextLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.document_loaders import Docx2txtLoader
-from langchain.document_loaders import CSVLoader
-
-import os
-import glob
-
-from langchain_community.vectorstores import SupabaseVectorStore
-from langchain_openai import OpenAIEmbeddings
-from supabase.client import Client, create_client
-
-document_table = "documents"
-def get_data_list(data_list=None, path=None, extension=None, update=False):
-    files = data_list or glob.glob(os.path.join(path, f"*.{extension}"))
-    if update:    
-        doc = files.copy()
-    else:
-        existed_data = check_existed_data(supabase)
-        doc = []
-        for file_path in files:
-            filename = os.path.basename(file_path)
-            if filename not in existed_data:
-                doc.append(file_path)
-
-    return doc
-
-
-def read_and_split_files(data_list=None, path=None, extension=None, update=False):
-
-    def read_csv(path):
-        extension = "csv"
-        # path = r"./Phase2/"
-        files = glob.glob(os.path.join(path, f"*.{extension}"))
-            
-        documents = []
-        for file_path in files:
-            print(file_path)
-
-            loader = CSVLoader(file_path, encoding="utf-8")
-            doc = loader.load()
-
-            documents.extend(doc)
-        
-        return documents
-    
-    def load_and_split(file_list):
-        chunks = []
-        for file in file_list:
-            if file.endswith(".txt"):
-                loader = TextLoader(file, encoding='utf-8')
-            elif file.endswith(".pdf"):
-                loader = PyPDFLoader(file)
-            elif file.endswith(".docx"):
-                loader = Docx2txtLoader(file)
-            else:
-                print(f"Unsupported file extension: {file}")
-                continue
-
-            docs = loader.load()
-
-
-            # Split
-            if file.endswith(".docx"):
-                separators = ['\u25cb\s*第.*?條', '\u25cf\s*第.*?條']
-                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=300, chunk_overlap=0)
-            else:
-                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)
-            splits = text_splitter.split_documents(docs)
-
-            chunks.extend(splits)
-            doc = read_csv(path)
-            chunks.extend(doc)
-
-        return chunks
-    
-
-
-    # specific data type
-    doc = get_data_list(data_list=data_list, path=path, extension=extension, update=update)
-
-    # web url
-
-    # csv
-    # doc = read_csv(path)
-
-    # Index
-    docs = load_and_split(doc)
-
-    return docs
-
-def create_ids(docs):
-    # Create a dictionary to count occurrences of each page in each document
-    page_counter = {}
-
-    # List to store the resulting IDs
-    document_ids = []
-
-    # Generate IDs
-    for doc in [docs[i].metadata for i in range(len(docs))]:
-        source = doc['source']
-        file_name = os.path.basename(source).split('.')[0]
-
-        if "page" in doc.keys():
-            page = doc['page']
-            key = f"{source}_{page}"
-        else:
-            key = f"{source}"
-
-        if key not in page_counter:
-            page_counter[key] = 1
-        else:
-            page_counter[key] += 1
-        
-        if "page" in doc.keys():
-            doc_id = f"{file_name} | page {page} | chunk {page_counter[key]}"
-        else:
-            doc_id = f"{file_name} | chunk {page_counter[key]}"
-
-        
-        document_ids.append(doc_id)
-
-    return document_ids
-
-def get_document(data_list=None, path=None, extension=None, update=False):
-    docs = read_and_split_files(data_list=data_list, path=path, extension=extension, update=update)
-    document_ids = create_ids(docs)
-
-    for doc in docs:
-        doc.metadata['source'] = os.path.basename(doc.metadata['source'])
-        # print(doc.metadata)
-
-    # document_metadatas = [{'source': doc.metadata['source'], 'page': doc.metadata['page'], 'chunk': int(id.split("chunk ")[-1])} for doc, id in zip(docs, document_ids)]
-    document_metadatas = []
-
-    for doc, id in zip(docs, document_ids):
-        chunk_number = int(id.split("chunk ")[-1])
-        doc.metadata['chunk'] = chunk_number
-        doc.metadata['extension'] = os.path.basename(doc.metadata['source']).split(".")[-1]
-        document_metadatas.append(doc.metadata)
-
-    documents = [docs.metadata['source'].split(".")[0] + docs.page_content for docs in docs]
-
-    return document_ids, documents, document_metadatas
-
-def check_existed_data(supabase):
-    response = supabase.table(document_table).select("id, metadata").execute()
-    existed_data = list(set([data['metadata']['source'] for data in response.data]))
-    # existed_data = [(data['id'], data['metadata']['source']) for data in response.data]
-    return existed_data
-
-class GetVectorStore(SupabaseVectorStore):
-    def __init__(self, embeddings, supabase, table_name):
-        super().__init__(embedding=embeddings, client=supabase, table_name=table_name, query_name="match_documents")
-
-    def insert(self, documents, document_metadatas):
-        self.add_texts(
-            texts=documents,
-            metadatas=document_metadatas,
-        )
-
-    def delete(self, file_list):
-        for file_name in file_list:
-            self._client.table(self.table_name).delete().eq('metadata->>source', file_name).execute()
-
-    def update(self, documents, document_metadatas, update_existing_data=False):
-        if not document_metadatas:  # no new data
-            return
-
-        if update_existing_data:
-            file_list = list(set(metadata['source'] for metadata in document_metadatas))
-            self.delete(file_list)
-
-        self.insert(documents, document_metadatas)
-
-if __name__ == "__main__":
-
-    load_dotenv("../.env")
-    supabase_url = os.environ.get("SUPABASE_URL")
-    supabase_key = os.environ.get("SUPABASE_KEY")
-    document_table = "documents"
-    supabase: Client = create_client(supabase_url, supabase_key)
-
-    embeddings = OpenAIEmbeddings()
-
-    ###################################################################################
-    # get vector store
-    vector_store = GetVectorStore(embeddings, supabase, document_table)
-
-    ###################################################################################
-    # update data (old + new / all new / all old)
-    path = "/home/mia/systex/Documents"
-    extension = "pdf"
-    # file = None
-
-    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
-    # file = [os.path.join(path, file) for file in file_list]
-    # file_list = glob.glob(os.path.join(path, "*"))
-    file_list =glob.glob(os.path.join(path, f"*.{extension}"))
-    # print(file_list)
-    
-    # update = False
-    # document_ids, documents, document_metadatas = get_document(data_list=file_list, path=path, extension=extension, update=update)
-    # vector_store.update(documents, document_metadatas, update_existing_data=update)
-
-    ###################################################################################
-    # insert new data (all new)
-    # vector_store.insert(documents, document_metadatas)
-
-    ###################################################################################
-    # delete data
-    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
-    file_list = glob.glob(os.path.join(path, f"*.docx"))
-    file_list = [os.path.basename(file_path) for file_path in file_list]
-    print(file_list)
-    vector_store.delete(file_list)
-
-    ###################################################################################
-    # get retriver
+from dotenv import load_dotenv
+load_dotenv("../.env")
+
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import Docx2txtLoader
+from langchain.document_loaders import CSVLoader
+
+import os
+import glob
+
+from langchain_community.vectorstores import SupabaseVectorStore
+from langchain_openai import OpenAIEmbeddings
+from supabase.client import Client, create_client
+
+load_dotenv()
+supabase_url = os.environ.get("SUPABASE_URL")
+supabase_key = os.environ.get("SUPABASE_KEY")
+document_table = "documents2"
+supabase: Client = create_client(supabase_url, supabase_key)
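+# NOTE: the module-level client above targets the "documents2" table; the
+# __main__ block at the bottom rebinds these globals (e.g. to "documents")
+# when the script is run directly.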
+def get_data_list(data_list=None, path=None, extension=None, update=False):
+    files = data_list or glob.glob(os.path.join(path, f"*.{extension}"))
+    if update:    
+        doc = files.copy()
+    else:
+        existed_data = check_existed_data(supabase)
+        doc = []
+        for file_path in files:
+            filename = os.path.basename(file_path)
+            if filename not in existed_data:
+                doc.append(file_path)
+
+    return doc
+
+
+def read_and_split_files(data_list=None, path=None, extension=None, update=False):
+
+    def read_csv(path):
+        extension = "csv"
+        # path = r"./Phase2/"
+        files = glob.glob(os.path.join(path, f"*.{extension}"))
+
+        if not files:
+            return None
+
+        documents = []
+        for file_path in files:
+            print(file_path)
+
+            loader = CSVLoader(file_path, encoding="utf-8")
+            doc = loader.load()
+
+            documents.extend(doc)
+        
+        return documents
+    
+    def load_and_split(file_list):
+        chunks = []
+        for file in file_list:
+            if file.endswith(".txt") or file.endswith(".md"):
+                loader = TextLoader(file, encoding='utf-8')
+                docs = loader.load()
+            elif file.endswith(".pdf"):
+                loader = PyPDFLoader(file)
+                docs = loader.load()
+            elif file.endswith(".docx"):
+                loader = Docx2txtLoader(file)
+                docs = loader.load()
+            else:
+                print(f"Unsupported file extension: {file}")
+                continue
+
+
+
+            # Split
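+            # Regulation .docx files listed in `rules` below are split at
+            # article markers ("○ 第…條" / "● 第…條") so that chunk boundaries
+            # fall on article boundaries.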
+            rules = ['低碳產品獎勵辦法.docx', '公私場所固定污染源空氣污染物排放量申報管理辦法.docx', '氣候變遷因應法.docx', '氣候變遷因應法施行細則.docx', 
+                     '淘汰老舊機車換購電動機車溫室氣體減量獎勵辦法訂定總說明及逐條說明.docx', '溫室氣體抵換專案管理辦法.docx', '溫室氣體排放源符合效能標準獎勵辦法.docx', 
+                     '溫室氣體排放量增量抵換管理辦法.docx', '溫室氣體排放量增量抵換管理辦法訂定總說明及逐條說明.docx', '溫室氣體排放量盤查登錄及查驗管理辦法修正條文.docx', 
+                     '溫室氣體排放量盤查登錄管理辦法(溫室氣體排放量盤查登錄及查驗管理辦法修正條文前身).docx', '溫室氣體自願減量專案管理辦法.docx', 
+                     '溫室氣體自願減量專案管理辦法中華民國112年10月12日訂定總說明及逐條說明.docx', '溫室氣體認證機構及查驗機構管理辦法.docx', 
+                     '溫室氣體階段管制目標及管制方式作業準則.docx', '碳足跡產品類別規則訂定、引用及修訂指引.docx', 
+                     '老舊汽車汰舊換新溫室氣體減量獎勵辦法中華民國112年1月11日訂定總說明及逐條說明.docx']
+            print(os.path.basename(file))
+            if file.endswith(".docx") and os.path.basename(file) in rules:
+                separators = ['\u25cb\s*第.*?條', '\u25cf\s*第.*?條']
+                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=500, chunk_overlap=0)
+                splits = text_splitter.split_documents(docs)
+            elif os.path.basename(file) in ["new_information.docx"]:
+                print(file)
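+                # new_information.docx marks its sections with '###', so split
+                # on that marker to keep each section as a single chunk.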
+                separators = ['###']
+                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=500, chunk_overlap=0)
+                splits = text_splitter.split_documents(docs)
+
+            elif file.endswith(".md"):
+                headers_to_split_on = [
+                    ("#", "Header 1"),
+                    ("##", "Header 2"),
+                    ("###", "Header 3"),
+                ]
+
+                markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+                splits = markdown_splitter.split_text(docs[0].page_content)
+                # MarkdownHeaderTextSplitter returns Documents carrying only
+                # header metadata (no 'source'); get_document() fills that in later.
+
+            else:
+                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)
+                splits = text_splitter.split_documents(docs)
+
+            chunks.extend(splits)
+            # doc = read_csv(path)
+            # chunks.extend(doc)
+
+        return chunks
+    
+
+
+    # specific data type
+    doc = get_data_list(data_list=data_list, path=path, extension=extension, update=update)
+
+    # web url
+
+    # csv
+    # doc = read_csv(path)
+
+    # Index
+    docs = load_and_split(doc)
+
+    return docs
+
+def create_ids(docs):
+    # Create a dictionary to count occurrences of each page in each document
+    page_counter = {}
+
+    # List to store the resulting IDs
+    document_ids = []
+
+    # Generate IDs
+    for doc in [docs[i].metadata for i in range(len(docs))]:
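+        # Chunks produced by MarkdownHeaderTextSplitter have no 'source'
+        # metadata; group those under the "supplement" pseudo-source.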
+        if "source" in doc.keys():
+            source = doc['source']
+            file_name = os.path.basename(source).split('.')[0]
+        else:
+            source = "supplement"
+            file_name = "supplement"
+
+        if "page" in doc.keys():
+            page = doc['page']
+            key = f"{source}_{page}"
+        else:
+            key = f"{source}"
+
+        if key not in page_counter:
+            page_counter[key] = 1
+        else:
+            page_counter[key] += 1
+        
+        if "page" in doc.keys():
+            doc_id = f"{file_name} | page {page} | chunk {page_counter[key]}"
+        else:
+            doc_id = f"{file_name} | chunk {page_counter[key]}"
+
+        
+        document_ids.append(doc_id)
+
+    return document_ids
+
+def get_document(data_list=None, path=None, extension=None, update=False):
+    docs = read_and_split_files(data_list=data_list, path=path, extension=extension, update=update)
+    document_ids = create_ids(docs)
+
+    for doc in docs:
+        doc.metadata['source'] = os.path.basename(doc.metadata['source']) if 'source' in doc.metadata else "supplement.md"
+        # print(doc.metadata)
+
+    # document_metadatas = [{'source': doc.metadata['source'], 'page': doc.metadata['page'], 'chunk': int(id.split("chunk ")[-1])} for doc, id in zip(docs, document_ids)]
+    document_metadatas = []
+
+    for doc, id in zip(docs, document_ids):
+        chunk_number = int(id.split("chunk ")[-1])
+        doc.metadata['chunk'] = chunk_number
+        doc.metadata['extension'] = os.path.basename(doc.metadata['source']).split(".")[-1]
+        document_metadatas.append(doc.metadata)
+
+    # Prepend the file name (without extension) to each chunk's text so the
+    # source name is embedded together with the content.
+    documents = [doc.metadata['source'].split(".")[0] + doc.page_content for doc in docs]
+
+    return document_ids, documents, document_metadatas
+
+def check_existed_data(supabase):
+    response = supabase.table(document_table).select("id, metadata").execute()
+    existed_data = list(set([data['metadata']['source'] for data in response.data]))
+    # existed_data = [(data['id'], data['metadata']['source']) for data in response.data]
+    return existed_data
+
+
+class GetVectorStore(SupabaseVectorStore):
+    def __init__(self, embeddings, supabase, table_name):
+        super().__init__(embedding=embeddings, client=supabase, table_name=table_name, query_name="match_documents")
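+        # query_name is the Postgres function SupabaseVectorStore calls for
+        # similarity search; "match_documents" must already exist in the project.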
+
+    def insert(self, documents, document_metadatas):
+        self.add_texts(
+            texts=documents,
+            metadatas=document_metadatas,
+        )
+
+    def delete(self, file_list):
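+        # Remove every row whose metadata 'source' equals the given file name.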
+        for file in file_list:
+            file_name = os.path.basename(file)
+            self._client.table(self.table_name).delete().eq('metadata->>source', file_name).execute()
+
+    def update(self, documents, document_metadatas, update_existing_data=False):
+        if not document_metadatas:  # no new data
+            return
+
+        if update_existing_data:
+            file_list = list(set(metadata['source'] for metadata in document_metadatas))
+            self.delete(file_list)
+
+        self.insert(documents, document_metadatas)
+
+if __name__ == "__main__":
+
+    load_dotenv("../.env")
+    supabase_url = os.environ.get("SUPABASE_URL")
+    supabase_key = os.environ.get("SUPABASE_KEY")
+    document_table = "documents"
+    supabase: Client = create_client(supabase_url, supabase_key)
+
+    embeddings = OpenAIEmbeddings()
+
+    ###################################################################################
+    # get vector store
+    vector_store = GetVectorStore(embeddings, supabase, document_table)
+
+    ###################################################################################
+    # update data (old + new / all new / all old)
+    path = "/home/mia/systex/Documents"
+    extension = "pdf"
+    # file = None
+
+    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
+    # file = [os.path.join(path, file) for file in file_list]
+    # file_list = glob.glob(os.path.join(path, "*"))
+    file_list = glob.glob(os.path.join(path, f"*.{extension}"))
+    # print(file_list)
+    
+    # update = False
+    # document_ids, documents, document_metadatas = get_document(data_list=file_list, path=path, extension=extension, update=update)
+    # vector_store.update(documents, document_metadatas, update_existing_data=update)
+
+    ###################################################################################
+    # insert new data (all new)
+    # vector_store.insert(documents, document_metadatas)
+
+    ###################################################################################
+    # delete data
+    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
+    # file_list = glob.glob(os.path.join(path, f"*.docx"))
+    # file_list = [os.path.basename(file_path) for file_path in file_list]
+    # print(file_list)
+    # vector_store.delete(file_list)
+
+    ###################################################################################
+    # get retriever
     # retriever = vector_store.as_retriever(search_kwargs={"k": 6})

BIN
file_loader/new_information.docx


+ 29 - 0
file_loader/new_information_loader.py

@@ -0,0 +1,29 @@
+from add_vectordb import GetVectorStore, get_data_list, read_and_split_files, create_ids, get_document, check_existed_data
+
+from dotenv import load_dotenv
+import os
+from langchain_community.vectorstores import SupabaseVectorStore
+from langchain_openai import OpenAIEmbeddings
+from supabase.client import Client, create_client
+import gdown
+
+load_dotenv("../.env")
+supabase_url = os.environ.get("SUPABASE_URL")
+supabase_key = os.environ.get("SUPABASE_KEY")
+document_table = "documents2"
+supabase: Client = create_client(supabase_url, supabase_key)
+
+embeddings = OpenAIEmbeddings()
+vector_store = GetVectorStore(embeddings, supabase, document_table)
+
+# Download the shared Google Doc as a .docx file via its export URL
+url = "https://docs.google.com/document/u/0/export?format=docx&id=1bg1yOYlFd8GkDy_JuASKIWVN4MNbd9moZ4P-3stqaoI&token=AC4w5Vj1CZYNkmPrnJXQrJbcE5VVua5sig%3A1727167683932&ouid=103663058481204095886&includes_info_params=true&usp=drive_web&cros_files=false&inspectorResult=%7B%22pc%22%3A97%2C%22lplc%22%3A9%7D"
+path = "/home/ling/systex/file_loader"
+output = "new_information.docx"
+gdown.download(url, os.path.join(path, output))
+
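+# Drop any previously indexed chunks of this file, then insert the fresh split.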
+vector_store.delete([output])
+
+file_list = [os.path.join(path, output)]
+document_ids, documents, document_metadatas = get_document(data_list=file_list, update=True)
+vector_store.insert(documents, document_metadatas)
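+
+# Optional sanity check (assumes the "match_documents" function is configured
+# for this table), mirroring the commented-out retriever in add_vectordb.py:
+# retriever = vector_store.as_retriever(search_kwargs={"k": 6})
+# print(retriever.invoke("溫室氣體")[0].metadata)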