
Update docx and adjust vector db

conrad · 1 year ago
parent · current commit 8cf6ac110e
29 changed files with 232 additions and 60 deletions
  1. Documents/低碳產品獎勵辦法.docx (binary)
  2. Documents/公私場所固定污染源空氣污染物排放量申報管理辦法.docx (binary)
  3. Documents/新設或變更溫室氣體排放源排放量規模.docx (binary)
  4. Documents/氣候變遷因應法.docx (binary)
  5. Documents/氣候變遷因應法施行細則.docx (binary)
  6. Documents/淘汰老舊機車換購電動機車溫室氣體減量獎勵辦法訂定總說明及逐條說明.docx (binary)
  7. Documents/溫室氣體抵換專案管理辦法.docx (binary)
  8. Documents/溫室氣體排放源符合效能標準獎勵辦法.docx (binary)
  9. Documents/溫室氣體排放量增量抵換管理辦法.docx (binary)
  10. Documents/溫室氣體排放量增量抵換管理辦法訂定總說明及逐條說明.docx (binary)
  11. Documents/溫室氣體排放量盤查登錄及查驗管理辦法.docx (binary)
  12. Documents/溫室氣體排放量盤查登錄及查驗管理辦法修正條文.docx (binary)
  13. Documents/溫室氣體排放量盤查登錄管理辦法(溫室氣體排放量盤查登錄及查驗管理辦法修正條文前身).docx (binary)
  14. Documents/溫室氣體排放量盤查登錄管理辦法_修正總說明及條文對照表.docx (binary)
  15. Documents/溫室氣體減量及管理法(氣候變遷因應法前身).docx (binary)
  16. Documents/溫室氣體減量及管理法施行細則修正總說明_中華民國112年12月29日修正總說明及條文對照表.docx (binary)
  17. Documents/溫室氣體自願減量專案管理辦法.docx (binary)
  18. Documents/溫室氣體自願減量專案管理辦法中華民國112年10月12日訂定總說明及逐條說明.docx (binary)
  19. Documents/溫室氣體認證機構及查驗機構管理辦法.docx (binary)
  20. Documents/溫室氣體階段管制目標及管制方式作業準則.docx (binary)
  21. Documents/碳費費率審議會設置要點_訂定總說明及逐點說明.docx (binary)
  22. Documents/碳足跡產品類別規則訂定、引用及修訂指引.docx (binary)
  23. Documents/老舊汽車汰舊換新溫室氣體減量獎勵辦法中華民國112年1月11日訂定總說明及逐條說明.docx (binary)
  24. Documents/臺中市溫室氣體排放源自主管理辦法.docx (binary)
  25. Documents/行政院環境保護署審查開發行為溫室氣體排放量增量抵換處理原則.docx (binary)
  26. Documents/行政院環境保護署推動產品碳足跡管理要點.docx (binary)
  27. RAG_app.py (+48, -15)
  28. RAG_strategy.py (+41, -12)
  29. add_vectordb.py (+143, -33)
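
Both RAG_app.py and add_vectordb.py below now pull their Supabase credentials from the environment through python-dotenv. A minimal sketch of a startup check, assuming a .env file next to the scripts; SUPABASE_URL, SUPABASE_KEY and SUPABASE_URI appear in the diffs, while OPENAI_API_KEY is an assumption implied by the use of OpenAIEmbeddings:

import os
from dotenv import load_dotenv

# Sketch only: fail fast if the environment the updated scripts expect is incomplete.
load_dotenv()
required = ("SUPABASE_URL", "SUPABASE_KEY", "SUPABASE_URI", "OPENAI_API_KEY")
missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")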

RAG_app.py (+48, -15)

@@ -28,6 +28,13 @@ from Indexing_Split import gen_doc_from_database, gen_doc_from_history
 
 from dotenv import load_dotenv
 import os
+from langchain_community.vectorstores import SupabaseVectorStore
+from langchain_openai import OpenAIEmbeddings
+from supabase.client import Client, create_client
+
+
+from add_vectordb import GetVectorStore
+
 load_dotenv()
 URI = os.getenv("SUPABASE_URI")
 
@@ -36,16 +43,32 @@ global_retriever = None
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global global_retriever
+    global vector_store
+    
     start = time.time()
-    global_retriever = split_retriever(path='./Documents', extension="docx")
+    # global_retriever = split_retriever(path='./Documents', extension="docx")
     # global_retriever = raptor_retriever(path='../Documents', extension="txt")
     # global_retriever = unstructured_retriever(path='../Documents')
+
+    supabase_url = os.environ.get("SUPABASE_URL")
+    supabase_key = os.environ.get("SUPABASE_KEY")
+    document_table = "documents"
+    supabase: Client = create_client(supabase_url, supabase_key)
+
+    embeddings = OpenAIEmbeddings()
+    vector_store = GetVectorStore(embeddings, supabase, document_table)
+    global_retriever = vector_store.as_retriever(search_kwargs={"k": 4})
+
     print(time.time() - start)
     yield
 
 def get_retriever():
     return global_retriever
 
+
+def get_vector_store():
+    return vector_store
+
 app = FastAPI(lifespan=lifespan)
 
 # templates = Jinja2Templates(directory="temp")
@@ -89,25 +112,35 @@ class ChatHistoryItem(BaseModel):
 @app.post("/answer_with_history")
 def multi_query_answer(question: Optional[str] = '', chat_history: List[ChatHistoryItem] = Body(...), retriever=Depends(get_retriever)):
     start = time.time()
-
+    
     chat_history = [(item.q, item.a) for item in chat_history if item.a != ""]
     print(chat_history)
 
+    # TODO: similarity search
+    
     with get_openai_callback() as cb:
-        # qa_doc = gen_doc_from_database()
-        # qa_history_doc = gen_doc_from_history()
-        # qa_doc.extend(qa_history_doc)
-        # vectorstore = Chroma.from_documents(documents=qa_doc, embedding=OpenAIEmbeddings(), collection_name="qa_pairs")
-        # retriever_qa = vectorstore.as_retriever(search_kwargs={"k": 3})
-        # final_answer, reference_docs = naive_rag_for_qapairs(question, retriever_qa)
-        final_answer = 'False'
-        if final_answer == 'False':
-            final_answer, reference_docs = multi_query(question, retriever, chat_history)
+        final_answer, reference_docs = multi_query(question, retriever, chat_history)
+    processing_time = time.time() - start
+    print(processing_time)
+    save_history(question, final_answer, reference_docs, cb, processing_time)
 
-    # print(CHAT_HISTORY)
+    return {"Answer": final_answer}
+
+
+@app.post("/answer_with_history2")
+def multi_query_answer(question: Optional[str] = '', extension: Optional[str] = 'pdf', chat_history: List[ChatHistoryItem] = Body(...), retriever=Depends(get_retriever)):
+    start = time.time()
+
+    retriever = vector_store.as_retriever(search_kwargs={"k": 4,
+                                                         'filter': {'extension':extension}})
     
-    # with get_openai_callback() as cb:
-    #     final_answer, reference_docs = multi_query(question, retriever)
+    chat_history = [(item.q, item.a) for item in chat_history if item.a != ""]
+    print(chat_history)
+
+    # TODO: similarity search
+    
+    with get_openai_callback() as cb:
+        final_answer, reference_docs = multi_query(question, retriever, chat_history)
     processing_time = time.time() - start
     print(processing_time)
     save_history(question, final_answer, reference_docs, cb, processing_time)
@@ -148,6 +181,6 @@ async def get_history():
     return result.values()
 
 if __name__ == "__main__":
-    uvicorn.run("RAG_app:app", host='cmm.ai', port=8081, ssl_keyfile="/etc/letsencrypt/live/cmm.ai/privkey.pem", 
+    uvicorn.run("RAG_app:app", host='cmm.ai', port=8081, reload=True, ssl_keyfile="/etc/letsencrypt/live/cmm.ai/privkey.pem", 
                 ssl_certfile="/etc/letsencrypt/live/cmm.ai/fullchain.pem")
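
The new /answer_with_history2 route reuses the handler name multi_query_answer; since FastAPI registers routes at decoration time, both endpoints remain reachable. A minimal client sketch for the new route, assuming the deployment shown in uvicorn.run() above; the question and chat history below are illustrative placeholders. question and extension travel as query parameters, and the body is a JSON list of ChatHistoryItem objects:

import requests

# Sketch only: call the extension-filtered endpoint added in this commit.
BASE_URL = "https://cmm.ai:8081"  # host/port taken from the uvicorn.run() call above
history = [{"q": "什麼是碳費?", "a": "碳費是針對溫室氣體排放徵收的費用。"}]  # illustrative history

resp = requests.post(
    f"{BASE_URL}/answer_with_history2",
    params={"question": "碳費費率如何訂定?", "extension": "docx"},
    json=history,
)
print(resp.json()["Answer"])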
 
 

RAG_strategy.py (+41, -12)

@@ -52,7 +52,12 @@ def multi_query(question, retriever, chat_history):
         different versions of the given user question to retrieve relevant documents from a vector 
         database. By generating multiple perspectives on the user question, your goal is to help
         the user overcome some of the limitations of the distance-based similarity search. 
-        Provide these alternative questions separated by newlines. Original question: {question}"""
+        Provide these alternative questions separated by newlines. 
+
+        You must return original question also, which means that you return 1 original version + 3 different versions = 4 questions.
+        
+        
+        Original question: {question}"""
         prompt_perspectives = ChatPromptTemplate.from_template(template)
 
         
@@ -117,7 +122,13 @@ def multi_query_rag_prompt(retrieval_chain, question):
         | StrOutputParser()
     )
 
-    answer = final_rag_chain.invoke({"question":question})
+    # answer = final_rag_chain.invoke({"question":question})
+
+    answer = ""
+    for text in final_rag_chain.stream({"question":question}):
+        print(text, end="", flush=True)
+        answer += text
+
 
     return answer
 ########################################################################################################################
@@ -125,18 +136,36 @@ def multi_query_rag_prompt(retrieval_chain, question):
 def get_search_query():
     # Condense a chat history and follow-up question into a standalone question
     # 
-    _template = """Given the following conversation and a follow up question, 
-    rephrase the follow up question to be a standalone question to help others understand the question without having to go back to the conversation transcript.
-    Generate standalone question in its original language.
-    Chat History:
+    # _template = """Given the following conversation and a follow up question, 
+    # rephrase the follow up question to be a standalone question to help others understand the question without having to go back to the conversation transcript.
+    # Generate standalone question in its original language.
+    # Chat History:
+    # {chat_history}
+    # Follow Up Input: {question}
+
+    # Hint:
+    # * Refer to chat history and add the subject to the question
+    # * Replace the pronouns in the question with the correct person or thing, please refer to chat history
+    
+    # Standalone question:"""  # noqa: E501
+    _template = """Rewrite the following query by incorporating relevant context from the conversation history.
+    The rewritten query should:
+    
+    - Preserve the core intent and meaning of the original query
+    - Expand and clarify the query to make it more specific and informative for retrieving relevant context
+    - Avoid introducing new topics or queries that deviate from the original query
+    - DONT EVER ANSWER the Original query, but instead focus on rephrasing and expanding it into a new query
+    - The rewritten query should be in its original language.
+    
+    Return ONLY the rewritten query text, without any additional formatting or explanations.
+    
+    Conversation History:
     {chat_history}
-    Follow Up Input: {question}
-
-    Hint:
-    * Refer to chat history and add the subject to the question
-    * Replace the pronouns in the question with the correct person or thing, please refer to chat history
     
-    Standalone question:"""  # noqa: E501
+    Original query: [{question}]
+    
+    Rewritten query: 
+    """
     CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
 
     def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
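
With the updated template, the multi-query step is expected to return the original question plus three rewrites separated by newlines. The code that parses that output is not part of this hunk, so the following is only a sketch of the usual way such output is split into retrieval queries; the sample output is illustrative:

# Sketch only: split the newline-separated LLM output into individual queries.
def parse_generated_queries(llm_output: str) -> list[str]:
    return [line.strip() for line in llm_output.split("\n") if line.strip()]

sample_output = "什麼是碳費?\n碳費的徵收對象為何?\n碳費費率如何決定?\n碳費收入如何運用?"
queries = parse_generated_queries(sample_output)
assert len(queries) == 4  # 1 original + 3 rewritten versions, per the new prompt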

add_vectordb.py (+143, -33)

@@ -6,16 +6,32 @@ from langchain_community.vectorstores import Chroma
 from langchain_community.document_loaders import TextLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import Docx2txtLoader
 
 import os
 import glob
 
-def read_and_split_files(path='Documents', extension="pdf"):
-    txt_files = glob.glob(os.path.join(path, f"*.{extension}"))
-        
-    doc = []
-    for file_path in txt_files:
-        doc.append(file_path)
+from langchain_community.vectorstores import SupabaseVectorStore
+from langchain_openai import OpenAIEmbeddings
+from supabase.client import Client, create_client
+
+
+def get_data_list(data_list=None, path=None, extension=None, update=False):
+    files = data_list or glob.glob(os.path.join(path, f"*.{extension}"))
+    if update:    
+        doc = files.copy()
+    else:
+        existed_data = check_existed_data(supabase)
+        doc = []
+        for file_path in files:
+            filename = os.path.basename(file_path)
+            if filename not in existed_data:
+                doc.append(file_path)
+
+    return doc
+
+
+def read_and_split_files(data_list=None, path=None, extension=None, update=False):
 
     def load_and_split(file_list):
         chunks = []
@@ -24,56 +40,150 @@ def read_and_split_files(path='Documents', extension="pdf"):
                 loader = TextLoader(file, encoding='utf-8')
             elif file.endswith(".pdf"):
                 loader = PyPDFLoader(file)
+            elif file.endswith(".docx"):
+                loader = Docx2txtLoader(file)
             else:
-                raise ValueError(f"Unsupported file extension: {file}")
+                print(f"Unsupported file extension: {file}")
+                continue
 
             docs = loader.load()
 
             # Split
-            text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                chunk_size=1000, chunk_overlap=200)
+            if file.endswith(".docx"):
+                separators = ['\u25cb\s*第.*?條', '\u25cf\s*第.*?條']
+                text_splitter = RecursiveCharacterTextSplitter(is_separator_regex=True, separators=separators, chunk_size=300, chunk_overlap=0)
+            else:
+                text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=0)
             splits = text_splitter.split_documents(docs)
 
             chunks.extend(splits)
 
         return chunks
 
+
+    doc = get_data_list(data_list=data_list, path=path, extension=extension, update=update)
     # Index
     docs = load_and_split(doc)
-    # qa_history_doc = gen_doc_from_history()
-    # docs.extend(qa_history_doc)
-    # web_doc = web_data(os.path.join(path, 'web_url.csv'))
-    # docs.extend(web_doc)
 
     return docs
 
+def create_ids(docs):
+    # Create a dictionary to count occurrences of each page in each document
+    page_counter = {}
+
+    # List to store the resulting IDs
+    document_ids = []
+
+    # Generate IDs
+    for doc in [docs[i].metadata for i in range(len(docs))]:
+        source = doc['source']
+        file_name = os.path.basename(source).split('.')[0]
+
+        if "page" in doc.keys():
+            page = doc['page']
+            key = f"{source}_{page}"
+        else:
+            key = f"{source}"
+
+        if key not in page_counter:
+            page_counter[key] = 1
+        else:
+            page_counter[key] += 1
+        
+        if "page" in doc.keys():
+            doc_id = f"{file_name} | page {page} | chunk {page_counter[key]}"
+        else:
+            doc_id = f"{file_name} | chunk {page_counter[key]}"
+
+        
+        document_ids.append(doc_id)
+
+    return document_ids
+
+def get_document(data_list=None, path=None, extension=None, update=False):
+    docs = read_and_split_files(data_list=data_list, path=path, extension=extension, update=update)
+    document_ids = create_ids(docs)
+
+    for doc in docs:
+        doc.metadata['source'] = os.path.basename(doc.metadata['source'])
+        # print(doc.metadata)
+
+    # document_metadatas = [{'source': doc.metadata['source'], 'page': doc.metadata['page'], 'chunk': int(id.split("chunk ")[-1])} for doc, id in zip(docs, document_ids)]
+    document_metadatas = []
+
+    for doc, id in zip(docs, document_ids):
+        chunk_number = int(id.split("chunk ")[-1])
+        doc.metadata['chunk'] = chunk_number
+        doc.metadata['extension'] = os.path.basename(doc.metadata['source']).split(".")[-1]
+        document_metadatas.append(doc.metadata)
+
+    documents = [docs.metadata['source'].split(".")[0] + docs.page_content for docs in docs]
+
+    return document_ids, documents, document_metadatas
 
-def create_vectordb(docs):
-    path = "../SYSTEX_精誠/RAG/Documents/"
-    docs = read_and_split_files(path)
+def check_existed_data(supabase):
+    response = supabase.table('documents').select("id, metadata").execute()
+    existed_data = list(set([data['metadata']['source'] for data in response.data]))
+    # existed_data = [(data['id'], data['metadata']['source']) for data in response.data]
+    return existed_data
 
-    persist_directory = 'db'
+class GetVectorStore(SupabaseVectorStore):
+    def __init__(self, embeddings, supabase, table_name):
+        super().__init__(embedding=embeddings, client=supabase, table_name=table_name, query_name="match_documents")
 
-    embedding = OpenAIEmbeddings()
+    def insert(self, documents, document_metadatas):
+        self.add_texts(
+            texts=documents,
+            metadatas=document_metadatas,
+        )
 
-    vectordb = Chroma.from_documents(documents = docs,
-                                    embedding = embedding,
-                                    persist_directory = persist_directory)
+    def delete(self, file_list):
+        for file_name in file_list:
+            self._client.table(self.table_name).delete().eq('metadata->>source', file_name).execute()
 
-    # 用persist方式執行vectordb,可以將db資料寫到磁碟中
-    vectordb.persist()
+    def update(self, documents, document_metadatas, update_existing_data=False):
+        if not document_metadatas:  # no new data
+            return
 
+        if update_existing_data:
+            file_list = list(set(metadata['source'] for metadata in document_metadatas))
+            self.delete(file_list)
 
-def use_vectordb(persist_directory):
-    # embedding使用OpenAI的Embedding
-    embedding = OpenAIEmbeddings()
+        self.insert(documents, document_metadatas)
 
-    # 使用的vectordb
+if __name__ == "__main__":
 
-    vectordb = Chroma(persist_directory=persist_directory, 
-                    embedding_function=embedding)
+    load_dotenv()
+    supabase_url = os.environ.get("SUPABASE_URL")
+    supabase_key = os.environ.get("SUPABASE_KEY")
+    document_table = "documents"
+    supabase: Client = create_client(supabase_url, supabase_key)
+
+    embeddings = OpenAIEmbeddings()
+
+    # get vector store
+    vector_store = GetVectorStore(embeddings, supabase, document_table)
+
+    # update data (old + new / all new / all old)
+    path = "/home/mia/systex/Documents"
+    extension = "pdf"
+    # file = None
+
+    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
+    # file = [os.path.join(path, file) for file in file_list]
+    file_list = glob.glob(os.path.join(path, "*"))
+    print(file_list)
     
-def use_retriever(vectordb):
-    retriever = vectordb.as_retriever(search_kwargs={'k': 2})
-    ans2 = retriever.invoke('溫室氣體種類')
-    print(ans2)
+    update = True
+    document_ids, documents, document_metadatas = get_document(data_list=file_list, path=path, extension=extension, update=update)
+    vector_store.update(documents, document_metadatas, update_existing_data=update)
+
+    # insert new data (all new)
+    # vector_store.insert(documents, document_metadatas)
+
+    # delete data
+    # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
+    # vector_store.delete(file_list)
+
+    # get retriver
+    # retriever = vector_store.as_retriever(search_kwargs={"k": 6})
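
Once this script has populated the documents table, the store can be queried with the extension metadata written by get_document(), mirroring the filter that /answer_with_history2 in RAG_app.py passes to its retriever. A minimal sketch, assuming the Supabase environment variables above are set; the query string is a placeholder:

# Sketch only: retrieve docx-derived chunks from the populated Supabase table.
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from supabase.client import create_client
from add_vectordb import GetVectorStore

load_dotenv()
supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
vector_store = GetVectorStore(OpenAIEmbeddings(), supabase, "documents")

# Restrict the similarity search to chunks whose 'extension' metadata is 'docx'.
retriever = vector_store.as_retriever(search_kwargs={"k": 4, "filter": {"extension": "docx"}})
for doc in retriever.invoke("溫室氣體排放量盤查登錄"):
    print(doc.metadata["source"], "|", doc.page_content[:50])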