
create file_loader folder and add website loader file

ling committed 4 months ago · commit fb4ff0d1ca
6 changed files with 407 additions and 12 deletions
  1. RAG_app.py (+1 -1)
  2. faiss_index.py (+1 -1)
  3. file_loader/add_vectordb.py (+42 -10)
  4. file_loader/news_documents.py (+60 -0)
  5. file_loader/news_vectordb.py (+30 -0)
  6. file_loader/website_loader.py (+273 -0)

+ 1 - 1
RAG_app.py

@@ -38,7 +38,7 @@ from langchain_openai import OpenAIEmbeddings
 from supabase.client import Client, create_client
 
 
-from add_vectordb import GetVectorStore
+from file_loader.add_vectordb import GetVectorStore
 from faiss_index import create_faiss_retriever, faiss_query
 from local_llm import ollama_, hf
 # from local_llm import ollama_, taide_llm, hf

+ 1 - 1
faiss_index.py

@@ -28,7 +28,7 @@ system_prompt: str = "你是一個來自台灣的AI助理,你的名字是 TAID
 
 from langchain.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-from add_vectordb import GetVectorStore
+from file_loader.add_vectordb import GetVectorStore
 # from local_llm import ollama_, hf
 # # from local_llm import ollama_, taide_llm, hf
 # llm = hf()
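
Both entry points now pull GetVectorStore through the new file_loader package. A minimal sketch of the layout this implies, assuming the repository root is the working directory when RAG_app.py and faiss_index.py are launched (the __init__.py is optional under Python 3 namespace packages and is not part of this commit):

    # assumed layout after this commit; file_loader/__init__.py is hypothetical
    # .
    # ├── RAG_app.py
    # ├── faiss_index.py
    # └── file_loader/
    #     ├── __init__.py          # optional: Python 3 also resolves it as a namespace package
    #     ├── add_vectordb.py
    #     ├── news_documents.py
    #     ├── news_vectordb.py
    #     └── website_loader.py

    from file_loader.add_vectordb import GetVectorStore   # resolves when run from the repo root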

+ 42 - 10
add_vectordb.py → file_loader/add_vectordb.py

@@ -1,5 +1,5 @@
 from dotenv import load_dotenv
-load_dotenv()
+load_dotenv("../.env")
 
 from langchain_openai import OpenAIEmbeddings
 from langchain_community.vectorstores import Chroma
@@ -7,6 +7,7 @@ from langchain_community.document_loaders import TextLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.document_loaders import Docx2txtLoader
+from langchain_community.document_loaders import CSVLoader
 
 import os
 import glob
@@ -15,7 +16,7 @@ from langchain_community.vectorstores import SupabaseVectorStore
 from langchain_openai import OpenAIEmbeddings
 from supabase.client import Client, create_client
 
-
+document_table = "documents"
 def get_data_list(data_list=None, path=None, extension=None, update=False):
     files = data_list or glob.glob(os.path.join(path, f"*.{extension}"))
     if update:    
@@ -33,6 +34,22 @@ def get_data_list(data_list=None, path=None, extension=None, update=False):
 
 def read_and_split_files(data_list=None, path=None, extension=None, update=False):
 
+    def read_csv(path):
+        extension = "csv"
+        # path = r"./Phase2/"
+        files = glob.glob(os.path.join(path, f"*.{extension}"))
+            
+        documents = []
+        for file_path in files:
+            print(file_path)
+
+            loader = CSVLoader(file_path, encoding="utf-8")
+            doc = loader.load()
+
+            documents.extend(doc)
+        
+        return documents
+    
     def load_and_split(file_list):
         chunks = []
         for file in file_list:
@@ -48,6 +65,7 @@ def read_and_split_files(data_list=None, path=None, extension=None, update=False
 
             docs = loader.load()
 
+
             # Split
             if file.endswith(".docx"):
                 separators = ['\u25cb\s*第.*?條', '\u25cf\s*第.*?條']
@@ -57,11 +75,21 @@ def read_and_split_files(data_list=None, path=None, extension=None, update=False
             splits = text_splitter.split_documents(docs)
 
             chunks.extend(splits)
+        # read CSV files once, after the per-file loop, to avoid duplicating them
+        chunks.extend(read_csv(path))
 
         return chunks
+    
 
 
+    # specific data type
     doc = get_data_list(data_list=data_list, path=path, extension=extension, update=update)
+
+    # web url
+
+    # csv
+    # doc = read_csv(path)
+
     # Index
     docs = load_and_split(doc)
 
@@ -122,7 +150,7 @@ def get_document(data_list=None, path=None, extension=None, update=False):
     return document_ids, documents, document_metadatas
 
 def check_existed_data(supabase):
-    response = supabase.table('documents').select("id, metadata").execute()
+    response = supabase.table(document_table).select("id, metadata").execute()
     existed_data = list(set([data['metadata']['source'] for data in response.data]))
     # existed_data = [(data['id'], data['metadata']['source']) for data in response.data]
     return existed_data
@@ -153,7 +181,7 @@ class GetVectorStore(SupabaseVectorStore):
 
 if __name__ == "__main__":
 
-    load_dotenv()
+    load_dotenv("../.env")
     supabase_url = os.environ.get("SUPABASE_URL")
     supabase_key = os.environ.get("SUPABASE_KEY")
     document_table = "documents"
@@ -173,12 +201,13 @@ if __name__ == "__main__":
 
     # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
     # file = [os.path.join(path, file) for file in file_list]
-    file_list = glob.glob(os.path.join(path, "*"))
-    print(file_list)
+    # file_list = glob.glob(os.path.join(path, "*"))
+    file_list = glob.glob(os.path.join(path, f"*.{extension}"))
+    # print(file_list)
     
-    update = False
-    document_ids, documents, document_metadatas = get_document(data_list=file_list, path=path, extension=extension, update=update)
-    vector_store.update(documents, document_metadatas, update_existing_data=update)
+    # update = False
+    # document_ids, documents, document_metadatas = get_document(data_list=file_list, path=path, extension=extension, update=update)
+    # vector_store.update(documents, document_metadatas, update_existing_data=update)
 
     ###################################################################################
     # insert new data (all new)
@@ -187,7 +216,10 @@ if __name__ == "__main__":
     ###################################################################################
     # delete data
     # file_list = ["溫室氣體排放量盤查作業指引113年版.pdf"]
-    # vector_store.delete(file_list)
+    file_list = glob.glob(os.path.join(path, f"*.docx"))
+    file_list = [os.path.basename(file_path) for file_path in file_list]
+    print(file_list)
+    vector_store.delete(file_list)
 
     ###################################################################################
     # get retriever
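
With the update call in __main__ now commented out and the active branch deleting .docx entries, inserting fresh files through the relocated module would look roughly like the sketch below. It reuses get_document and the insert call pattern from news_vectordb.py; the folder path and extension are placeholders, and the exact GetVectorStore.insert signature is assumed from that usage rather than shown in this diff.

    # a minimal sketch, assuming it runs from inside file_loader/ so load_dotenv("../.env") resolves
    import glob
    import os

    from dotenv import load_dotenv
    from langchain_openai import OpenAIEmbeddings
    from supabase import Client, create_client

    from add_vectordb import GetVectorStore, get_document

    load_dotenv("../.env")
    supabase: Client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
    vector_store = GetVectorStore(OpenAIEmbeddings(), supabase, "documents")

    path, extension = "../Documents/", "pdf"                 # hypothetical folder and file type
    file_list = glob.glob(os.path.join(path, f"*.{extension}"))
    _, documents, document_metadatas = get_document(
        data_list=file_list, path=path, extension=extension, update=False)
    vector_store.insert(documents, document_metadatas)       # same call pattern as news_vectordb.py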

+ 60 - 0
file_loader/news_documents.py

@@ -0,0 +1,60 @@
+import re
+from langchain_core.documents import Document
+from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
+
+from langchain_community.document_loaders import WebBaseLoader
+
+from website_loader import get_web_loader
+
+class NewsLoader(WebBaseLoader):
+    def __init__(self, url: str, supabase_data_list):
+        super().__init__(url)
+        self.supabase_data_list = supabase_data_list
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+
+
+    def aload(self) -> List[Document]:  # type: ignore
+        """Load text from the urls in web_path async into Documents."""
+
+        # results = self.scrape_all(self.web_paths)
+        docs = []
+        documents = [] 
+        document_metadatas = []
+        for url, supabase_data in zip(self.web_paths, self.supabase_data_list):
+            print(url)
+            web_loader = get_web_loader(url)
+            text = web_loader(url, self.headers)
+            documents.append(text)
+
+            metadata = supabase_data
+            document_metadatas.append(metadata)
+
+            docs.append(Document(page_content=text, metadata=metadata))
+            
+        return docs, documents, document_metadatas
+    
+if __name__ == '__main__':
+    import os
+    from supabase import create_client, Client
+    from dotenv import load_dotenv
+    load_dotenv("../.env")
+
+    SUPABASE_URL = os.getenv('SUPABASE_URL')
+    SUPABASE_KEY = os.getenv('SUPABASE_KEY')
+    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
+
+    response = supabase.table("systex_website_data").select("title", "date", "url", "search_kw", "category", "related_kw", "official_website_source").execute()
+    url_list = [data['url'] for data in response.data]
+    supabase_data_list = response.data
+
+    import nest_asyncio
+
+    nest_asyncio.apply()
+
+    loader = NewsLoader(url_list[-5:], supabase_data_list[-5:])
+    loader.requests_per_second = 1
+    docs, documents, document_metadatas = loader.aload()
+    print(docs)
+

+ 30 - 0
file_loader/news_vectordb.py

@@ -0,0 +1,30 @@
+import os
+from dotenv import load_dotenv
+from langchain_openai import OpenAIEmbeddings
+import pandas as pd
+from supabase import Client, create_client
+
+import nest_asyncio
+nest_asyncio.apply()
+
+from news_documents import NewsLoader
+from add_vectordb import GetVectorStore
+
+load_dotenv("../.env")
+supabase_url = os.environ.get("SUPABASE_URL")
+supabase_key = os.environ.get("SUPABASE_KEY")
+document_table = "documents"
+supabase: Client = create_client(supabase_url, supabase_key)
+
+embeddings = OpenAIEmbeddings()
+
+vector_store = GetVectorStore(embeddings, supabase, document_table)
+
+response = supabase.table("systex_website_data").select("title", "date", "url", "search_kw", "category", "related_kw", "official_website_source").execute()
+url_list = [data['url'] for data in response.data]
+supabase_data_list = response.data
+
+loader = NewsLoader(url_list, supabase_data_list)
+loader.requests_per_second = 2
+_, documents, document_metadatas = loader.aload()
+vector_store.insert(documents, document_metadatas)
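
As written, news_vectordb.py reloads and re-inserts every row of systex_website_data on each run. Continuing the script above, a hedged sketch of skipping articles that are already stored, under the assumption that the inserted metadata keeps each row's url field:

    # hedged sketch: filter out URLs already present in the vector store's metadata
    existing = supabase.table(document_table).select("metadata").execute()
    existing_urls = {row["metadata"].get("url") for row in existing.data}

    new_rows = [row for row in supabase_data_list if row["url"] not in existing_urls]
    if new_rows:
        loader = NewsLoader([row["url"] for row in new_rows], new_rows)
        loader.requests_per_second = 2
        _, documents, document_metadatas = loader.aload()
        vector_store.insert(documents, document_metadatas)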

+ 273 - 0
file_loader/website_loader.py

@@ -0,0 +1,273 @@
+import requests
+from bs4 import BeautifulSoup
+import re
+
+
+def einfo(url, headers):
+
+    web_url = "https://e-info.org.tw/"
+    if not url.startswith(web_url):
+        raise ValueError("URL must start with {}".format(web_url))
+    
+
+    # get news content soup
+    response = requests.get(url, headers=headers)
+    response.encoding = 'utf-8'
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # get news title
+    news_title = soup.find('title')
+    title = news_title.get_text(strip=True)
+
+    # get news content
+    news_content = []
+    news_content_divs = soup.find_all('div', class_='field-item even')
+    if news_content_divs and len(news_content_divs) > 0 :
+        for div in news_content_divs:
+            for tag in div.find_all(['h1', 'h2', 'h3', 'p']):
+                news_content.append(tag.get_text(strip=True))
+    else:
+        news_text = "未找到新聞内容"
+        raise Exception(f'news content is empty. url: {url}')
+
+    if len(news_content) == 0:
+        raise Exception(f'news content is empty. url: {url}')
+
+    # combine all text
+    news_text = title + "\n" + "\n".join(news_content)
+
+    return news_text
+
+def csrone(url, headers):
+
+    web_url = "https://csrone.com/"
+    if not url.startswith(web_url):
+        raise ValueError("URL must start with {}".format(web_url))
+    
+
+    # get news content soup
+    response = requests.get(url, headers=headers)
+    response.encoding = 'utf-8'
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+
+    # get news title
+    news_title = soup.find('h2', class_=False)
+    title = news_title.get_text(strip=True)
+
+    # get news content
+    news_content = []
+    news_content_divs = soup.find_all('div', class_="article_content text-break")
+    if news_content_divs and len(news_content_divs) > 0:
+        for div in news_content_divs:
+            for tag in div.find_all(['h1', 'h2', 'h3', 'p', 'pre']):
+                news_content.append(tag.get_text(strip=True))
+    else:
+        news_text = "未找到新聞内容"
+        raise Exception(f'news content is empty. url: {url}')
+
+    if len(news_content) == 0:
+        raise Exception(f'news content is empty. url: {url}')
+
+    # combine all text
+    news_text = title + "\n" + "\n".join(news_content)
+
+    return news_text
+
+def enews(url, headers):
+
+    web_url = "https://enews.moenv.gov.tw/"
+    if not url.startswith(web_url):
+        raise ValueError("URL must start with {}".format(web_url))
+    
+
+    # get news content soup
+    response = requests.get(url, headers=headers)
+    response.encoding = 'utf-8'
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # get news title
+    news_title = soup.find('h2', class_="main-title")
+    title = news_title.get_text(strip=True)
+
+    # get news content
+    news_content = []
+    news_content_divs = soup.find_all('div', class_="news-info-paragraph")
+    if news_content_divs and len(news_content_divs) > 0 :
+        for div in news_content_divs:
+            for tag in div.find_all("span"):
+                news_content.append(tag.get_text(strip=True))
+    else:
+        news_text = "未找到新聞内容"
+        raise Exception(f'news content is empty. url: {url}')
+    
+    if len(news_content) == 0:
+        raise Exception(f'news content is empty. url: {url}')
+
+    # combine all text
+    news_text = title + "\n" + "\n".join(news_content)
+
+    return news_text
+
+def esg_gvm(url, headers):
+
+    web_url = "https://esg.gvm.com.tw/"
+    if not url.startswith(web_url):
+        raise ValueError("URL must start with {}".format(web_url))
+    
+
+    # get news content soup
+    response = requests.get(url, headers=headers)
+    response.encoding = 'utf-8'
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+
+    # get news title
+    news_title = soup.find('h1')
+    title = news_title.get_text(strip=True)
+
+    # get news content
+    news_content = []
+    abstract_content = soup.find('h2', class_="post_excerpt my-4 text-primary")
+    abstract = abstract_content.get_text(strip=True)
+    news_content.append(abstract)
+    
+    news_content_divs = soup.find_all('div', class_="col-xl-7 col-lg-10 post-content-container")
+    if news_content_divs and len(news_content_divs) > 0 :
+        for div in news_content_divs:
+            for tag in div.find_all(["h2", "h3", "p"], class_=False):
+                news_content.append(tag.get_text(strip=True))
+    else:
+        news_text = "未找到新聞内容"
+        raise Exception(f'news content is empty. url: {url}')
+
+    if len(news_content) == 0:
+        raise Exception(f'news content is empty. url: {url}')
+
+    # combine all text
+    news_text = title + "\n" + "\n".join(news_content)
+
+    return news_text
+
+def fsc(url, headers):
+
+    web_url = "https://www.fsc.gov.tw/"
+    if not url.startswith(web_url):
+        raise ValueError("URL must start with {}".format(web_url))
+    
+
+    # get news content soup
+    response = requests.get(url, headers=headers)
+    response.encoding = 'utf-8'
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+
+    # get news title
+    news_title = soup.find('h3')
+    title = news_title.get_text(strip=True)
+
+    # get news content
+    news_content = []
+    news_content_div = soup.find('div', class_="main-a_03")
+    news_article = news_content_div.get_text(strip=True)
+    news_content.append(news_article)
+    
+    if len(news_content) == 0:
+        raise Exception(f'news content is empty. url: {url}')
+
+    # combine all text
+    news_text = title + "\n" + "\n".join(news_content)
+
+    return news_text
+
+def moeaea(url, headers):
+
+    web_url = "https://www.moeaea.gov.tw/"
+    if not url.startswith(web_url):
+        raise ValueError("URL must start with {}".format(web_url))
+    
+
+    # get news content soup
+    response = requests.get(url, headers=headers)
+    response.encoding = 'utf-8'
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # get news title
+    news_title = soup.find('div', class_="divTitle")
+    title = news_title.get_text(strip=True)
+
+    # get news content
+    news_content = []
+    news_content_div = soup.find('div', style="clear: both; margin-top: 5px;")
+    news_article = news_content_div.get_text(strip=True)
+    news_content.append(news_article)
+    
+    if len(news_content) == 0:
+        raise Exception(f'news content is empty. url: {url}')
+
+    # combine all text
+    news_text = title + "\n" + "\n".join(news_content)
+
+    return news_text
+
+def tcx(url, headers):
+
+    web_url = "https://www.tcx.com.tw/"
+    if not url.startswith(web_url):
+        raise ValueError("URL must start with {}".format(web_url))
+    
+
+    # get news content soup
+    id = url.split("?")[-1]
+    api_url = "https://www.tcx.com.tw/learn/front/newsDetailApi/"+id
+    response = requests.get(api_url, headers=headers)
+    response.encoding = 'utf-8'
+    data = response.json()
+
+    # get news title
+    title = data['detail']['title'].strip()
+
+    # get news content
+    news_content = []
+    soup = BeautifulSoup(data['detail']['content'], 'html.parser')
+    news_content_divs = soup.find_all('p', class_=False, style=False)
+    if news_content_divs and len(news_content_divs) > 0 :
+        for div in news_content_divs:
+            news_content.append(div.get_text(strip=True))
+    else:
+        news_text = "未找到新聞内容"
+        raise Exception(f'news content is empty. url: {url}')
+    
+    if len(news_content) == 0:
+        raise Exception(f'news content is empty. url: {url}')
+
+    # combine all text
+    news_text = title + "\n" + "\n".join(news_content)
+
+    return news_text
+
+
+def get_web_loader(url, web_loaders=None):
+    if web_loaders is None:
+        web_loaders = [
+            {"web": "https://e-info.org.tw/", "web_loader": einfo},
+            {"web": "https://csrone.com/", "web_loader": csrone},
+            {"web": "https://enews.moenv.gov.tw/", "web_loader": enews},
+            {"web": "https://esg.gvm.com.tw/", "web_loader": esg_gvm},
+            {"web": "https://www.fsc.gov.tw/", "web_loader": fsc},
+            {"web": "https://www.moeaea.gov.tw/", "web_loader": moeaea},
+            {"web": "https://www.tcx.com.tw/", "web_loader": tcx}
+            ]
+    for web_loader in web_loaders:
+        if url.startswith(web_loader["web"]):
+            return web_loader["web_loader"]
+    return None
+
+if __name__ == "__main__":
+    url = "https://enews.moenv.gov.tw/Page/3B3C62C78849F32F/871dc06b-4028-42e4-8d36-656e2427180c"
+    web_loader = get_web_loader(url)
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+    text = web_loader(url, headers)
+    print(text)
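
Supporting another news site would mean adding one more parser with the same (url, headers) signature and passing an extended registry to get_web_loader. A hedged sketch continuing the __main__ block above; the news.example.com domain, selectors, and parser are hypothetical, and note that supplying web_loaders replaces the built-in defaults, so any default that should stay active has to be re-listed:

    # hypothetical parser for an extra site; the domain and selectors are placeholders
    def example_news(url, headers):
        web_url = "https://news.example.com/"
        if not url.startswith(web_url):
            raise ValueError("URL must start with {}".format(web_url))

        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('h1').get_text(strip=True)
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        if not paragraphs:
            raise Exception(f'news content is empty. url: {url}')

        return title + "\n" + "\n".join(paragraphs)

    custom_loaders = [
        {"web": "https://news.example.com/", "web_loader": example_news},
        {"web": "https://e-info.org.tw/", "web_loader": einfo},   # re-list any default that should stay active
    ]
    web_loader = get_web_loader("https://news.example.com/article-123", web_loaders=custom_loaders)
    text = web_loader("https://news.example.com/article-123", headers)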