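"""Scrape news URLs with per-site loaders and wrap each page as a LangChain
Document whose metadata is the matching Supabase row."""
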
from typing import List, Sequence, Tuple

from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document

from website_loader import get_web_loader


class NewsLoader(WebBaseLoader):
    """Load news pages listed in Supabase and attach each row as metadata."""

    def __init__(self, urls: Sequence[str], supabase_data_list: List[dict]):
        super().__init__(urls)
        self.supabase_data_list = supabase_data_list
        # Browser-like User-Agent so news sites don't reject the requests.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.124 Safari/537.36'
        }

    def aload(self) -> Tuple[List[Document], List[str], List[dict]]:  # type: ignore[override]
        """Load text from the URLs in web_paths into Documents.

        Synchronous despite the name; returns the Documents plus the raw
        texts and the metadata dicts as separate lists.
        """
        docs = []
        documents = []
        document_metadatas = []
        for url, supabase_data in zip(self.web_paths, self.supabase_data_list):
            print(url)
            # Pick the site-specific scraper for this URL and fetch its text.
            web_loader = get_web_loader(url)
            text = web_loader(url, self.headers)
            documents.append(text)
            document_metadatas.append(supabase_data)
            docs.append(Document(page_content=text, metadata=supabase_data))

        return docs, documents, document_metadatas


if __name__ == '__main__':
    import os

    import nest_asyncio
    from dotenv import load_dotenv
    from supabase import create_client, Client

    load_dotenv("../.env")
    SUPABASE_URL = os.getenv('SUPABASE_URL')
    SUPABASE_KEY = os.getenv('SUPABASE_KEY')
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

    # Pull the article metadata rows; each row's 'url' is what gets scraped.
    response = (
        supabase.table("systex_website_data")
        .select("title", "date", "url", "search_kw", "category",
                "related_kw", "official_website_source")
        .execute()
    )
    supabase_data_list = response.data
    url_list = [data['url'] for data in supabase_data_list]

    # Allow re-entrant event loops in case this runs inside a notebook.
    nest_asyncio.apply()

    # Smoke test: load only the last five rows, throttled to 1 request/s.
    loader = NewsLoader(url_list[-5:], supabase_data_list[-5:])
    loader.requests_per_second = 1
    docs, documents, document_metadatas = loader.aload()
    print(docs)
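
    # Hedged next-step sketch (an addition, not part of the original script):
    # the returned `docs` are standard LangChain Documents, so a typical
    # follow-up is chunking them for a vector store. The splitter choice and
    # chunk sizes below are assumptions, not the project's actual settings.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    print(f"split {len(docs)} documents into {len(chunks)} chunks")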