from typing import List, Tuple

from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader

from website_loader import get_web_loader


class NewsLoader(WebBaseLoader):
    """Load news pages listed in Supabase into LangChain Documents."""

    def __init__(self, urls: List[str], supabase_data_list: List[dict]):
        super().__init__(urls)
        self.supabase_data_list = supabase_data_list
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }

    def aload(self) -> Tuple[List[Document], List[str], List[dict]]:  # type: ignore[override]
        """Load text from the URLs in ``web_paths`` into Documents.

        Returns the assembled Documents plus the raw page texts and their
        Supabase metadata rows as separate lists.
        """
        # Unlike the parent implementation, which would call
        # self.scrape_all(self.web_paths), each URL goes through a
        # site-specific loader so metadata can be paired per page.
        docs: List[Document] = []
        documents: List[str] = []
        document_metadatas: List[dict] = []
        for url, supabase_data in zip(self.web_paths, self.supabase_data_list):
            print(url)
            # get_web_loader picks a scraper for this site; the returned
            # callable takes the URL and request headers and returns the
            # extracted page text.
            web_loader = get_web_loader(url)
            text = web_loader(url, self.headers)
            documents.append(text)
            document_metadatas.append(supabase_data)
            docs.append(Document(page_content=text, metadata=supabase_data))
        return docs, documents, document_metadatas


if __name__ == '__main__':
    import os

    from dotenv import load_dotenv
    from supabase import create_client, Client

    load_dotenv("../.env")
    SUPABASE_URL = os.getenv('SUPABASE_URL')
    SUPABASE_KEY = os.getenv('SUPABASE_KEY')
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

    response = supabase.table("systex_website_data").select(
        "title", "date", "url", "search_kw", "category",
        "related_kw", "official_website_source",
    ).execute()
    url_list = [data['url'] for data in response.data]
    supabase_data_list = response.data

    # WebBaseLoader schedules requests with asyncio; nest_asyncio lets this
    # run in environments that already have an event loop (e.g. notebooks).
    import nest_asyncio
    nest_asyncio.apply()

    loader = NewsLoader(url_list[-5:], supabase_data_list[-5:])
    loader.requests_per_second = 1
    docs, documents, document_metadatas = loader.aload()
    print(docs)
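
    # Minimal follow-up sketch: a typical next step is to chunk the loaded
    # Documents before embedding them. This assumes the
    # langchain-text-splitters package is installed; the chunk_size and
    # chunk_overlap values are illustrative, not taken from this pipeline.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    # split_documents copies each source Document's Supabase metadata onto
    # every chunk derived from it.
    chunks = splitter.split_documents(docs)
    print(f"{len(docs)} documents split into {len(chunks)} chunks")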