news_documents.py

from typing import Any, Dict, List, Tuple

from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader

from website_loader import get_web_loader
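
# NewsLoader pairs each URL stored in Supabase with its metadata row and wraps
# the scraped text in LangChain Documents. It assumes website_loader.get_web_loader
# returns a site-specific callable with the signature loader(url, headers) -> str
# (inferred from its use below); adjust if your helper differs.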
class NewsLoader(WebBaseLoader):
    """WebBaseLoader subclass that attaches a Supabase metadata row to each scraped page."""

    def __init__(self, urls: List[str], supabase_data_list: List[Dict[str, Any]]):
        super().__init__(urls)
        self.supabase_data_list = supabase_data_list
        # Desktop browser User-Agent; some news sites block default client UAs.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
    def aload(self) -> Tuple[List[Document], List[str], List[Dict[str, Any]]]:  # type: ignore[override]
        """Load text from the URLs in web_paths into Documents.

        Note: despite the name, this implementation fetches pages synchronously,
        one at a time. Returns (docs, raw_texts, metadatas).
        """
        # results = self.scrape_all(self.web_paths)
        docs = []
        documents = []
        document_metadatas = []
        # web_paths and supabase_data_list must be aligned one-to-one: each URL
        # is paired with the Supabase row it came from.
        for url, supabase_data in zip(self.web_paths, self.supabase_data_list):
            print(url)
            # get_web_loader selects a site-specific scraper for this URL.
            web_loader = get_web_loader(url)
            text = web_loader(url, self.headers)
            documents.append(text)
            metadata = supabase_data
            document_metadatas.append(metadata)
            docs.append(Document(page_content=text, metadata=metadata))
        return docs, documents, document_metadatas
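
# Quick manual test: pull rows from the systex_website_data table in Supabase
# and scrape the last five returned entries.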
if __name__ == '__main__':
    import os

    import nest_asyncio
    from dotenv import load_dotenv
    from supabase import create_client, Client

    load_dotenv("../.env")
    SUPABASE_URL = os.getenv('SUPABASE_URL')
    SUPABASE_KEY = os.getenv('SUPABASE_KEY')
    if not SUPABASE_URL or not SUPABASE_KEY:
        raise RuntimeError("SUPABASE_URL and SUPABASE_KEY must be set in ../.env")
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

    response = supabase.table("systex_website_data").select(
        "title", "date", "url", "search_kw", "category", "related_kw", "official_website_source"
    ).execute()
    url_list = [data['url'] for data in response.data]
    supabase_data_list = response.data

    # Allow asyncio.run() in environments that already have a running event
    # loop (e.g. Jupyter); WebBaseLoader uses asyncio internally.
    nest_asyncio.apply()

    loader = NewsLoader(url_list[-5:], supabase_data_list[-5:])
    loader.requests_per_second = 1  # throttle to one request per second
    docs, documents, document_metadatas = loader.aload()
    print(docs)
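
# Possible next step (illustrative sketch, not part of this module): split the
# returned Documents into chunks before embedding, e.g. with LangChain's
# RecursiveCharacterTextSplitter; the chunk sizes below are assumptions.
#
#     from langchain_text_splitters import RecursiveCharacterTextSplitter
#     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
#     chunks = splitter.split_documents(docs)  # metadata is copied onto each chunk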