@@ -0,0 +1,273 @@
+import requests
+from bs4 import BeautifulSoup
+import re
def einfo(url, headers, timeout=30):
    """Fetch an e-info.org.tw article and return its title and text.

    Args:
        url: Article URL; must start with ``https://e-info.org.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page title followed by the text of every ``h1``/``h2``/``h3``/``p``
        element found in the article containers, one per line.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: If the title or the article text cannot be found.
    """
    web_url = "https://e-info.org.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title: fail with a descriptive error instead of an AttributeError.
    news_title = soup.find('title')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # Body: text of the headings and paragraphs in each article container.
    news_content = [
        tag.get_text(strip=True)
        for div in soup.find_all('div', class_='field-item even')
        for tag in div.find_all(['h1', 'h2', 'h3', 'p'])
    ]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine title and body into a single text block.
    return title + "\n" + "\n".join(news_content)
def csrone(url, headers, timeout=30):
    """Fetch a csrone.com article and return its title and text.

    Args:
        url: Article URL; must start with ``https://csrone.com/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The article title followed by the text of every
        ``h1``/``h2``/``h3``/``p``/``pre`` element found in the article
        containers, one per line.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: If the title or the article text cannot be found.
    """
    web_url = "https://csrone.com/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title: the first <h2> that carries no class attribute.
    news_title = soup.find('h2', class_=False)
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # Body: text of the headings, paragraphs and <pre> blocks in each container.
    news_content = [
        tag.get_text(strip=True)
        for div in soup.find_all('div', class_="article_content text-break")
        for tag in div.find_all(['h1', 'h2', 'h3', 'p', 'pre'])
    ]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine title and body into a single text block.
    return title + "\n" + "\n".join(news_content)
def enews(url, headers, timeout=30):
    """Fetch an enews.moenv.gov.tw article and return its title and text.

    Args:
        url: Article URL; must start with ``https://enews.moenv.gov.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The article title followed by the text of every ``span`` found in
        the ``news-info-paragraph`` containers, one per line.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: If the title or the article text cannot be found.
    """
    web_url = "https://enews.moenv.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title: fail with a descriptive error instead of an AttributeError.
    news_title = soup.find('h2', class_="main-title")
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # Body: text of every <span> inside the paragraph containers.
    news_content = [
        tag.get_text(strip=True)
        for div in soup.find_all('div', class_="news-info-paragraph")
        for tag in div.find_all("span")
    ]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine title and body into a single text block.
    return title + "\n" + "\n".join(news_content)
def esg_gvm(url, headers, timeout=30):
    """Fetch an esg.gvm.com.tw article and return its title and text.

    Args:
        url: Article URL; must start with ``https://esg.gvm.com.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The article title, then the excerpt (when the page has one), then
        the text of every class-less ``h2``/``h3``/``p`` element found in
        the post containers, one per line.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: If the title, the post container, or any article text
            cannot be found.
    """
    web_url = "https://esg.gvm.com.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title: fail with a descriptive error instead of an AttributeError.
    news_title = soup.find('h1')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    news_content = []

    # The excerpt is treated as optional: a page without one is still usable.
    abstract_content = soup.find('h2', class_="post_excerpt my-4 text-primary")
    if abstract_content is not None:
        news_content.append(abstract_content.get_text(strip=True))

    # The post container itself is mandatory, even when an excerpt exists.
    content_divs = soup.find_all(
        'div', class_="col-xl-7 col-lg-10 post-content-container"
    )
    if not content_divs:
        raise Exception(f'news content is empty. url: {url}')
    news_content.extend(
        tag.get_text(strip=True)
        for div in content_divs
        for tag in div.find_all(["h2", "h3", "p"], class_=False)
    )
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine title and body into a single text block.
    return title + "\n" + "\n".join(news_content)
def fsc(url, headers, timeout=30):
    """Fetch a www.fsc.gov.tw news page and return its title and text.

    Args:
        url: Page URL; must start with ``https://www.fsc.gov.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``h3`` element, a newline, then the full text
        of the ``main-a_03`` container.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: If the title is missing, or the content container is
            missing or has no text.
    """
    web_url = "https://www.fsc.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title: fail with a descriptive error instead of an AttributeError.
    news_title = soup.find('h3')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # Body: the whole container is taken as one string, so emptiness must be
    # checked on the container and its text rather than on a list length.
    news_content_div = soup.find('div', class_="main-a_03")
    if news_content_div is None:
        raise Exception(f'news content is empty. url: {url}')
    news_article = news_content_div.get_text(strip=True)
    if not news_article:
        raise Exception(f'news content is empty. url: {url}')

    # Combine title and body into a single text block.
    return title + "\n" + news_article
def moeaea(url, headers, timeout=30):
    """Fetch a www.moeaea.gov.tw news page and return its title and text.

    Args:
        url: Page URL; must start with ``https://www.moeaea.gov.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the ``divTitle`` element, a newline, then the full text
        of the content container.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: If the title is missing, or the content container is
            missing or has no text.
    """
    web_url = "https://www.moeaea.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title: fail with a descriptive error instead of an AttributeError.
    news_title = soup.find('div', class_="divTitle")
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # Body: the container is located by its exact inline style string, so a
    # site restyle will make this lookup return None.
    news_content_div = soup.find('div', style="clear: both; margin-top: 5px;")
    if news_content_div is None:
        raise Exception(f'news content is empty. url: {url}')
    news_article = news_content_div.get_text(strip=True)
    if not news_article:
        raise Exception(f'news content is empty. url: {url}')

    # Combine title and body into a single text block.
    return title + "\n" + news_article
def tcx(url, headers, timeout=30):
    """Fetch a www.tcx.com.tw news item via its JSON API and return its text.

    The page URL is not scraped directly; whatever follows the last ``?`` in
    ``url`` is appended to the ``newsDetailApi`` endpoint and the JSON
    response is parsed instead.

    Args:
        url: Page URL; must start with ``https://www.tcx.com.tw/`` and carry
            the news identifier after a ``?``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The news title followed by the text of every ``p`` element (without
        ``class`` or ``style`` attributes) in the content HTML, one per line.

    Raises:
        ValueError: If ``url`` does not start with the site prefix or has no
            identifier after ``?``. A response body that is not valid JSON
            also surfaces as an error from ``response.json()``.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: If the API response lacks the expected fields or the
            content has no matching paragraphs.
    """
    web_url = "https://www.tcx.com.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Without a '?' the old split() returned the whole URL and produced a
    # meaningless API address; reject such input up front instead.
    _, separator, news_id = url.rpartition("?")
    if not separator or not news_id:
        raise ValueError(f"URL has no news id after '?': {url}")

    api_url = "https://www.tcx.com.tw/learn/front/newsDetailApi/" + news_id
    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(api_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    data = response.json()

    # Pull title and HTML body out of the payload with a descriptive failure.
    try:
        detail = data['detail']
        title = detail['title'].strip()
        content_html = detail['content']
    except (KeyError, TypeError, AttributeError) as err:
        raise Exception(
            f'news detail is missing from API response. url: {url}'
        ) from err

    # Body: only plain paragraphs (no class and no inline style) are kept.
    soup = BeautifulSoup(content_html, 'html.parser')
    news_content = [
        paragraph.get_text(strip=True)
        for paragraph in soup.find_all('p', class_=False, style=False)
    ]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine title and body into a single text block.
    return title + "\n" + "\n".join(news_content)
def get_web_loader(url, web_loaders=None):
    """Return the loader function registered for the site that ``url`` is on.

    Args:
        url: The article URL to dispatch on.
        web_loaders: Optional list of ``{"web": prefix, "web_loader": func}``
            dicts. When ``None``, the built-in table of supported sites is
            used. An explicitly passed empty list is respected as "no
            loaders".

    Returns:
        The ``web_loader`` of the first entry whose ``web`` prefix ``url``
        starts with, or ``None`` when no entry matches.
    """
    if web_loaders is None:
        web_loaders = [
            {"web": "https://e-info.org.tw/", "web_loader": einfo},
            {"web": "https://csrone.com/", "web_loader": csrone},
            {"web": "https://enews.moenv.gov.tw/", "web_loader": enews},
            {"web": "https://esg.gvm.com.tw/", "web_loader": esg_gvm},
            {"web": "https://www.fsc.gov.tw/", "web_loader": fsc},
            {"web": "https://www.moeaea.gov.tw/", "web_loader": moeaea},
            {"web": "https://www.tcx.com.tw/", "web_loader": tcx},
        ]

    # First matching prefix wins; entries are examined in list order.
    return next(
        (
            entry["web_loader"]
            for entry in web_loaders
            if url.startswith(entry["web"])
        ),
        None,
    )
if __name__ == "__main__":
    # Manual smoke test: fetch one known article and print the extracted text.
    url = "https://enews.moenv.gov.tw/Page/3B3C62C78849F32F/871dc06b-4028-42e4-8d36-656e2427180c"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    web_loader = get_web_loader(url)
    # get_web_loader returns None for unsupported sites; fail with a clear
    # message rather than "'NoneType' object is not callable".
    if web_loader is None:
        raise SystemExit(f"no web loader registered for url: {url}")
    text = web_loader(url, headers)
    print(text)