|
@@ -0,0 +1,273 @@
|
|
|
|
+import requests
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
+import re
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def einfo(url, headers, timeout=30):
    """Fetch an e-info.org.tw article and return its title and body text.

    Args:
        url: Article URL; must start with ``https://e-info.org.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the page ``<title>`` followed by the text of every
        ``h1``/``h2``/``h3``/``p`` tag found inside the
        ``div.field-item.even`` containers, joined with newlines.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: On network failure or an HTTP error status.
        Exception: If the title or the article body cannot be found.
    """
    web_url = "https://e-info.org.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Without a timeout, requests may block forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when the tag is missing; fail with a clear message
    # instead of an AttributeError on None.
    news_title = soup.find('title')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    news_content = [
        tag.get_text(strip=True)
        for div in soup.find_all('div', class_='field-item even')
        for tag in div.find_all(['h1', 'h2', 'h3', 'p'])
    ]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all body fragments into one text.
    return title + "\n" + "\n".join(news_content)
|
|
|
|
+
|
|
|
|
def csrone(url, headers, timeout=30):
    """Fetch a csrone.com article and return its title and body text.

    Args:
        url: Article URL; must start with ``https://csrone.com/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first class-less ``<h2>`` followed by the text of
        every ``h1``/``h2``/``h3``/``p``/``pre`` tag found inside the
        ``div.article_content.text-break`` containers, joined with newlines.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: On network failure or an HTTP error status.
        Exception: If the title or the article body cannot be found.
    """
    web_url = "https://csrone.com/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Without a timeout, requests may block forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # class_=False selects the first <h2> that has no class attribute.
    news_title = soup.find('h2', class_=False)
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    news_content = [
        tag.get_text(strip=True)
        for div in soup.find_all('div', class_="article_content text-break")
        for tag in div.find_all(['h1', 'h2', 'h3', 'p', 'pre'])
    ]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all body fragments into one text.
    return title + "\n" + "\n".join(news_content)
|
|
|
|
+
|
|
|
|
def enews(url, headers, timeout=30):
    """Fetch an enews.moenv.gov.tw article and return its title and body text.

    Args:
        url: Article URL; must start with ``https://enews.moenv.gov.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of ``h2.main-title`` followed by the text of every
        ``<span>`` found inside the ``div.news-info-paragraph`` containers,
        joined with newlines.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: On network failure or an HTTP error status.
        Exception: If the title or the article body cannot be found.
    """
    web_url = "https://enews.moenv.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Without a timeout, requests may block forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when the tag is missing; fail with a clear message
    # instead of an AttributeError on None.
    news_title = soup.find('h2', class_="main-title")
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    news_content = [
        tag.get_text(strip=True)
        for div in soup.find_all('div', class_="news-info-paragraph")
        for tag in div.find_all("span")
    ]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all body fragments into one text.
    return title + "\n" + "\n".join(news_content)
|
|
|
|
+
|
|
|
|
def esg_gvm(url, headers, timeout=30):
    """Fetch an esg.gvm.com.tw article and return its title and body text.

    Args:
        url: Article URL; must start with ``https://esg.gvm.com.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``<h1>``, then the abstract
        (``h2.post_excerpt``) when present, then the text of every class-less
        ``h2``/``h3``/``p`` inside the post-content containers, joined with
        newlines.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: On network failure or an HTTP error status.
        Exception: If the title or the article body cannot be found.
    """
    web_url = "https://esg.gvm.com.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Without a timeout, requests may block forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    news_title = soup.find('h1')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    news_content = []

    # The abstract is optional: skip it when the page has none rather than
    # crashing on a None lookup result.
    abstract_content = soup.find('h2', class_="post_excerpt my-4 text-primary")
    if abstract_content is not None:
        news_content.append(abstract_content.get_text(strip=True))

    news_content_divs = soup.find_all(
        'div', class_="col-xl-7 col-lg-10 post-content-container"
    )
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')

    # class_=False keeps only tags that carry no class attribute.
    news_content.extend(
        tag.get_text(strip=True)
        for div in news_content_divs
        for tag in div.find_all(["h2", "h3", "p"], class_=False)
    )
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all body fragments into one text.
    return title + "\n" + "\n".join(news_content)
|
|
|
|
+
|
|
|
|
def fsc(url, headers, timeout=30):
    """Fetch a www.fsc.gov.tw news page and return its title and body text.

    Args:
        url: Page URL; must start with ``https://www.fsc.gov.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``<h3>`` followed, on a new line, by the text
        of the first ``div.main-a_03``.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: On network failure or an HTTP error status.
        Exception: If the title or the article body is missing or empty.
    """
    web_url = "https://www.fsc.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Without a timeout, requests may block forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    news_title = soup.find('h3')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # A missing container and an empty container are both treated as
    # "no content"; checking the extracted text makes the guard effective.
    news_content_div = soup.find('div', class_="main-a_03")
    news_article = (
        news_content_div.get_text(strip=True)
        if news_content_div is not None
        else ""
    )
    if not news_article:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and the body into one text.
    return title + "\n" + news_article
|
|
|
|
+
|
|
|
|
def moeaea(url, headers, timeout=30):
    """Fetch a www.moeaea.gov.tw news page and return its title and body text.

    Args:
        url: Page URL; must start with ``https://www.moeaea.gov.tw/``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of ``div.divTitle`` followed, on a new line, by the text of
        the first ``<div>`` whose inline style is exactly
        ``clear: both; margin-top: 5px;``.

    Raises:
        ValueError: If ``url`` does not start with the site prefix.
        requests.RequestException: On network failure or an HTTP error status.
        Exception: If the title or the article body is missing or empty.
    """
    web_url = "https://www.moeaea.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Without a timeout, requests may block forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    news_title = soup.find('div', class_="divTitle")
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # The body container is located by its inline style attribute.
    # A missing container and an empty container are both treated as
    # "no content"; checking the extracted text makes the guard effective.
    news_content_div = soup.find('div', style="clear: both; margin-top: 5px;")
    news_article = (
        news_content_div.get_text(strip=True)
        if news_content_div is not None
        else ""
    )
    if not news_article:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and the body into one text.
    return title + "\n" + news_article
|
|
|
|
+
|
|
|
|
def tcx(url, headers, timeout=30):
    """Fetch a www.tcx.com.tw news item through its JSON API and return text.

    The article id is taken from whatever follows the last ``?`` in ``url``
    and appended to the ``newsDetailApi`` endpoint.

    Args:
        url: Page URL; must start with ``https://www.tcx.com.tw/`` and end
            with ``?<news id>``.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped ``detail.title`` from the API response followed by the
        text of every ``<p>`` without ``class`` and ``style`` attributes in
        ``detail.content``, joined with newlines.

    Raises:
        ValueError: If ``url`` is not a tcx.com.tw URL or carries no news id.
        requests.RequestException: On network failure or an HTTP error status.
        Exception: If the API response lacks the expected fields or the
            article body is empty.
    """
    web_url = "https://www.tcx.com.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # rpartition yields the same suffix as split("?")[-1], and also reveals
    # whether a "?" was present at all, so a URL without an id is rejected
    # instead of being pasted whole into the API path.
    _, separator, news_id = url.rpartition("?")
    if not separator or not news_id:
        raise ValueError("URL must end with '?<news id>': {}".format(url))

    api_url = "https://www.tcx.com.tw/learn/front/newsDetailApi/" + news_id
    # Without a timeout, requests may block forever on a stalled server.
    response = requests.get(api_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    data = response.json()

    try:
        detail = data['detail']
        title = detail['title'].strip()
        content_html = detail['content']
    except (KeyError, TypeError, AttributeError) as err:
        raise Exception(f'unexpected API response. url: {url}') from err

    # A null content field is parsed as an empty document so it is reported
    # through the "content is empty" error below.
    soup = BeautifulSoup(content_html or "", 'html.parser')
    news_content = [
        paragraph.get_text(strip=True)
        for paragraph in soup.find_all('p', class_=False, style=False)
    ]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all body fragments into one text.
    return title + "\n" + "\n".join(news_content)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_web_loader(url, web_loaders=None):
    """Return the loader function registered for the site that ``url`` is on.

    Args:
        url: URL of the page to load.
        web_loaders: Optional list of ``{"web": prefix, "web_loader": func}``
            dicts. When ``None``, the built-in registry of this module's
            loaders is used.

    Returns:
        The ``"web_loader"`` value of the first entry whose ``"web"`` prefix
        ``url`` starts with, or ``None`` when no entry matches.
    """
    registry = web_loaders
    if registry is None:
        registry = [
            {"web": "https://e-info.org.tw/", "web_loader": einfo},
            {"web": "https://csrone.com/", "web_loader": csrone},
            {"web": "https://enews.moenv.gov.tw/", "web_loader": enews},
            {"web": "https://esg.gvm.com.tw/", "web_loader": esg_gvm},
            {"web": "https://www.fsc.gov.tw/", "web_loader": fsc},
            {"web": "https://www.moeaea.gov.tw/", "web_loader": moeaea},
            {"web": "https://www.tcx.com.tw/", "web_loader": tcx},
        ]

    # Entries are scanned in order; the first matching prefix wins.
    matches = (
        entry["web_loader"]
        for entry in registry
        if url.startswith(entry["web"])
    )
    return next(matches, None)
|
|
|
|
+
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: resolve the loader for one sample article and print
    # the extracted text. This performs a real HTTP request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    url = "https://enews.moenv.gov.tw/Page/3B3C62C78849F32F/871dc06b-4028-42e4-8d36-656e2427180c"

    web_loader = get_web_loader(url)
    text = web_loader(url, headers)
    print(text)
|