import requests
from bs4 import BeautifulSoup
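# Site-specific scrapers for a handful of Taiwanese ESG / environmental news
# sites. Each loader downloads one article page and returns the title followed
# by the body text as a single plain-text string.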

def einfo(url, headers):
    """Scrape a news article from e-info.org.tw."""
    web_url = "https://e-info.org.tw/"
    if not url.startswith(web_url):
        raise ValueError(f"URL must start with {web_url}")

    # fetch the article page and parse it
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the news title
    news_title = soup.find('title')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # collect headings and paragraphs from the article body
    news_content = []
    news_content_divs = soup.find_all('div', class_='field-item even')
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')
    for div in news_content_divs:
        for tag in div.find_all(['h1', 'h2', 'h3', 'p']):
            news_content.append(tag.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # combine title and body into one text block
    news_text = title + "\n" + "\n".join(news_content)
    return news_text

def csrone(url, headers):
    """Scrape a news article from csrone.com."""
    web_url = "https://csrone.com/"
    if not url.startswith(web_url):
        raise ValueError(f"URL must start with {web_url}")

    # fetch the article page and parse it
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the news title (the first unstyled <h2>)
    news_title = soup.find('h2', class_=False)
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # collect headings, paragraphs and preformatted text from the article body
    news_content = []
    news_content_divs = soup.find_all('div', class_="article_content text-break")
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')
    for div in news_content_divs:
        for tag in div.find_all(['h1', 'h2', 'h3', 'p', 'pre']):
            news_content.append(tag.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # combine title and body into one text block
    news_text = title + "\n" + "\n".join(news_content)
    return news_text

def enews(url, headers):
    """Scrape a news item from enews.moenv.gov.tw."""
    web_url = "https://enews.moenv.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError(f"URL must start with {web_url}")

    # fetch the article page and parse it
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the news title
    news_title = soup.find('h2', class_="main-title")
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # the body text is split across <span> tags inside paragraph divs
    news_content = []
    news_content_divs = soup.find_all('div', class_="news-info-paragraph")
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')
    for div in news_content_divs:
        for tag in div.find_all("span"):
            news_content.append(tag.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # combine title and body into one text block
    news_text = title + "\n" + "\n".join(news_content)
    return news_text

def esg_gvm(url, headers):
    """Scrape an ESG article from esg.gvm.com.tw."""
    web_url = "https://esg.gvm.com.tw/"
    if not url.startswith(web_url):
        raise ValueError(f"URL must start with {web_url}")

    # fetch the article page and parse it
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the news title
    news_title = soup.find('h1')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # the abstract sits in its own <h2> above the body; keep it when present
    news_content = []
    abstract_content = soup.find('h2', class_="post_excerpt my-4 text-primary")
    if abstract_content is not None:
        news_content.append(abstract_content.get_text(strip=True))

    # collect unstyled headings and paragraphs from the article container
    news_content_divs = soup.find_all('div', class_="col-xl-7 col-lg-10 post-content-container")
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')
    for div in news_content_divs:
        for tag in div.find_all(["h2", "h3", "p"], class_=False):
            news_content.append(tag.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # combine title and body into one text block
    news_text = title + "\n" + "\n".join(news_content)
    return news_text

def fsc(url, headers):
    """Scrape a press release from www.fsc.gov.tw."""
    web_url = "https://www.fsc.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError(f"URL must start with {web_url}")

    # fetch the article page and parse it
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the news title
    news_title = soup.find('h3')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # the whole article body lives in a single container div
    news_content_div = soup.find('div', class_="main-a_03")
    if news_content_div is None:
        raise Exception(f'news content is empty. url: {url}')
    news_article = news_content_div.get_text(strip=True)
    if not news_article:
        raise Exception(f'news content is empty. url: {url}')

    # combine title and body into one text block
    news_text = title + "\n" + news_article
    return news_text

def moeaea(url, headers):
    """Scrape a news item from www.moeaea.gov.tw."""
    web_url = "https://www.moeaea.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError(f"URL must start with {web_url}")

    # fetch the article page and parse it
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the news title
    news_title = soup.find('div', class_="divTitle")
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)

    # the article body lives in a single inline-styled div
    news_content_div = soup.find('div', style="clear: both; margin-top: 5px;")
    if news_content_div is None:
        raise Exception(f'news content is empty. url: {url}')
    news_article = news_content_div.get_text(strip=True)
    if not news_article:
        raise Exception(f'news content is empty. url: {url}')

    # combine title and body into one text block
    news_text = title + "\n" + news_article
    return news_text

def tcx(url, headers):
    """Scrape a news article from www.tcx.com.tw via its JSON detail API."""
    web_url = "https://www.tcx.com.tw/"
    if not url.startswith(web_url):
        raise ValueError(f"URL must start with {web_url}")

    # the article id is the query string; the detail API returns JSON
    news_id = url.split("?")[-1]
    api_url = "https://www.tcx.com.tw/learn/front/newsDetailApi/" + news_id
    response = requests.get(api_url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    data = response.json()

    # get the news title
    title = data['detail']['title'].strip()

    # the content field is an HTML fragment; keep only plain <p> tags
    news_content = []
    soup = BeautifulSoup(data['detail']['content'], 'html.parser')
    news_paragraphs = soup.find_all('p', class_=False, style=False)
    if not news_paragraphs:
        raise Exception(f'news content is empty. url: {url}')
    for p in news_paragraphs:
        news_content.append(p.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')

    # combine title and body into one text block
    news_text = title + "\n" + "\n".join(news_content)
    return news_text
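
# The seven loaders above share one fetch-and-extract skeleton. A generic
# helper along the lines of this sketch could absorb most of the duplication;
# the CSS-selector parameters are illustrative assumptions, not selectors
# taken from the original code.
def generic_loader(url, headers, title_selector, content_selector,
                   tags=("h1", "h2", "h3", "p")):
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # locate the title via a CSS selector instead of per-site find() calls
    title_tag = soup.select_one(title_selector)
    if title_tag is None:
        raise Exception(f'news title not found. url: {url}')
    # pull the requested tags out of every matching content container
    news_content = [tag.get_text(strip=True)
                    for container in soup.select(content_selector)
                    for tag in container.find_all(list(tags))]
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')
    return title_tag.get_text(strip=True) + "\n" + "\n".join(news_content)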

def get_web_loader(url, web_loaders=None):
    """Return the loader function whose site prefix matches the URL, or None."""
    if web_loaders is None:
        web_loaders = [
            {"web": "https://e-info.org.tw/", "web_loader": einfo},
            {"web": "https://csrone.com/", "web_loader": csrone},
            {"web": "https://enews.moenv.gov.tw/", "web_loader": enews},
            {"web": "https://esg.gvm.com.tw/", "web_loader": esg_gvm},
            {"web": "https://www.fsc.gov.tw/", "web_loader": fsc},
            {"web": "https://www.moeaea.gov.tw/", "web_loader": moeaea},
            {"web": "https://www.tcx.com.tw/", "web_loader": tcx},
        ]
    for web_loader in web_loaders:
        if url.startswith(web_loader["web"]):
            return web_loader["web_loader"]
    return None
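
# get_web_loader also accepts a custom registry, so a new site can be wired in
# without touching the defaults. The loader and URL below are hypothetical:
#
#   custom = [{"web": "https://example.com/", "web_loader": my_loader}]
#   loader = get_web_loader("https://example.com/news/1", web_loaders=custom)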

if __name__ == "__main__":
    url = "https://enews.moenv.gov.tw/Page/3B3C62C78849F32F/871dc06b-4028-42e4-8d36-656e2427180c"
    web_loader = get_web_loader(url)
    if web_loader is None:
        raise ValueError(f"no loader registered for url: {url}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    text = web_loader(url, headers)
    print(text)