import requests
from bs4 import BeautifulSoup


def einfo(url, headers):
    web_url = "https://e-info.org.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Fetch the page and parse it into a soup.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the news title.
    news_title = soup.find('title')
    title = news_title.get_text(strip=True)

    # Extract the news body.
    news_content = []
    news_content_divs = soup.find_all('div', class_='field-item even')
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')
    for div in news_content_divs:
        for tag in div.find_all(['h1', 'h2', 'h3', 'p']):
            news_content.append(tag.get_text(strip=True))
    if len(news_content) == 0:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all paragraphs into one text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def csrone(url, headers):
    web_url = "https://csrone.com/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Fetch the page and parse it into a soup.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the news title (the first <h2> without a class attribute).
    news_title = soup.find('h2', class_=False)
    title = news_title.get_text(strip=True)

    # Extract the news body.
    news_content = []
    news_content_divs = soup.find_all('div', class_="article_content text-break")
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')
    for div in news_content_divs:
        for tag in div.find_all(['h1', 'h2', 'h3', 'p', 'pre']):
            news_content.append(tag.get_text(strip=True))
    if len(news_content) == 0:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all paragraphs into one text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def enews(url, headers):
    web_url = "https://enews.moenv.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Fetch the page and parse it into a soup.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the news title.
    news_title = soup.find('h2', class_="main-title")
    title = news_title.get_text(strip=True)

    # Extract the news body.
    news_content = []
    news_content_divs = soup.find_all('div', class_="news-info-paragraph")
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')
    for div in news_content_divs:
        for tag in div.find_all("span"):
            news_content.append(tag.get_text(strip=True))
    if len(news_content) == 0:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all paragraphs into one text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text
def esg_gvm(url, headers):
    web_url = "https://esg.gvm.com.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Fetch the page and parse it into a soup.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the news title.
    news_title = soup.find('h1')
    title = news_title.get_text(strip=True)

    # Extract the abstract first, then the news body.
    news_content = []
    abstract_content = soup.find('h2', class_="post_excerpt my-4 text-primary")
    abstract = abstract_content.get_text(strip=True)
    news_content.append(abstract)
    news_content_divs = soup.find_all('div', class_="col-xl-7 col-lg-10 post-content-container")
    if not news_content_divs:
        raise Exception(f'news content is empty. url: {url}')
    for div in news_content_divs:
        for tag in div.find_all(["h2", "h3", "p"], class_=False):
            news_content.append(tag.get_text(strip=True))
    if len(news_content) == 0:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all paragraphs into one text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def fsc(url, headers):
    web_url = "https://www.fsc.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Fetch the page and parse it into a soup.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the news title.
    news_title = soup.find('h3')
    title = news_title.get_text(strip=True)

    # Extract the news body as a single block of text.
    news_content_div = soup.find('div', class_="main-a_03")
    if news_content_div is None:
        raise Exception(f'news content is empty. url: {url}')
    news_article = news_content_div.get_text(strip=True)
    if not news_article:
        raise Exception(f'news content is empty. url: {url}')
    news_content = [news_article]

    # Combine the title and the body into one text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def moeaea(url, headers):
    web_url = "https://www.moeaea.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # Fetch the page and parse it into a soup.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the news title.
    news_title = soup.find('div', class_="divTitle")
    title = news_title.get_text(strip=True)

    # Extract the news body as a single block of text.
    news_content_div = soup.find('div', style="clear: both; margin-top: 5px;")
    if news_content_div is None:
        raise Exception(f'news content is empty. url: {url}')
    news_article = news_content_div.get_text(strip=True)
    if not news_article:
        raise Exception(f'news content is empty. url: {url}')
    news_content = [news_article]

    # Combine the title and the body into one text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def tcx(url, headers):
    web_url = "https://www.tcx.com.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))

    # The article is served by a JSON API keyed by the id in the query string.
    news_id = url.split("?")[-1]
    api_url = "https://www.tcx.com.tw/learn/front/newsDetailApi/" + news_id
    response = requests.get(api_url, headers=headers)
    response.encoding = 'utf-8'
    data = response.json()

    # Extract the news title.
    title = data['detail']['title'].strip()

    # Extract the news body from the HTML fragment in the JSON payload,
    # keeping only <p> tags without class or style attributes.
    news_content = []
    soup = BeautifulSoup(data['detail']['content'], 'html.parser')
    news_paragraphs = soup.find_all('p', class_=False, style=False)
    if not news_paragraphs:
        raise Exception(f'news content is empty. url: {url}')
    for p in news_paragraphs:
        news_content.append(p.get_text(strip=True))
    if len(news_content) == 0:
        raise Exception(f'news content is empty. url: {url}')

    # Combine the title and all paragraphs into one text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text
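
# The site loaders above all repeat the same fetch-and-parse preamble. Below is
# a minimal sketch of a shared helper they could call instead; `fetch_soup` is
# a hypothetical name and is not used by the original loaders.
def fetch_soup(url, headers):
    # Fetch the page, force UTF-8 decoding, and parse it into a soup.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')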
def get_web_loader(url, web_loaders=None):
    if web_loaders is None:
        web_loaders = [
            {"web": "https://e-info.org.tw/", "web_loader": einfo},
            {"web": "https://csrone.com/", "web_loader": csrone},
            {"web": "https://enews.moenv.gov.tw/", "web_loader": enews},
            {"web": "https://esg.gvm.com.tw/", "web_loader": esg_gvm},
            {"web": "https://www.fsc.gov.tw/", "web_loader": fsc},
            {"web": "https://www.moeaea.gov.tw/", "web_loader": moeaea},
            {"web": "https://www.tcx.com.tw/", "web_loader": tcx},
        ]
    # Return the loader whose base URL is a prefix of the given url, or None.
    for web_loader in web_loaders:
        if url.startswith(web_loader["web"]):
            return web_loader["web_loader"]
    return None


if __name__ == "__main__":
    url = "https://enews.moenv.gov.tw/Page/3B3C62C78849F32F/871dc06b-4028-42e4-8d36-656e2427180c"
    web_loader = get_web_loader(url)
    if web_loader is None:
        raise ValueError("No loader registered for url: {}".format(url))
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    text = web_loader(url, headers)
    print(text)
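
# A minimal sketch of registering an additional site through the optional
# `web_loaders` parameter; `example_site` and the example.com URLs are
# hypothetical placeholders, not sites this module actually supports:
#
#     def example_site(url, headers):
#         soup = fetch_soup(url, headers)
#         ...  # extract the title and body with the site's own selectors
#
#     loader = get_web_loader(
#         "https://news.example.com/some-article",
#         web_loaders=[{"web": "https://news.example.com/", "web_loader": example_site}],
#     )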