import requests from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta import os import re def hsinchu_news(date): # Base URL for the news list base_url = "https://www.hsinchu.gov.tw/News.aspx?n=153&sms=8603" # Send a GET request to the base URL response = requests.get(base_url) response.raise_for_status() # Check for request errors # Parse the HTML content using BeautifulSoup soup = BeautifulSoup(response.text, 'html.parser') # print(soup) # Confirm that there is news on the date publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_2'}) if td.find('span')] for d in publish_date: if d == date: print('今日新聞') # Extract all article links from the news list hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')] # print(hrefs) article_links = [] for link in hrefs: href = "https://www.hsinchu.gov.tw/" + link article_links.append(href) # Iterate over each article link to extract title and content for article_url in article_links: print(article_url) article_response = requests.get(article_url) article_response.raise_for_status() article_soup = BeautifulSoup(article_response.text, 'html.parser') # Extract the title title_element = article_soup.select_one('#CCMS_Content > div > div > div > div:nth-of-type(1) > div > div > div > span') title = title_element.get_text(strip=True) if title_element else "Title not found" # Extract the content from all
tags, excluding those generated by