@@ -0,0 +1,278 @@
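+# Scrapers for the Hsinchu, Taichung, Taipei, and Tainan government news pages.
+# Each function filters the listing page for items published on the given date,
+# extracts each matching article's title, body text, and lead image, and passes
+# them to create_md(), which writes a markdown post with front matter into the
+# content/news directory.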
+import requests
+from bs4 import BeautifulSoup
+from datetime import datetime, timezone, timedelta
+import os
+import re
+
+def hsinchu_news(date):
+    # Base URL for the news list
+    base_url = "https://www.hsinchu.gov.tw/News.aspx?n=153&sms=8603"
+
+    # Send a GET request to the base URL
+    response = requests.get(base_url)
+    response.raise_for_status()  # Check for request errors
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(response.text, 'html.parser')
+    # print(soup)
+
+    # Collect the indices of rows published on the requested date
+    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_2'}) if td.find('span')]
+    today_news = [i for i, d in enumerate(publish_date) if d == date]
+
+    if not today_news:
+        print('非今日新聞')
+        return
+    print('今日新聞')
+
+    # Extract all article links from the news list and keep only today's
+    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
+    # print(hrefs)
+    article_links = ["https://www.hsinchu.gov.tw/" + link for i, link in enumerate(hrefs) if i in today_news]
+
+    # Iterate over each article link to extract title and content
+    for article_url in article_links:
+        print(article_url)
+        article_response = requests.get(article_url)
+        article_response.raise_for_status()
+        article_soup = BeautifulSoup(article_response.text, 'html.parser')
+
+        # Extract the title
+        title_element = article_soup.select_one('#CCMS_Content > div > div > div > div:nth-of-type(1) > div > div > div > span')
+        title = title_element.get_text(strip=True) if title_element else "Title not found"
+
+        # Extract the content from all <p> tags, excluding those generated inside <script>
+        content_elements = [p for p in article_soup.find_all('p') if not p.find_parent('script')]
+        content = "\n".join(p.get_text(strip=True) for p in content_elements[1:])
+
+        # Extract the lead image; fall back to a placeholder when the article has none
+        images = article_soup.find('img', class_='news_img')
+        if images:
+            src = images.get('src')
+        else:
+            src = 'https://images.chinatimes.com/newsphoto/2019-06-01/656/20190601002074.jpg'
+
+        tags = '新竹'
+        categories = '新竹縣政府'
+        file_name = f"hsinchu_{date}_{article_url[-6:]}"
+
+        create_md(title, src, content, tags, categories, file_name)
+
+
+def taichuang_news(date):
+    base_url = "https://www.taichung.gov.tw/9962/Lpsimplelist"
+    response = requests.get(base_url)
+    response.raise_for_status()  # Check for request errors
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(response.text, 'html.parser')
+    # print(soup)
+
+    # Collect the indices of rows published on the requested date
+    today_news = []
+    publish_date = [td.find('time').text for td in soup.find_all('td', {'class': 'title'}) if td.find('time')]
+    for i, d in enumerate(publish_date):
+        if d == date:
+            today_news.append(i)
+
+    # Extract all article links from the news list and keep only today's
+    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'title'}) for a in td.find_all('a')]
+    article_links = []
+    for i, link in enumerate(hrefs):
+        if i in today_news:
+            href = "https://www.taichung.gov.tw/" + link
+            article_links.append(href)
+
+    # Iterate over each article link to extract title and content
+    for article_url in article_links:
+        print(article_url)
+        article_response = requests.get(article_url)
+        article_response.raise_for_status()
+        article_soup = BeautifulSoup(article_response.text, 'html.parser')
+
+        # Extract the title
+        title = article_soup.find('h2').text
+        title = re.sub(r'\s+', ' ', title).strip()
+        print(title)
+
+        # Extract the content from the <p> tags inside the article body
+        article = article_soup.find('article', {'id': 'cpArticle', 'class': 'cpArticle'})
+        paragraphs = article.find_all('p')
+        paragraphs_text = [p.text.strip() for p in paragraphs]
+        content = "\n".join(paragraphs_text)
+        print(content)
+
+        # Extract the first resized image; fall back to a placeholder when none is found
+        images_url = [img['src'] for img in article_soup.find_all('img', src=True) if '?width=400' in img['src']]
+        if images_url:
+            images = f"https://www.taichung.gov.tw{images_url[0]}"
+        else:
+            images = 'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5c/%E5%8F%B0%E4%B8%AD%E5%B7%9E%E5%BB%B3%EF%BC%88%E8%88%8A%E5%8F%B0%E4%B8%AD%E5%B8%82%E6%94%BF%E5%BA%9C%EF%BC%89.jpg/2560px-%E5%8F%B0%E4%B8%AD%E5%B7%9E%E5%BB%B3%EF%BC%88%E8%88%8A%E5%8F%B0%E4%B8%AD%E5%B8%82%E6%94%BF%E5%BA%9C%EF%BC%89.jpg'
+
+        tags = '台中'
+        categories = '台中政府'
+        file_name = re.search(r'/(\d+)/', article_url).group(1)
+
+        create_md(title, images, content, tags, categories, file_name)
+
+
+def taipei_news(date):
+    base_url = "https://www.gov.taipei/News.aspx?n=F0DDAF49B89E9413&sms=72544237BBE4C5F6"
+    response = requests.get(base_url)
+    response.raise_for_status()  # Check for request errors
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(response.text, 'html.parser')
+    # print(soup)
+
+    # Collect the indices of rows published on the requested date
+    today_news = []
+    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_2'}) if td.find('span')]
+    print(publish_date)
+    for i, d in enumerate(publish_date):
+        if d == date:
+            today_news.append(i)
+
+    # Extract all article links from the news list and keep only today's
+    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
+    print(hrefs)
+    article_links = []
+    for i, link in enumerate(hrefs):
+        if i in today_news:
+            href = "https://www.gov.taipei/" + link
+            article_links.append(href)
+
+    # Iterate over each article link to extract title and content
+    for article_url in article_links:
+        print(article_url)
+        article_response = requests.get(article_url)
+        article_response.raise_for_status()
+        article_soup = BeautifulSoup(article_response.text, 'html.parser')
+
+        # Extract the title
+        title = article_soup.find('h3').text
+        title = re.sub(r'\s+', ' ', title).strip()
+        print(title)
+
+        # Extract the content from the <p> tags inside the essay area, skipping the first paragraph
+        article = article_soup.find('div', {'class': 'area-essay page-caption-p'})
+        paragraphs = article.find_all('p')
+        paragraphs_text = [p.text.strip() for p in paragraphs][1:]
+        content = "\n".join(paragraphs_text)
+        print(content)
+
+        # Extract the first gallery image; fall back to a placeholder when none is present
+        images_element = article_soup.find_all('li', {'data-src': True})
+        if images_element:
+            images_url = [img['data-src'] for img in images_element]
+            images = images_url[0]
+        else:
+            images = 'https://turingcerts.com/wp-content/uploads/2024/01/TaipeiCity_Turing-Certs-2.webp'
+        print(images)
+
+        tags = '台北'
+        categories = '台北市政府'
+        file_name = article_url[-16:]
+
+        create_md(title, images, content, tags, categories, file_name)
+
+
+def tainan_news(date):
+    base_url = "https://www.tainan.gov.tw/News.aspx?n=13370&sms=9748"
+    response = requests.get(base_url)
+    response.raise_for_status()  # Check for request errors
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(response.text, 'html.parser')
+    # print(soup)
+
+    # Collect the indices of rows published on the requested date
+    today_news = []
+    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_0'}) if td.find('span')]
+    print(publish_date)
+    for i, d in enumerate(publish_date):
+        if d == date:
+            today_news.append(i)
+
+    # Extract all article links from the news list and keep only today's
+    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
+    print(hrefs)
+    article_links = []
+    for i, link in enumerate(hrefs):
+        if i in today_news:
+            href = "https://www.tainan.gov.tw/" + link
+            article_links.append(href)
+
+    # Iterate over each article link to extract title and content
+    for article_url in article_links:
+        print(article_url)
+        article_response = requests.get(article_url)
+        article_response.raise_for_status()
+        article_soup = BeautifulSoup(article_response.text, 'html.parser')
+
+        # Extract the title
+        title = article_soup.find('h3').text
+        title = re.sub(r'\s+', ' ', title).strip()
+        print(title)
+
+        # Extract the content from the <p> tags inside the essay area, skipping the first paragraph
+        article = article_soup.find('div', {'class': 'area-essay page-caption-p'})
+        paragraphs = article.find_all('p')
+        paragraphs_text = [p.text.strip() for p in paragraphs][1:]
+        content = "\n".join(paragraphs_text)
+        print(content)
+
+        # Extract the first gallery image; fall back to a placeholder when none is present
+        images_element = article_soup.find_all('li', {'data-src': True})
+        if images_element:
+            images_url = [img['data-src'] for img in images_element]
+            images = images_url[0]
+        else:
+            images = 'https://upload.wikimedia.org/wikipedia/commons/4/44/Tainan_City_Government_Logo.svg'
+        print(images)
+
+        tags = '台南'
+        categories = '台南市政府'
+        file_name = f"tainan_{date}_{article_url[-7:]}"
+        print('檔案名稱', file_name)
+
+        create_md(title, images, content, tags, categories, file_name)
+
+
+def create_md(title, images, content, tags, categories, file_name):
+    # Generate metadata for the front matter
+    output_dir = 'C:/Users/s1301/PycharmProjects/news_aimedium_org/content/news'
+    date = datetime.now(timezone(timedelta(hours=8)))  # Taipei time (UTC+8)
+    formatted_date = date.strftime('%Y-%m-%d %H:%M:%S%z')
+
+    # Build the markdown document: front matter followed by the article body
+    md_content = f"""---
+title: "{title}"
+tags: ["{tags}"]
+categories: ["{categories}"]
+image: "{images}"
+url: "/news/news_content_{file_name}"
+date: {formatted_date}
+description: "{title}"
+draft: false
+display: true
+type: "post"
+---
+
+{content}
+"""
+
+    # Build the final file path and save the markdown file
+    filename = os.path.join(output_dir, f"{file_name}.md")
+    with open(filename, "w", encoding="utf-8") as file:
+        file.write(md_content)
+
+    print(f"Saved: {filename}")
+
+
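+# Note: the date argument must match the format shown on each site's listing page.
+# Judging from the example calls below, Hsinchu, Taipei, and Tainan display
+# ROC-calendar dates (year 114 corresponds to 2025), while Taichung displays
+# Gregorian dates.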
+# hsinchu_news('114-01-22')
+# taichuang_news('2025-01-21')
+# taipei_news('114-01-21')
+tainan_news('114-01-22')
+
+