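# Scrapes the latest press releases from four Taiwanese local-government news
# sites (Hsinchu County, Taichung, Taipei, and Tainan), keeps only the articles
# published on the given date, and saves each one as a markdown post with YAML
# front matter via create_md().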
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta
import os
import re

def hsinchu_news(date):
    # Base URL for the news list
    base_url = "https://www.hsinchu.gov.tw/News.aspx?n=153&sms=8603"
    response = requests.get(base_url)
    response.raise_for_status()  # Check for request errors
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the indices of the list entries published on the requested date
    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_2'}) if td.find('span')]
    today_news = [i for i, d in enumerate(publish_date) if d == date]
    if not today_news:
        print('No news published today')
        return
    # Extract only the article links whose list position matches a today's-news index
    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
    article_links = ["https://www.hsinchu.gov.tw/" + link for i, link in enumerate(hrefs) if i in today_news]
    # Visit each article to extract its title, content, and lead image
    for article_url in article_links:
        print(article_url)
        article_response = requests.get(article_url)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        # Extract the title
        title_element = article_soup.select_one('#CCMS_Content > div > div > div > div:nth-of-type(1) > div > div > div > span')
        title = title_element.get_text(strip=True) if title_element else "Title not found"
        # Extract the content from all <p> tags, excluding those generated by <script>;
        # drop the first paragraph, which is not part of the article body
        content_elements = [p for p in article_soup.find_all('p') if not p.find_parent('script')]
        content = "\n".join(p.get_text(strip=True) for p in content_elements[1:])
        # Extract the lead image, falling back to a stock photo when none exists
        image_element = article_soup.find('img', class_='news_img')
        src = image_element.get('src') if image_element else 'https://images.chinatimes.com/newsphoto/2019-06-01/656/20190601002074.jpg'
        tags = '新竹'
        categories = '新竹縣政府'
        file_name = f"hsinchu_{date}_{article_url[-6:]}"
        create_md(title, src, content, tags, categories, file_name)

def taichung_news(date):
    base_url = "https://www.taichung.gov.tw/9962/Lpsimplelist"
    response = requests.get(base_url)
    response.raise_for_status()  # Check for request errors
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the indices of the list entries published on the requested date
    publish_date = [td.find('time').text for td in soup.find_all('td', {'class': 'title'}) if td.find('time')]
    today_news = [i for i, d in enumerate(publish_date) if d == date]
    # Extract only the article links whose list position matches a today's-news index
    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'title'}) for a in td.find_all('a')]
    article_links = ["https://www.taichung.gov.tw/" + link for i, link in enumerate(hrefs) if i in today_news]
    # Visit each article to extract its title, content, and lead image
    for article_url in article_links:
        print(article_url)
        article_response = requests.get(article_url)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        # Extract the title and collapse runs of whitespace
        title = re.sub(r'\s+', ' ', article_soup.find('h2').text).strip()
        print(title)
        # Extract the content from the <p> tags inside the article container
        article = article_soup.find('article', {'id': 'cpArticle', 'class': 'cpArticle'})
        content = "\n".join(p.text.strip() for p in article.find_all('p'))
        print(content)
        # Extract the first thumbnail image, falling back to a stock photo
        images_url = [img['src'] for img in article_soup.find_all('img', src=True) if '?width=400' in img['src']]
        if images_url:
            images = f"https://www.taichung.gov.tw{images_url[0]}"
        else:
            images = 'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5c/%E5%8F%B0%E4%B8%AD%E5%B7%9E%E5%BB%B3%EF%BC%88%E8%88%8A%E5%8F%B0%E4%B8%AD%E5%B8%82%E6%94%BF%E5%BA%9C%EF%BC%89.jpg/2560px-%E5%8F%B0%E4%B8%AD%E5%B7%9E%E5%BB%B3%EF%BC%88%E8%88%8A%E5%8F%B0%E4%B8%AD%E5%B8%82%E6%94%BF%E5%BA%9C%EF%BC%89.jpg'
        tags = '台中'
        categories = '台中政府'
        # Use the numeric article id from the URL as the file name
        file_name = re.search(r'/(\d+)/', article_url).group(1)
        create_md(title, images, content, tags, categories, file_name)

def taipei_news(date):
    base_url = "https://www.gov.taipei/News.aspx?n=F0DDAF49B89E9413&sms=72544237BBE4C5F6"
    response = requests.get(base_url)
    response.raise_for_status()  # Check for request errors
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the indices of the list entries published on the requested date
    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_2'}) if td.find('span')]
    print(publish_date)
    today_news = [i for i, d in enumerate(publish_date) if d == date]
    # Extract only the article links whose list position matches a today's-news index
    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
    print(hrefs)
    article_links = ["https://www.gov.taipei/" + link for i, link in enumerate(hrefs) if i in today_news]
    # Visit each article to extract its title, content, and lead image
    for article_url in article_links:
        print(article_url)
        article_response = requests.get(article_url)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        # Extract the title and collapse runs of whitespace
        title = re.sub(r'\s+', ' ', article_soup.find('h3').text).strip()
        print(title)
        # Extract the content from the <p> tags inside the essay container,
        # dropping the first paragraph, which is not part of the article body
        article = article_soup.find('div', {'class': 'area-essay page-caption-p'})
        paragraphs_text = [p.text.strip() for p in article.find_all('p')][1:]
        content = "\n".join(paragraphs_text)
        print(content)
        # Extract the first gallery image, falling back to a stock photo
        images_element = article_soup.find_all('li', {'data-src': True})
        if images_element:
            images = images_element[0]['data-src']
        else:
            images = 'https://turingcerts.com/wp-content/uploads/2024/01/TaipeiCity_Turing-Certs-2.webp'
        print(images)
        tags = '台北'
        categories = '台北市政府'
        file_name = article_url[-16:]
        create_md(title, images, content, tags, categories, file_name)

def tainan_news(date):
    base_url = "https://www.tainan.gov.tw/News.aspx?n=13370&sms=9748"
    response = requests.get(base_url)
    response.raise_for_status()  # Check for request errors
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the indices of the list entries published on the requested date
    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_0'}) if td.find('span')]
    print(publish_date)
    today_news = [i for i, d in enumerate(publish_date) if d == date]
    # Extract only the article links whose list position matches a today's-news index
    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
    print(hrefs)
    article_links = ["https://www.tainan.gov.tw/" + link for i, link in enumerate(hrefs) if i in today_news]
    # Visit each article to extract its title, content, and lead image
    for article_url in article_links:
        print(article_url)
        article_response = requests.get(article_url)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        # Extract the title and collapse runs of whitespace
        title = re.sub(r'\s+', ' ', article_soup.find('h3').text).strip()
        print(title)
        # Extract the content from the <p> tags inside the essay container,
        # dropping the first paragraph, which is not part of the article body
        article = article_soup.find('div', {'class': 'area-essay page-caption-p'})
        paragraphs_text = [p.text.strip() for p in article.find_all('p')][1:]
        content = "\n".join(paragraphs_text)
        print(content)
        # Extract the first gallery image, falling back to the city government logo
        images_element = article_soup.find_all('li', {'data-src': True})
        if images_element:
            images = images_element[0]['data-src']
        else:
            images = 'https://upload.wikimedia.org/wikipedia/commons/4/44/Tainan_City_Government_Logo.svg'
        print(images)
        tags = '台南'
        categories = '台南市政府'
        file_name = f"tainan_{date}_{article_url[-7:]}"
        print('File name:', file_name)
        create_md(title, images, content, tags, categories, file_name)

def create_md(title, images, content, tags, categories, file_name):
    # Write the post next to this script, stamped with Taipei time (UTC+8)
    output_dir = os.path.dirname(__file__)
    date = datetime.now(timezone(timedelta(hours=8)))
    formatted_date = date.strftime('%Y-%m-%d %H:%M:%S%z')
    # Assemble the markdown file with YAML front matter
    md_content = f"""---
title: "{title}"
tags: ["{tags}"]
categories: ["{categories}"]
image: "{images}"
url: "/news/news_content_{file_name}"
date: {formatted_date}
description: "{title}"
draft: false
display: true
type: "post"
---
{content}
"""
    # Build the final file path and save the markdown file
    filename = os.path.join(output_dir, f"{file_name}.md")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(md_content)
    print(f"Saved: {filename}")

# hsinchu_news('114-01-22')
# taichung_news('2025-01-21')
# taipei_news('114-01-21')
tainan_news('114-01-23')
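
# A minimal sketch of deriving today's date strings instead of hardcoding them,
# assuming the sites keep their current formats: Hsinchu, Taipei, and Tainan
# list dates in the ROC (Minguo) calendar as 'YYY-MM-DD' (ROC year = Western
# year - 1911), while Taichung uses the Western 'YYYY-MM-DD' format.
# today = datetime.now(timezone(timedelta(hours=8)))
# iso_date = today.strftime('%Y-%m-%d')              # e.g. '2025-01-23'
# roc_date = f"{today.year - 1911}-{today:%m-%d}"    # e.g. '114-01-23'
# taichung_news(iso_date)
# tainan_news(roc_date)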