government_news.py

import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta
import os
import re


def hsinchu_news(date):
    # Base URL for the news list
    base_url = "https://www.hsinchu.gov.tw/News.aspx?n=153&sms=8603"
    # Send a GET request to the base URL
    response = requests.get(base_url)
    response.raise_for_status()  # Check for request errors
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the indices of articles published on the requested date
    today_news = []
    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_2'}) if td.find('span')]
    for i, d in enumerate(publish_date):
        if d == date:
            today_news.append(i)
    if not today_news:
        print('非今日新聞')
        return
    print('今日新聞')
    # Extract the links of today's articles from the news list
    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
    article_links = []
    for i, link in enumerate(hrefs):
        if i in today_news:
            article_links.append("https://www.hsinchu.gov.tw/" + link)
    # Iterate over each article link to extract title and content
    for article_url in article_links:
        print(article_url)
        article_response = requests.get(article_url)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        # Extract the title
        title_element = article_soup.select_one('#CCMS_Content > div > div > div > div:nth-of-type(1) > div > div > div > span')
        title = title_element.get_text(strip=True) if title_element else "Title not found"
        # Extract the content from all <p> tags, excluding those generated by <script>
        content_elements = [p for p in article_soup.find_all('p') if not p.find_parent('script')]
        content = "\n".join(p.get_text(strip=True) for p in content_elements[1:])
        # Extract the news image, with a stock fallback if none is found
        image = article_soup.find('img', class_='news_img')
        src = image.get('src') if image else 'https://images.chinatimes.com/newsphoto/2019-06-01/656/20190601002074.jpg'
        tags = '新竹'
        categories = '新竹縣政府'
        file_name = f"hsinchu_{date}_{article_url[-6:]}"
        create_md(title, src, content, tags, categories, file_name)
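

# The GET-then-parse preamble repeats at the top of every scraper in this file.
# A minimal refactoring sketch, not part of the original script: the helper name
# fetch_soup and the timeout value are illustrative assumptions.
def fetch_soup(url):
    # Fetch a page and return its parsed BeautifulSoup tree
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')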


def taichuang_news(date):
    base_url = "https://www.taichung.gov.tw/9962/Lpsimplelist"
    response = requests.get(base_url)
    response.raise_for_status()  # Check for request errors
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the indices of articles published on the requested date
    today_news = []
    publish_date = [td.find('time').text for td in soup.find_all('td', {'class': 'title'}) if td.find('time')]
    for i, d in enumerate(publish_date):
        if d == date:
            today_news.append(i)
    # Extract the links of today's articles from the news list
    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'title'}) for a in td.find_all('a')]
    article_links = []
    for i, link in enumerate(hrefs):
        if i in today_news:
            article_links.append("https://www.taichung.gov.tw/" + link)
    # Iterate over each article link to extract title and content
    for article_url in article_links:
        print(article_url)
        article_response = requests.get(article_url)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        # Extract the title and collapse whitespace runs
        title = article_soup.find('h2').text
        title = re.sub(r'\s+', ' ', title).strip()
        print(title)
        # Extract the content from the <p> tags inside the article body
        article = article_soup.find('article', {'id': 'cpArticle', 'class': 'cpArticle'})
        paragraphs = article.find_all('p')
        content = "\n".join(p.text.strip() for p in paragraphs)
        print(content)
        # Extract the first thumbnail image, with a stock fallback if none is found
        images_url = [img['src'] for img in article_soup.find_all('img', src=True) if '?width=400' in img['src']]
        if images_url:
            images = f"https://www.taichung.gov.tw{images_url[0]}"
        else:
            images = 'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5c/%E5%8F%B0%E4%B8%AD%E5%B7%9E%E5%BB%B3%EF%BC%88%E8%88%8A%E5%8F%B0%E4%B8%AD%E5%B8%82%E6%94%BF%E5%BA%9C%EF%BC%89.jpg/2560px-%E5%8F%B0%E4%B8%AD%E5%B7%9E%E5%BB%B3%EF%BC%88%E8%88%8A%E5%8F%B0%E4%B8%AD%E5%B8%82%E6%94%BF%E5%BA%9C%EF%BC%89.jpg'
        tags = '台中'
        categories = '台中政府'
        # Use the numeric article id in the URL as the file name
        file_name = re.search(r'/(\d+)/', article_url).group(1)
        create_md(title, images, content, tags, categories, file_name)


def taipei_news(date):
    base_url = "https://www.gov.taipei/News.aspx?n=F0DDAF49B89E9413&sms=72544237BBE4C5F6"
    response = requests.get(base_url)
    response.raise_for_status()  # Check for request errors
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the indices of articles published on the requested date
    today_news = []
    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_2'}) if td.find('span')]
    print(publish_date)
    for i, d in enumerate(publish_date):
        if d == date:
            today_news.append(i)
    # Extract the links of today's articles from the news list
    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
    print(hrefs)
    article_links = []
    for i, link in enumerate(hrefs):
        if i in today_news:
            article_links.append("https://www.gov.taipei/" + link)
    # Iterate over each article link to extract title and content
    for article_url in article_links:
        print(article_url)
        article_response = requests.get(article_url)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        # Extract the title and collapse whitespace runs
        title = article_soup.find('h3').text
        title = re.sub(r'\s+', ' ', title).strip()
        print(title)
        # Extract the content from the <p> tags inside the essay block, skipping the first one
        article = article_soup.find('div', {'class': 'area-essay page-caption-p'})
        paragraphs = article.find_all('p')
        content = "\n".join(p.text.strip() for p in paragraphs[1:])
        print(content)
        # Extract the first slideshow image, with a stock fallback if none is found
        images_element = article_soup.find_all('li', {'data-src': True})
        if images_element:
            images = images_element[0]['data-src']
        else:
            images = 'https://turingcerts.com/wp-content/uploads/2024/01/TaipeiCity_Turing-Certs-2.webp'
        print(images)
        tags = '台北'
        categories = '台北市政府'
        file_name = article_url[-16:]
        create_md(title, images, content, tags, categories, file_name)


def tainan_news(date):
    base_url = "https://www.tainan.gov.tw/News.aspx?n=13370&sms=9748"
    response = requests.get(base_url)
    response.raise_for_status()  # Check for request errors
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the indices of articles published on the requested date
    today_news = []
    publish_date = [td.find('span').text for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_0'}) if td.find('span')]
    print(publish_date)
    for i, d in enumerate(publish_date):
        if d == date:
            today_news.append(i)
    # Extract the links of today's articles from the news list
    hrefs = [a['href'] for td in soup.find_all('td', {'class': 'CCMS_jGridView_td_Class_1'}) for a in td.find_all('a')]
    print(hrefs)
    article_links = []
    for i, link in enumerate(hrefs):
        if i in today_news:
            article_links.append("https://www.tainan.gov.tw/" + link)
    # Iterate over each article link to extract title and content
    for article_url in article_links:
        print(article_url)
        article_response = requests.get(article_url)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        # Extract the title and collapse whitespace runs
        title = article_soup.find('h3').text
        title = re.sub(r'\s+', ' ', title).strip()
        print(title)
        # Extract the content from the <p> tags inside the essay block, skipping the first one
        article = article_soup.find('div', {'class': 'area-essay page-caption-p'})
        paragraphs = article.find_all('p')
        content = "\n".join(p.text.strip() for p in paragraphs[1:])
        print(content)
        # Extract the first slideshow image, with a fallback logo if none is found
        images_element = article_soup.find_all('li', {'data-src': True})
        if images_element:
            images = images_element[0]['data-src']
        else:
            images = 'https://upload.wikimedia.org/wikipedia/commons/4/44/Tainan_City_Government_Logo.svg'
        print(images)
        tags = '台南'
        categories = '台南市政府'
        file_name = f"tainan_{date}_{article_url[-7:]}"
        print('檔案名稱', file_name)
        create_md(title, images, content, tags, categories, file_name)


def create_md(title, images, content, tags, categories, file_name):
    # Generate metadata; timestamps use the UTC+8 (Taiwan) timezone
    output_dir = os.path.dirname(__file__)
    date = datetime.now(timezone(timedelta(hours=8)))
    formatted_date = date.strftime('%Y-%m-%d %H:%M:%S%z')
    # Assemble the markdown front matter and body
    md_content = f"""---
title: "{title}"
tags: ["{tags}"]
categories: ["{categories}"]
image: "{images}"
url: "/news/news_content_{file_name}"
date: {formatted_date}
description: "{title}"
draft: false
display: true
type: "post"
---
{content}
"""
    # Build the final file path and save the markdown file
    filename = os.path.join(output_dir, f"{file_name}.md")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(md_content)
    print(f"Saved: {filename}")
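

# Note that the scrapers expect different date formats: taichuang_news takes a
# Gregorian date ('2025-01-21'), while the other three take ROC-era dates
# ('114-01-22'). A minimal conversion sketch, assuming zero-padded 'YYY-MM-DD'
# input; the helper name roc_to_iso is illustrative and not part of the original.
def roc_to_iso(roc_date):
    # ROC year + 1911 = Gregorian year, e.g. '114-01-22' -> '2025-01-22'
    year, month, day = roc_date.split('-')
    return f"{int(year) + 1911}-{month}-{day}"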


# Example invocations; note the date format each scraper expects
# hsinchu_news('114-01-22')
# taichuang_news('2025-01-21')
# taipei_news('114-01-21')
tainan_news('114-01-23')
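

# A sketch of a daily driver that runs all four scrapers for "today" in the
# UTC+8 timezone, matching the date formats of the example calls above; the
# function name run_all_today is illustrative, not part of the original script.
def run_all_today():
    now = datetime.now(timezone(timedelta(hours=8)))
    iso_date = now.strftime('%Y-%m-%d')  # e.g. '2025-01-23'
    roc_date = f"{now.year - 1911}-{now.strftime('%m-%d')}"  # e.g. '114-01-23'
    hsinchu_news(roc_date)
    taichuang_news(iso_date)
    taipei_news(roc_date)
    tainan_news(roc_date)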