# website_loader.py

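"""Loaders that scrape full-text news articles from several Taiwanese ESG and
environmental news sites.

Each loader takes an article URL plus request headers and returns the title
and body as one newline-joined string; get_web_loader() selects the loader
whose site prefix matches a given URL.
"""
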
import requests
from bs4 import BeautifulSoup


def einfo(url, headers):
    web_url = "https://e-info.org.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))
    # Fetch and parse the article page.
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the news title.
    news_title = soup.find('title')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)
    # Extract the news body text.
    news_content = []
    news_content_divs = soup.find_all('div', class_='field-item even')
    for div in news_content_divs:
        for tag in div.find_all(['h1', 'h2', 'h3', 'p']):
            news_content.append(tag.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')
    # Combine title and body into a single text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def csrone(url, headers):
    web_url = "https://csrone.com/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))
    # Fetch and parse the article page.
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # The title is the first <h2> without a class attribute.
    news_title = soup.find('h2', class_=False)
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)
    # Extract the news body text.
    news_content = []
    news_content_divs = soup.find_all('div', class_="article_content text-break")
    for div in news_content_divs:
        for tag in div.find_all(['h1', 'h2', 'h3', 'p', 'pre']):
            news_content.append(tag.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')
    # Combine title and body into a single text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def enews(url, headers):
    web_url = "https://enews.moenv.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))
    # Fetch and parse the article page.
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the news title.
    news_title = soup.find('h2', class_="main-title")
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)
    # The body text sits in <span> tags inside the paragraph containers.
    news_content = []
    news_content_divs = soup.find_all('div', class_="news-info-paragraph")
    for div in news_content_divs:
        for tag in div.find_all("span"):
            news_content.append(tag.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')
    # Combine title and body into a single text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def esg_gvm(url, headers):
    web_url = "https://esg.gvm.com.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))
    # Fetch and parse the article page.
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the news title.
    news_title = soup.find('h1')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)
    # Prepend the abstract if present, then collect the body text.
    news_content = []
    abstract_content = soup.find('h2', class_="post_excerpt my-4 text-primary")
    if abstract_content is not None:
        news_content.append(abstract_content.get_text(strip=True))
    news_content_divs = soup.find_all('div', class_="col-xl-7 col-lg-10 post-content-container")
    for div in news_content_divs:
        for tag in div.find_all(["h2", "h3", "p"], class_=False):
            news_content.append(tag.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')
    # Combine title and body into a single text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text


def fsc(url, headers):
    web_url = "https://www.fsc.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))
    # Fetch and parse the article page.
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the news title.
    news_title = soup.find('h3')
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)
    # The whole article body sits in a single container div.
    news_content_div = soup.find('div', class_="main-a_03")
    if news_content_div is None:
        raise Exception(f'news content is empty. url: {url}')
    # Combine title and body into a single text block.
    news_text = title + "\n" + news_content_div.get_text(strip=True)
    return news_text


def moeaea(url, headers):
    web_url = "https://www.moeaea.gov.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))
    # Fetch and parse the article page.
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the news title.
    news_title = soup.find('div', class_="divTitle")
    if news_title is None:
        raise Exception(f'news title not found. url: {url}')
    title = news_title.get_text(strip=True)
    # The article body is only identifiable by its inline style attribute.
    news_content_div = soup.find('div', style="clear: both; margin-top: 5px;")
    if news_content_div is None:
        raise Exception(f'news content is empty. url: {url}')
    # Combine title and body into a single text block.
    news_text = title + "\n" + news_content_div.get_text(strip=True)
    return news_text


def tcx(url, headers):
    web_url = "https://www.tcx.com.tw/"
    if not url.startswith(web_url):
        raise ValueError("URL must start with {}".format(web_url))
    # The article is served by a JSON API keyed by the id in the query string.
    news_id = url.split("?")[-1]
    api_url = "https://www.tcx.com.tw/learn/front/newsDetailApi/" + news_id
    response = requests.get(api_url, headers=headers)
    response.raise_for_status()
    response.encoding = 'utf-8'
    data = response.json()
    # Extract the news title.
    title = data['detail']['title'].strip()
    # The body arrives as an HTML fragment; keep only unstyled <p> tags.
    news_content = []
    soup = BeautifulSoup(data['detail']['content'], 'html.parser')
    paragraphs = soup.find_all('p', class_=False, style=False)
    for p in paragraphs:
        news_content.append(p.get_text(strip=True))
    if not news_content:
        raise Exception(f'news content is empty. url: {url}')
    # Combine title and body into a single text block.
    news_text = title + "\n" + "\n".join(news_content)
    return news_text
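
# Assumed shape of the tcx API response, inferred only from the field accesses
# above (other fields omitted):
#   {"detail": {"title": "...", "content": "<p>...</p>..."}}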


def get_web_loader(url, web_loaders=None):
    # Default registry mapping each supported site prefix to its loader;
    # callers may pass their own list instead.
    if web_loaders is None:
        web_loaders = [
            {"web": "https://e-info.org.tw/", "web_loader": einfo},
            {"web": "https://csrone.com/", "web_loader": csrone},
            {"web": "https://enews.moenv.gov.tw/", "web_loader": enews},
            {"web": "https://esg.gvm.com.tw/", "web_loader": esg_gvm},
            {"web": "https://www.fsc.gov.tw/", "web_loader": fsc},
            {"web": "https://www.moeaea.gov.tw/", "web_loader": moeaea},
            {"web": "https://www.tcx.com.tw/", "web_loader": tcx},
        ]
    # Dispatch on URL prefix; None means no loader is registered for this site.
    for web_loader in web_loaders:
        if url.startswith(web_loader["web"]):
            return web_loader["web_loader"]
    return None


if __name__ == "__main__":
    url = "https://enews.moenv.gov.tw/Page/3B3C62C78849F32F/871dc06b-4028-42e4-8d36-656e2427180c"
    web_loader = get_web_loader(url)
    if web_loader is None:
        raise ValueError(f'no loader registered for url: {url}')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    text = web_loader(url, headers)
    print(text)
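
# A minimal sketch of supporting an additional site by passing a custom loader
# list to get_web_loader (example.com and my_site_loader are hypothetical):
#
#   def my_site_loader(url, headers):
#       response = requests.get(url, headers=headers)
#       soup = BeautifulSoup(response.text, 'html.parser')
#       return soup.get_text(strip=True)
#
#   loaders = [{"web": "https://example.com/", "web_loader": my_site_loader}]
#   loader = get_web_loader("https://example.com/news/1", web_loaders=loaders)
#   text = loader("https://example.com/news/1", headers)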