123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- import xml.etree.ElementTree as XET
- from bs4 import BeautifulSoup
- import requests
- import csv # 載入csv套件
- from urllib.request import urlopen
- import xml.etree.ElementTree as ET
- """
- ElementTree.write() 將構建的XML文檔寫入(更新)文件。
- Element.set(key, value) 添加和修改屬性
- Element.text = '' 直接改變字段內容
- Element.remove(Element) 刪除Element節點
- Element.append(Element) 為當前的Element對象添加子對象
- ET.SubElement(Element,tag)創建子節點
- """
def indent(elem, level=0):
    """Insert newline/indent whitespace into an element tree, in place.

    Only ``text``/``tail`` values that are empty or whitespace-only are
    overwritten, so real character data is left untouched.

    Args:
        elem: The element whose subtree should be pretty-printed.
        level: Nesting depth of ``elem``; each level adds one space of
            indentation. Callers normally leave this at 0.
    """
    i = "\n" + level * " "
    if len(elem):
        # Whitespace after the opening tag: puts the first child on its own
        # line, one level deeper than this element.
        if not elem.text or not elem.text.strip():
            elem.text = i + " "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for child in elem:
            indent(child, level + 1)
        # The recursion left the last child's tail at child depth; pull it
        # back so this element's closing tag lines up with its opening tag.
        last_child = elem[-1]
        if not last_child.tail or not last_child.tail.strip():
            last_child.tail = i
    elif level and (not elem.tail or not elem.tail.strip()):
        # Leaf below the root: only the whitespace after it needs setting.
        elem.tail = i
def get_XML(url, timeout=30):
    """Download the XML document at *url* and save it as ``tmp.xml``.

    The response body is run through BeautifulSoup's ``html.parser`` and
    written back out with ``prettify()``.

    Args:
        url: Address of the XML document to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Raised before anything is written, so an error page is never
            saved as ``tmp.xml``.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Write UTF-8 explicitly instead of relying on the platform default
    # encoding; the context manager closes the file even if the write fails.
    with open("tmp.xml", "w", encoding="utf-8") as f:
        f.write(soup.prettify())
def _local_name(tag):
    """Return *tag* without its leading ``{namespace}`` prefix, if any."""
    return tag.rsplit("}", 1)[-1]


def _build_url(fields):
    """Create a ``<url>`` element with one sub-element per (tag, text) pair."""
    url = ET.Element("url")
    for tag, text in fields:
        ET.SubElement(url, tag).text = text
    return url


def read_and_add_XML():
    """Merge ``tmp.xml`` and ``Table.csv`` into ``sitemap_new.xml``.

    Every child of the root of ``tmp.xml`` is copied into a new ``<urlset>``
    as a ``<url>`` carrying ``loc``, ``lastmod``, ``priority`` and
    ``changefreq`` (whichever of those the source entry has). Each row of
    ``Table.csv`` then adds one more ``<url>`` whose ``loc`` and ``lastmod``
    come from the first two columns, with ``priority`` fixed at ``1.0`` and
    ``changefreq`` fixed at ``daily``.

    Source fields are matched by tag name rather than by position, so an
    entry with a missing or reordered field is copied correctly instead of
    raising ``IndexError`` or putting a value under the wrong tag. Entries
    without a ``loc`` and CSV rows with fewer than two columns are skipped.

    The output file is written once, after the whole tree has been built,
    so a failure part-way through does not leave a truncated sitemap behind.
    """
    copied_tags = ("loc", "lastmod", "priority", "changefreq")

    old_root = ET.parse("tmp.xml").getroot()
    urlset = ET.Element(
        "urlset",
        {
            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
            "xmlns:xhtml": "http://www.w3.org/1999/xhtml",
        },
    )

    print("add old data ... ")
    for entry in old_root:
        found = {}
        for node in entry:
            # First occurrence wins; an element without text counts as "".
            found.setdefault(_local_name(node.tag), (node.text or "").strip())
        if "loc" not in found:
            continue
        urlset.append(
            _build_url([(tag, found[tag]) for tag in copied_tags if tag in found])
        )

    print("add new data ... ")
    # newline="" lets the csv module handle line endings inside quoted
    # fields; "utf-8-sig" reads plain UTF-8 and also drops a leading BOM so
    # it cannot end up inside the first <loc>.
    with open("Table.csv", newline="", encoding="utf-8-sig") as file:
        for row in csv.reader(file):
            if len(row) < 2:
                # Blank or incomplete line: nothing usable to add.
                continue
            urlset.append(
                _build_url(
                    [
                        ("loc", row[0].replace("\n", "")),
                        ("lastmod", row[1].replace("\n", "")),
                        ("priority", "1.0"),
                        ("changefreq", "daily"),
                    ]
                )
            )

    indent(urlset, 0)
    ET.ElementTree(urlset).write(
        "sitemap_new.xml", encoding="utf-8", xml_declaration=True
    )
def main():
    """Download the published sitemap, then build the merged sitemap file."""
    sitemap_url = "https://hhh.com.tw/sitemap.xml"
    get_XML(sitemap_url)
    read_and_add_XML()


if __name__ == "__main__":
    main()
|