"""Download a sitemap, copy its entries and append new URLs from a CSV file.

ElementTree quick reference used by this script:

* ``ElementTree.write()``        write (update) the built XML document to a file
* ``Element.set(key, value)``    add or modify an attribute
* ``Element.text = ''``          change the text content directly
* ``Element.remove(Element)``    delete a child node
* ``Element.append(Element)``    add a child object to the current element
* ``ET.SubElement(Element, tag)`` create a child node
"""

import csv  # load the csv module
import xml.etree.ElementTree as ET
import xml.etree.ElementTree as XET
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup


def indent(elem, level=0):
    """Add newline/indent whitespace to *elem* and its descendants in place.

    Only ``text``/``tail`` values that are empty or whitespace-only are
    touched, so real text content is never modified.

    Args:
        elem: The element to pretty-print.
        level: Nesting depth of *elem*; one space of indentation per level.
    """
    i = "\n" + level * " "
    children = list(elem)
    if children:
        if not elem.text or not elem.text.strip():
            elem.text = i + " "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for child in children:
            indent(child, level + 1)
        # The last child closes the parent, so its tail drops back to the
        # parent's indentation level.
        last = children[-1]
        if not last.tail or not last.tail.strip():
            last.tail = i
    elif level and (not elem.tail or not elem.tail.strip()):
        elem.tail = i


def get_XML(url, path="tmp.xml", timeout=30):
    """Download the XML document at *url* and save a prettified copy.

    Args:
        url: Address of the XML document to fetch.
        path: File the prettified document is written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail early instead of saving an HTML error page as the sitemap.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(path, "w", encoding="utf-8") as f:
        f.write(soup.prettify())


def _element_text(node):
    """Return the stripped text of *node*, or '' when it has no text."""
    return (node.text or "").strip()


def _build_url_entry(loc, lastmod, priority, changefreq):
    """Create a ``<url>`` element holding the four given field values."""
    entry = XET.Element("url")
    for tag, value in (
        ("loc", loc),
        ("lastmod", lastmod),
        ("priority", priority),
        ("changefreq", changefreq),
    ):
        XET.SubElement(entry, tag).text = value
    return entry


def _write_sitemap(tree, path):
    """Re-indent the whole tree and write it to *path* as UTF-8 XML."""
    indent(tree.getroot(), 0)
    tree.write(path, encoding="utf-8", xml_declaration=True)


def read_and_add_XML(
    source_path="tmp.xml",
    csv_path="Table.csv",
    output_path="sitemap_new.xml",
):
    """Build a new sitemap from an existing one plus rows of a CSV file.

    Every child of the root of *source_path* is copied into a fresh
    ``<urlset>``; then each CSV row contributes one more ``<url>`` with
    ``priority`` "1.0" and ``changefreq`` "daily". The result is written to
    *output_path* after each of the two phases.

    Args:
        source_path: Existing sitemap to copy entries from.
        csv_path: UTF-8 CSV file; column 0 is the URL, column 1 the
            last-modified value.
        output_path: Destination of the generated sitemap.

    Raises:
        IndexError: If a source entry has fewer than four children or a
            non-empty CSV row has fewer than two columns.
    """
    old_root = XET.parse(source_path).getroot()  # load the XML file

    urlset = ET.Element(
        "urlset",
        {
            "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
            "xmlns:xhtml": "http://www.w3.org/1999/xhtml",
        },
    )
    tree = ET.ElementTree(urlset)

    print("add old data ... ")
    for child in old_root:
        # NOTE(review): children are read by position, which assumes the
        # source lists loc, lastmod, priority, changefreq in exactly this
        # order - confirm against the downloaded sitemap.
        urlset.append(
            _build_url_entry(
                _element_text(child[0]),
                _element_text(child[1]),
                _element_text(child[2]),
                _element_text(child[3]),
            )
        )
    _write_sitemap(tree, output_path)

    print("add new data ... \n")
    # newline="" lets the csv module handle line endings itself, and
    # encoding="utf-8" is needed because the data contains Chinese text.
    with open(csv_path, newline="", encoding="utf-8") as file:
        # Pass delimiter=":" (for example) to csv.reader if the file is not
        # comma-separated.
        for row in csv.reader(file):
            if not row:
                # Blank lines come back as empty lists; nothing to add.
                continue
            urlset.append(
                _build_url_entry(
                    str(row[0]).replace("\n", ""),
                    str(row[1]).replace("\n", ""),
                    "1.0",
                    "daily",
                )
            )
    _write_sitemap(tree, output_path)


def main():
    """Fetch the live sitemap, then merge it with the CSV rows."""
    get_XML("https://hhh.com.tw/sitemap.xml")
    read_and_add_XML()


if __name__ == '__main__':
    main()