123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- import xml.etree.ElementTree as XET
- from bs4 import BeautifulSoup
- import requests
- import csv
- from urllib.request import urlopen
- import xml.etree.ElementTree as ET
- """
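- ElementTree.write() writes the constructed XML document to (or updates) a file.
- Element.set(key, value) adds or modifies an attribute.
- Element.text = '' changes the element's text content directly.
- Element.remove(Element) deletes a child Element node.
- Element.append(Element) adds a child object to the current Element object.
- ET.SubElement(Element, tag) creates a child node.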
- """
def indent(elem, level=0):
    """Add newline/space padding to ``elem`` in place for pretty-printing.

    Only ``text`` and ``tail`` values that are missing or whitespace-only are
    overwritten; real content is left untouched.

    Args:
        elem: The element whose subtree should be padded.
        level: Nesting depth of ``elem``; 0 for the root.
    """
    def _is_blank(value):
        return not (value and value.strip())

    pad = "\n" + level * " "

    # Leaf: only the tail needs padding, and the root (level 0) is skipped.
    if not len(elem):
        if level and _is_blank(elem.tail):
            elem.tail = pad
        return

    if _is_blank(elem.text):
        elem.text = pad + " "
    if _is_blank(elem.tail):
        elem.tail = pad

    last_child = None
    for child in elem:
        indent(child, level + 1)
        last_child = child

    # The last child is followed by this element's closing tag, so its tail
    # drops back to this element's own depth.
    if _is_blank(last_child.tail):
        last_child.tail = pad
def get_XML(url, timeout=30):
    """Download ``url`` and save a prettified copy of the response to ``tmp.xml``.

    The response text is parsed with BeautifulSoup's ``html.parser`` and the
    output of ``prettify()`` is written to ``tmp.xml`` in the current working
    directory, replacing any existing file.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here instead of saving an error page that would only break the
    # XML parsing step later on.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    path = 'tmp.xml'
    # Explicit UTF-8 so the write does not depend on the platform default
    # encoding. NOTE(review): assumes the fetched XML declares UTF-8 (or no
    # encoding), which is what the XML parser will expect when reading it.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(soup.prettify())
def _build_url_entry(loc, lastmod=None, priority=None, changefreq=None):
    """Create a ``<url>`` element, adding only the children whose value is given."""
    entry = ET.Element("url")
    fields = (
        ("loc", loc),
        ("lastmod", lastmod),
        ("priority", priority),
        ("changefreq", changefreq),
    )
    for tag, value in fields:
        if value is not None:
            ET.SubElement(entry, tag).text = value
    return entry


def _child_text(node, index):
    """Return the stripped text of ``node[index]``, or None if it is absent."""
    if index >= len(node):
        return None
    text = node[index].text
    return None if text is None else text.strip()


def read_and_add_XML(old_path='tmp.xml', csv_path='Table.csv',
                     out_path='sitemap_new.xml'):
    """Merge the saved sitemap with CSV rows and write the combined sitemap.

    Every child of the old document's root is copied into a new ``<urlset>``.
    Its first four sub-elements are read by position and written out as
    ``loc``, ``lastmod``, ``priority`` and ``changefreq``. Each CSV row then
    adds one more ``<url>`` whose ``loc`` and ``lastmod`` come from the first
    two columns, with priority ``1.0`` and changefreq ``daily``.

    Old entries lacking some of the four sub-elements are copied without the
    missing fields; entries with no first sub-element text are skipped. CSV
    rows with fewer than two columns (including blank lines) are skipped.
    The output file is written once, after all entries have been collected.

    Args:
        old_path: Path of the previously saved sitemap XML.
        csv_path: Path of the UTF-8 CSV file holding the new entries.
        out_path: Path of the sitemap file to create.

    Raises:
        xml.etree.ElementTree.ParseError: If ``old_path`` is not well-formed XML.
        OSError: If an input file cannot be read or the output cannot be written.
    """
    old_root = ET.parse(old_path).getroot()

    urlset = ET.Element("urlset", {
        "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
        "xmlns:xhtml": "http://www.w3.org/1999/xhtml",
    })

    print("add old data ... ")
    for child in old_root:
        loc = _child_text(child, 0)
        if loc is None:
            # Without a location there is nothing useful to carry over.
            continue
        urlset.append(_build_url_entry(
            loc,
            _child_text(child, 1),
            _child_text(child, 2),
            _child_text(child, 3),
        ))

    print("add new data ... ")
    with open(csv_path, newline="", encoding="utf-8") as file:
        for r in csv.reader(file):
            if len(r) < 2:
                # Blank or incomplete row: both loc and lastmod are required.
                continue
            urlset.append(_build_url_entry(
                str(r[0]).replace('\n', ''),
                str(r[1]).replace('\n', ''),
                "1.0",
                "daily",
            ))

    indent(urlset, 0)
    ET.ElementTree(urlset).write(out_path, encoding="utf-8", xml_declaration=True)
def main():
    """Fetch the live sitemap, then merge it with the CSV entries."""
    sitemap_url = "https://hhh.com.tw/sitemap.xml"
    get_XML(sitemap_url)
    read_and_add_XML()


# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|