choozmo
/
kw_tools


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
							import xml.etree.ElementTree as XET
from bs4 import BeautifulSoup
import requests

import csv   # 載入csv套件


from urllib.request import urlopen
     # 取得XML表格 

import xml.etree.ElementTree as ET

"""
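    ElementTree.write()         Write the constructed XML document to (or update) a file.
    Element.set(key, value)     Add or modify an attribute.
    Element.text = ''           Change the element's text content directly.
    Element.remove(Element)     Remove a child Element node.
    Element.append(Element)     Append a child to the current Element object.
    ET.SubElement(Element, tag) Create a child node.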
"""

# Pretty-print helper: add automatic indentation and line breaks.
def indent(elem, level=0):
    """Insert newline/indent whitespace into an element tree, in place.

    Walks ``elem`` recursively and rewrites blank (``None``, empty or
    whitespace-only) ``text``/``tail`` values so that the serialized XML
    places each element on its own line, indented two spaces per nesting
    level. Non-blank text and tails are left untouched.

    Args:
        elem: Element to indent; it is modified in place.
        level: Nesting depth of ``elem`` (0 for the root).
    """
    def is_blank(value):
        return not value or not value.strip()

    pad = "\n" + "  " * level
    children = list(elem)

    if not children:
        # A leaf only needs its tail fixed, and never at the root level.
        if level and is_blank(elem.tail):
            elem.tail = pad
        return

    if is_blank(elem.text):
        elem.text = pad + "  "
    if is_blank(elem.tail):
        elem.tail = pad

    for child in children:
        indent(child, level + 1)

    # The last child's tail sits right before this element's closing tag,
    # so it is pulled back to this element's own indentation.
    last_child = children[-1]
    if is_blank(last_child.tail):
        last_child.tail = pad
            
# Download the current sitemap. The timeout keeps the script from hanging
# indefinitely, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the sitemap.
response = requests.get("https://hhh.com.tw/sitemap.xml", timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Dump the prettified markup to a scratch file. The encoding is explicit
# because the platform default may be unable to encode the text, and the
# context manager closes the file even if the write fails.
path = 'tmp.xml'
with open(path, 'w', encoding='utf-8') as f:
    f.write(soup.prettify())

tree = XET.parse(path)  # load the scratch file with ElementTree
old_root = tree.getroot()

# Build the root <urlset> element of the new sitemap, passing its
# namespace attributes straight to the constructor.
a = ET.Element(
    "urlset",
    {
        "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
        "xmlns:xhtml": "http://www.w3.org/1999/xhtml",
    },
)

# Wrap the root in an ElementTree and write the (still empty) document.
tree = ET.ElementTree(a)
indent(a, 0)
tree.write("sitemap_new.xml", encoding="utf-8", xml_declaration=True)

print("add old data ... ")

# Copy every entry of the downloaded sitemap into the new tree.
# Children are matched by tag name rather than by position, so an entry
# whose elements are ordered differently is not mislabelled, and an entry
# lacking an optional element (or its text) no longer raises.
for child in old_root:
    # Local tag name -> stripped text. Tags may carry a "{namespace}"
    # prefix after parsing; the first occurrence of each tag wins.
    fields = {}
    for node in child:
        fields.setdefault(node.tag.rsplit('}', 1)[-1], (node.text or '').strip())

    # An entry without a location cannot produce a useful <url>; skip it.
    if not fields.get('loc'):
        continue

    b = ET.Element("url")
    for tag in ('loc', 'lastmod', 'priority', 'changefreq'):
        value = fields.get(tag)
        if value:
            XET.SubElement(b, tag).text = value
    a.append(b)

indent(a, 0)
tree.write("sitemap_new.xml", encoding="utf-8", xml_declaration=True)

print("add new data ... ")

# Open the CSV file. newline="" lets the csv module handle line endings
# itself. "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
# would otherwise end up inside the first <loc> value.
with open("Table.csv", newline="", encoding="utf-8-sig") as file:
    # csv.reader splits on commas by default; pass delimiter=":" (for
    # example) if the data uses another separator.
    for r in csv.reader(file):
        # csv.reader yields [] for blank lines; skip anything that lacks
        # the two required columns (loc, lastmod) instead of raising.
        if len(r) < 2:
            continue

        orders = XET.Element('url')
        XET.SubElement(orders, 'loc').text = r[0].replace('\n', '')
        XET.SubElement(orders, 'lastmod').text = r[1].replace('\n', '')
        # Every CSV-sourced entry gets the same fixed priority and frequency.
        XET.SubElement(orders, 'priority').text = "1.0"
        XET.SubElement(orders, 'changefreq').text = "daily"

        a.append(orders)

indent(a, 0)
tree.write("sitemap_new.xml", encoding="utf-8", xml_declaration=True)