edit.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. import xml.etree.ElementTree as XET
  2. from bs4 import BeautifulSoup
  3. import requests
  4. import csv # 載入csv套件
  5. from urllib.request import urlopen
  6. import xml.etree.ElementTree as ET
  7. """
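ElementTree.write()          writes the constructed XML document to (or updates) a file.
Element.set(key, value)      adds or modifies an attribute.
Element.text = ''            directly changes the element's text content.
Element.remove(Element)      removes a child Element node.
Element.append(Element)      adds a child element to the current Element object.
ET.SubElement(Element, tag)  creates a child node.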
  14. """
# Add automatic indentation and line breaks to the XML tree
  16. def indent(elem, level=0):
  17. i = "\n" + level*" "
  18. if len(elem):
  19. if not elem.text or not elem.text.strip():
  20. elem.text = i + " "
  21. if not elem.tail or not elem.tail.strip():
  22. elem.tail = i
  23. for elem in elem:
  24. indent(elem, level+1)
  25. if not elem.tail or not elem.tail.strip():
  26. elem.tail = i
  27. else:
  28. if level and (not elem.tail or not elem.tail.strip()):
  29. elem.tail = i
# Download the XML from the web
  31. def get_XML(url):
  32. response = requests.get(url)
  33. soup = BeautifulSoup(response.text, "html.parser")
  34. path = 'tmp.xml'
  35. f = open(path, 'w')
  36. f.write(soup.prettify())
  37. f.close()
  38. def read_and_add_XML():
  39. tree = XET.parse('tmp.xml') # 以XET套件載入XML檔案
  40. old_root = tree.getroot()
  41. a = ET.Element("urlset")
  42. a.attrib = {"xmlns":"http://www.sitemaps.org/schemas/sitemap/0.9","xmlns:xhtml":"http://www.w3.org/1999/xhtml"}
  43. indent(a,0)
  44. tree = ET.ElementTree(a)
  45. tree.write("sitemap_new.xml",encoding="utf-8", xml_declaration=True)
  46. print("add old data ... ")
  47. for child in old_root :
  48. b = ET.Element("url")
  49. # 新增節點
  50. order1 = XET.SubElement(b, 'loc')
  51. #添加數據
  52. order1.text = child[0].text.strip()
  53. # 新增節點
  54. order2 = XET.SubElement(b, 'lastmod')
  55. order2.text = child[1].text.strip()
  56. order3 = XET.SubElement(b, 'priority')
  57. order3.text = child[2].text.strip()
  58. order4 = XET.SubElement(b, 'changefreq')
  59. order4.text = child[3].text.strip()
  60. a.append(b)
  61. indent(a,0)
  62. tree.write("sitemap_new.xml",encoding="utf-8", xml_declaration=True)
  63. print("add new data ... ")
  64. #開啟csv檔案,其中參數newline=""是為了讓資料的換行符號被正確載入,而參數encoding="utf-8"則是因為資料包含中文字元,為了讓中文字元能正確被載入
  65. with open("Table.csv",newline="",encoding="utf-8")as file:
  66. # 以csv.reader函式讀取資料,若資料的分隔字元非逗號(,),舉例像是冒號(:)則可增加參數delimiter = ":" 來讀取
  67. rows = csv.reader(file)
  68. # 以for迴圈將資料一行一行載入
  69. for r in rows:
  70. orders = XET.Element('url')
  71. # 新增節點
  72. order1 = XET.SubElement(orders, 'loc')
  73. order1.text = str(r[0]).replace('\n', '')
  74. # 新增節點
  75. order2 = XET.SubElement(orders, 'lastmod')
  76. order2.text = str(r[1]).replace('\n', '')
  77. order3 = XET.SubElement(orders, 'priority')
  78. order3.text = "1.0"
  79. order4 = XET.SubElement(orders, 'changefreq')
  80. order4.text = "daily"
  81. # 輸出 XML 原始資料
  82. a.append(orders)
  83. indent(a,0)
  84. tree.write("sitemap_new.xml",encoding="utf-8", xml_declaration=True)
# Create child nodes and add attributes
# Create the ElementTree object and write the file
  87. def main():
  88. get_XML("https://hhh.com.tw/sitemap.xml")
  89. read_and_add_XML()
  90. if __name__ == '__main__':
  91. main()