edit.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. import xml.etree.ElementTree as XET
  2. from bs4 import BeautifulSoup
  3. import requests
  4. import csv # 載入csv套件
  5. from urllib.request import urlopen
  6. # 取得XML表格
  7. import xml.etree.ElementTree as ET
  8. """
  9. ElementTree.write() 將構建的XML文檔寫入(更新)文件。
  10. Element.set(key, value) 添加和修改屬性
  11. Element.text = '' 直接改變字段內容
  12. Element.remove(Element) 刪除Element節點
  13. Element.append(Element) 為當前的Element對象添加子對象
  14. ET.SubElement(Element,tag)創建子節點
  15. """
  16. # 增加自動縮進換行
  17. def indent(elem, level=0):
  18. i = "\n" + level*" "
  19. if len(elem):
  20. if not elem.text or not elem.text.strip():
  21. elem.text = i + " "
  22. if not elem.tail or not elem.tail.strip():
  23. elem.tail = i
  24. for elem in elem:
  25. indent(elem, level+1)
  26. if not elem.tail or not elem.tail.strip():
  27. elem.tail = i
  28. else:
  29. if level and (not elem.tail or not elem.tail.strip()):
  30. elem.tail = i
  31. response = requests.get("https://hhh.com.tw/sitemap.xml")
  32. soup = BeautifulSoup(response.text, "html.parser")
  33. path = 'tmp.xml'
  34. f = open(path, 'w')
  35. f.write(soup.prettify())
  36. f.close()
  37. tree = XET.parse('tmp.xml') # 以XET套件載入XML檔案
  38. old_root = tree.getroot()
  39. #創建根節點
  40. a = ET.Element("urlset")
  41. a.attrib = {"xmlns":"http://www.sitemaps.org/schemas/sitemap/0.9","xmlns:xhtml":"http://www.w3.org/1999/xhtml"}
  42. #創建子節點,並添加屬性
  43. #創建elementtree對象,寫文件
  44. indent(a,0)
  45. tree = ET.ElementTree(a)
  46. tree.write("sitemap_new.xml",encoding="utf-8", xml_declaration=True)
  47. print("add old data ... ")
  48. for child in old_root :
  49. b = ET.Element("url")
  50. # 新增節點
  51. order1 = XET.SubElement(b, 'loc')
  52. #添加數據
  53. order1.text = child[0].text.strip()
  54. # 新增節點
  55. order2 = XET.SubElement(b, 'lastmod')
  56. order2.text = child[1].text.strip()
  57. order3 = XET.SubElement(b, 'priority')
  58. order3.text = child[2].text.strip()
  59. order4 = XET.SubElement(b, 'changefreq')
  60. order4.text = child[3].text.strip()
  61. a.append(b)
  62. indent(a,0)
  63. tree.write("sitemap_new.xml",encoding="utf-8", xml_declaration=True)
  64. print("add new data ... ")
  65. #開啟csv檔案,其中參數newline=""是為了讓資料的換行符號被正確載入,而參數encoding="utf-8"則是因為資料包含中文字元,為了讓中文字元能正確被載入
  66. with open("Table.csv",newline="",encoding="utf-8")as file:
  67. # 以csv.reader函式讀取資料,若資料的分隔字元非逗號(,),舉例像是冒號(:)則可增加參數delimiter = ":" 來讀取
  68. rows = csv.reader(file)
  69. # 以for迴圈將資料一行一行載入
  70. for r in rows:
  71. orders = XET.Element('url')
  72. # 新增節點
  73. order1 = XET.SubElement(orders, 'loc')
  74. order1.text = str(r[0]).replace('\n', '')
  75. # 新增節點
  76. order2 = XET.SubElement(orders, 'lastmod')
  77. order2.text = str(r[1]).replace('\n', '')
  78. order3 = XET.SubElement(orders, 'priority')
  79. order3.text = "1.0"
  80. order4 = XET.SubElement(orders, 'changefreq')
  81. order4.text = "daily"
  82. # 輸出 XML 原始資料
  83. a.append(orders)
  84. indent(a,0)
  85. tree.write("sitemap_new.xml",encoding="utf-8", xml_declaration=True)