#XML #xml #py #python #xmlhandling #xmlparsing
import os
import sys
try:
import xml.etree.ElementTree as ET
except Exception as e:
print(str(e))
else:
print(dir(ET))
def pplist(list_in):
    """Pretty-print *list_in* to standard output with a four-space indent."""
    from pprint import PrettyPrinter

    printer = PrettyPrinter(indent=4)
    printer.pprint(list_in)
def MapXmlLvl2LoL(file_in):
    """Map the first two levels below the root into a list.

    Each child of the root contributes one entry: the Element itself when
    it has no children, otherwise a list holding its child Elements.

    file_in -- path (or file object) accepted by ET.parse.
    Returns the list described above, in document order.
    """
    root = ET.parse(file_in).getroot()
    mapped = []
    for first_level in root:
        children = list(first_level)
        # A childless node is stored as-is; otherwise store its children.
        mapped.append(children if children else first_level)
    return mapped
def MapXmlLvl2(file_in):
    """Print the root, its children and its grandchildren as Element objects.

    file_in -- path (or file object) accepted by ET.parse.
    Nothing is returned; one line is printed per visited element.
    """
    root = ET.parse(file_in).getroot()
    print("Root element: " + str(root))
    for elem in root:
        print("First Level element: " + str(elem))
        # Iterating a childless element is simply an empty loop, so no
        # explicit emptiness check (the old bare `next` was a no-op) is needed.
        for subelem in elem:
            print("Second Level element: " + str(subelem))
def MapXmlLvl2attrib(file_in):
    """Print tag and attribute dict for the root and the two levels below it.

    file_in -- path (or file object) accepted by ET.parse.
    Nothing is returned; one "<tag> : <attrib>" line is printed per element.
    """
    root = ET.parse(file_in).getroot()
    print("Root element: " + str(root.tag) + " : " + str(root.attrib))
    for elem in root:
        print("First Level element: " + str(elem.tag) + " : " + str(elem.attrib))
        # A childless element yields an empty loop; the former bare `next`
        # expression did nothing and has been dropped.
        for subelem in elem:
            print("Second Level element: " + str(subelem.tag) + " : " + str(subelem.attrib))
def MapXmlLvl2tag(file_in):
    """Print the tag of the root and of every element up to two levels down.

    file_in -- path (or file object) accepted by ET.parse.
    Nothing is returned; one line is printed per visited element.
    """
    root = ET.parse(file_in).getroot()
    print("Root element: " + str(root.tag))
    for elem in root:
        print("First Level element: " + str(elem.tag))
        # No emptiness test needed: a leaf element produces no iterations.
        for subelem in elem:
            print("Second Level element: " + str(subelem.tag))
def MapXmlLvl2full(file_in):
    """Print tag, attributes and text for the root and two levels below it.

    First-level lines are prefixed with one tab and second-level lines with
    two tabs. A blank separator is printed once the whole tree was dumped.

    file_in -- path (or file object) accepted by ET.parse.
    """
    root = ET.parse(file_in).getroot()
    print("Root element: " + str(root.tag) + " : " + str(root.attrib) + " : " + str(root.text))
    for elem in root:
        print("\tFirst Level element: " + str(elem.tag) + " : " + str(elem.attrib) + " : " + str(elem.text))
        # Leaf elements simply produce no iterations here; the old bare
        # `next` expression in the empty branch was a no-op.
        for subelem in elem:
            print("\t\tSecond Level element: " + str(subelem.tag) + " : " + str(subelem.attrib) + " : " + str(subelem.text))
    # NOTE(review): the source lost its indentation; this separator is
    # assumed to belong to the end of the function - confirm.
    print("\n")
def ExtractData2LevelIn(file_in, position):
    """Print the text of the child at index *position* in every record.

    Works for XML with double nesting, i.e.
        <parent>
            <child>
                <subchild>
    For deeper nesting an additional index per level would be needed
    (e.g. root[0][1][2] for triple nesting).

    file_in  -- path (or file object) accepted by ET.parse.
    position -- index of the sub-element to read inside each record.
    An IndexError propagates when a record has no child at that index.
    """
    root = ET.parse(file_in).getroot()
    for record_index, record in enumerate(root):
        print("i is: " + str(record_index))
        print(record[position].text)
def ExtractDataSpecRecText(file_in, record_number):
    """Print the text of every child of one record (a child of the root).

    file_in       -- path (or file object) accepted by ET.parse.
    record_number -- index of the record below the root; anything int()
                     accepts (e.g. 1 or "1").
    An IndexError propagates when the record does not exist.
    """
    root = ET.parse(file_in).getroot()
    # Convert once so counting and indexing use the same integer; indexing
    # with the raw (possibly string) value used to raise TypeError.
    record = root[int(record_number)]
    for field in record:
        print(field.text)
def ExtractDataSpecRecAttr(file_in, record_number):
    """Print the attribute dict of every child of one record.

    file_in       -- path (or file object) accepted by ET.parse.
    record_number -- index of the record below the root; anything int()
                     accepts (e.g. 2 or "2").
    An IndexError propagates when the record does not exist.
    """
    root = ET.parse(file_in).getroot()
    # Normalise the index once; the raw value was previously used for
    # indexing and failed with TypeError when given as a string.
    record = root[int(record_number)]
    for field in record:
        print(field.attrib)
def ExtractDataSpecRecNode(file_in, record_number):
    """Print every child Element object of one record.

    file_in       -- path (or file object) accepted by ET.parse.
    record_number -- index of the record below the root; anything int()
                     accepts (e.g. 3 or "3").
    An IndexError propagates when the record does not exist.
    """
    root = ET.parse(file_in).getroot()
    # Same integer for lookup and iteration; indexing with an unconverted
    # string used to raise TypeError.
    record = root[int(record_number)]
    for field in record:
        print(field)
# The XML document to explore is named by the first command-line argument.
file_in = sys.argv[1]
# Parse once at module level; the indexing demos below reuse this root.
tree = ET.parse(file_in)
root = tree.getroot()

# Run each two-level mapping helper against the same file.
for mapper in (MapXmlLvl2, MapXmlLvl2attrib, MapXmlLvl2tag):
    mapper(file_in)
pplist(MapXmlLvl2LoL(file_in))
MapXmlLvl2full(file_in)

# Element.attrib exposes all attribute/value pairs of a node as a dict.
print("All item attributes")
for elem in root:
    print("Handling element: ", str(elem))
    print("Handling element attribute: ", str(elem.attrib))
    for subelem in elem:
        print("Handling subelement: " + str(subelem))
        print("Printing subelement attributes" + str(subelem.attrib))
        # Reading subelem.attrib['name'] directly would only work when the
        # 'name' attribute is present on every node, so it is not done here.

# Second pass: same walk, additionally showing the text of each sub-element.
print("\n\nAll item attributes - Data accessing")
for elem in root:
    print("Handling element: ", str(elem))
    for subelem in elem:
        print("Handling subelement: " + str(subelem))
        print("Printing subelement attributes: " + str(subelem.attrib))
        print("Printing subelement value: " + str(subelem.text))

# Elements can be indexed like nested sequences: in root[x][y], x selects a
# child of the root and y selects one of that child's own children.
first_record = root[0]
for column in range(3):
    print(first_record[column])
for column in range(3):
    print(first_record[column].text)
for column in range(3):
    print(first_record[column].attrib)
print(root[1][0].text)
print(root[2][0].text)

# Exercise the extraction helpers with fixed positions / record numbers.
ExtractData2LevelIn(file_in, 0)
ExtractData2LevelIn(file_in, 4)
ExtractDataSpecRecText(file_in, 1)
ExtractDataSpecRecAttr(file_in, 2)
ExtractDataSpecRecNode(file_in, 3)
import os
import sys
import xml.etree.ElementTree as ET
def countxmlETsim(file_in):
    '''
    Name: countxmlETsim
    Description: Counts the elements directly under the root tag,
                 i.e. the xml "records".
    Function: countxmlETsim(file_in)
    Input: <file_in> path (or file object) accepted by ET.parse
    Output: <integer> number of direct children of the root, or
            False when the file cannot be opened or parsed
    Usage: countxmlETsim("records.xml")
    Notes: An empty root yields 0, which compares equal to False;
           use "is False" to detect the failure case.
    '''
    import xml.etree.ElementTree as ET

    # Parsing is the step that can actually fail (missing file, malformed
    # XML), so it has to live inside the guarded section.
    try:
        root = ET.parse(file_in).getroot()
    except (ET.ParseError, OSError):
        return False
    return len(root)
# Input document is taken from the first command-line argument.
file_in = sys.argv[1]
tree = ET.parse(file_in)
root = tree.getroot()
# Total amount of items: once via len() on the root directly ...
record_total = len(root)
print(record_total)
# ... and once through the helper defined above.
print(countxmlETsim(file_in))
# #########################################################
# This is an example of how to create an xml file by
# declaring all elements one by one.
#
# Specific attributes are declared independently.
import os
import sys
import xml.etree.ElementTree as ET
# Destination path for the generated document (first command-line argument).
file_in = sys.argv[1]
# create the file structure: <data><items><item/><item/></items></data>
data = ET.Element('data')
items = ET.SubElement(data, 'items')
item1 = ET.SubElement(items, 'item')
item2 = ET.SubElement(items, 'item')
#i_item1 = ET.SubElement(item1, 'gear')
# assign labels (attributes)
item1.set('name', 'item1')
item2.set('name', 'item2')
#i_item1.set('name', 'gear1')
# assign values (text) to the labelled tags
item1.text = 'item1abc'
item2.text = 'item2abc'
#i_item1.text = 'wheel'
# create the file with the results; ET.tostring returns bytes, hence decode.
mydata = ET.tostring(data)
# The context manager guarantees the handle is flushed and closed, which the
# previous bare open() never did.
with open(file_in, 'w') as myfile:
    myfile.write(mydata.decode('utf8'))
import os
import sys
# A different way of parsing XML is by using "events"
# The parser generates "start" events for opening tags and "end"
# events for closing tags.
# Data can be extracted from the document during the parsing phase
# by iterating over the event stream, that way the document does not
# need to be maintained in memory.
# iterparse() returns an iterable that produces tuples
# containing the name of the event and the node triggering the event.
# Events can be:
# "start" : new tag has been encountered.
# "end" : Closing angle tag has been processed. All of the children were already processed.
# "start-ns": Start a namespace declaration
# "end-ns"  : End a namespace declaration
from xml.etree.ElementTree import iterparse
# Current nesting depth; raised after each "start", lowered before each "end".
depth = 0
prefix_width = 8
prefix_dots = '.' * prefix_width
# The prefix is truncated to prefix_len dots via the precision field and the
# empty suffix pads the remainder so the event column stays aligned.
line_template = '{prefix:<0.{prefix_len}}{event:<8}{suffix:<{suffix_len}} {tag:<12} {node_id}'
for (event, node) in iterparse('podcasts.opml', ['start', 'end', 'start-ns', 'end-ns']):
    if event == 'end':
        depth -= 1
    prefix_len = depth * 2
    # Namespace events do not carry an Element ("start-ns" yields a
    # (prefix, uri) tuple, "end-ns" yields None), so fall back to the plain
    # string form instead of failing on a missing .tag attribute.
    tag = node.tag if hasattr(node, 'tag') else str(node)
    print(line_template.format(prefix=prefix_dots,
                               prefix_len=prefix_len,
                               suffix='',
                               # A negative width is an invalid format spec;
                               # clamp it for documents nested deeper than
                               # prefix_width allows.
                               suffix_len=max(prefix_width - prefix_len, 0),
                               tag=tag,
                               node_id=id(node),
                               event=event,
                               ))
    if event == 'start':
        depth += 1
# This can be used in a "line by line" fashion ie
# when converting XML input to some other format.
# ie XML TO CSV
import csv
from xml.etree.ElementTree import iterparse
import sys
# Rows go to stdout; every non-numeric field is quoted.
writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC)
# Name of the most recently seen group outline (one without an xmlUrl).
group_name = ''
# Convert XML to CSV by reacting to "start" events only.
for (event, node) in iterparse('podcasts.opml', events=['start']):
    # Only <outline> nodes matter; everything else is ignored.
    if node.tag == 'outline':
        attrs = node.attrib
        if attrs.get('xmlUrl'):
            # An outline with a feed URL is a podcast entry: emit one row.
            row = (
                group_name,
                attrs['text'],
                attrs['xmlUrl'],
                attrs.get('htmlUrl', ''),
            )
            writer.writerow(row)
        else:
            # An outline without a feed URL starts a new group.
            group_name = attrs['text']
import os
import sys
file_in = sys.argv[1]
try:
import xml.etree.ElementTree as ET
except Exception as e:
print(str(e))
tree = ET.parse(file_in)
root = tree.getroot()
# find(match, namespaces=None) returns the first matching child, or None.
for elem in root:
    first_item = elem.find('item')
    # Compare against None explicitly: an Element without children is falsy,
    # and a missing match would otherwise fail with AttributeError on .get().
    if first_item is None:
        continue
    print(first_item.get('name'))
# findall(match, namespaces=None) returns every matching direct child.
for elem in root:
    for subelem in elem.findall('item'):
        # if we don't know the name of the attribute - get the dict
        print(subelem.attrib)
        # if we know the name of the attribute, access it directly
        print(subelem.get('name'))
        # the text content of the element is available as .text
        print(subelem.text)
import os
import sys
from xml.etree import ElementTree
with open('podcasts.opml', 'rt') as f:
    tree = ElementTree.parse(f)
# Printing the tree shows the ElementTree object itself.
print(tree)
# Iterate over all nodes with .iter()
for node in tree.iter():
    print(node.tag, node.attrib)
# Iterate over all outline nodes only: passing a tag limits iter()
# to nodes with that tag.
for node in tree.iter('outline'):
    name = node.attrib.get('text')
    url = node.attrib.get('xmlUrl')
    if name and url:
        print(" %s :: %s " % (name, url))
    else:
        print(name)
# Use findall to look for nodes with more descriptive search
# characteristics.
for node in tree.findall('.//outline'):
    url = node.attrib.get('xmlUrl')
    if url:
        print(url)
# We could limit it even further, by having it iterate over
# all the inner outline nodes only.
for node in tree.findall('.//outline/outline'):
    url = node.attrib.get('xmlUrl')
    print(url)
# Parsing specific node attributes
with open('data.xml', 'rt') as f:
    tree = ElementTree.parse(f)
# Set a specific node into a variable. The path is written relative to the
# root ('./...'): a leading '/' makes ElementTree.find emit a FutureWarning
# and is internally rewritten to exactly this form.
node = tree.find('./with_attributes')
print(node.tag)
# Parsing attribute key/value pairs
for name, value in sorted(node.attrib.items()):
    print(' %-4s = "%s"' % (name, value))
# Parsing text and tail text
for path in ['./child', './child_with_tail']:
    node = tree.find(path)
    print(node.tag)
    print('Child node text: ', node.text)
    print('and tail text : ', node.tail)
# If we are dealing with XML entity references embedded
# in an xml document those are converted to chars before
# values are returned.
node = tree.find('entity_expansion')
print(node.tag)
print(' in attribute:', node.attrib['attribute'])
print(' in text :', node.text)
import os
import sys
try:
import xml.etree.ElementTree as ET
except Exception as e:
print(str(e))
else:
pass
file_in = sys.argv[1]
def pplist(list_in):
    """Pretty-print *list_in* to standard output with a four-space indent."""
    from pprint import PrettyPrinter

    formatter = PrettyPrinter(indent=4)
    formatter.pprint(list_in)
def MapXmlLvl3(file_in):
    """Print the root and the three levels below it as Element objects.

    file_in -- path (or file object) accepted by ET.parse.
    Nothing is returned; one line is printed per visited element.
    """
    root = ET.parse(file_in).getroot()
    print("Root element: " + str(root))
    for elem in root:
        print("First Level element: " + str(elem))
        # Leaf elements yield empty loops, so the former emptiness checks
        # (whose bare `next` branch did nothing) are not required.
        for subelem in elem:
            print("Second Level element: " + str(subelem))
            for thirdlevel in subelem:
                print("Third level element: " + str(thirdlevel))
def MapXmlLvl3attrib(file_in):
    """Print Element object and attribute dict for the root and three levels.

    file_in -- path (or file object) accepted by ET.parse.
    Nothing is returned; one "<element> : <attrib>" line is printed per
    visited element.
    """
    root = ET.parse(file_in).getroot()
    print("Root element: " + str(root) + " : " + str(root.attrib))
    for elem in root:
        print("First Level element: " + str(elem) + " : " + str(elem.attrib))
        # Iterating a leaf is an empty loop; the old bare `next` expressions
        # in the empty branches were no-ops and are gone.
        for subelem in elem:
            print("Second Level element: " + str(subelem) + " : " + str(subelem.attrib))
            for thirdlevel in subelem:
                print("Third level element: " + str(thirdlevel) + " : " + str(thirdlevel.attrib))
# Dump the three-level structure first as Element objects, then with
# their attribute dicts.
for level3_mapper in (MapXmlLvl3, MapXmlLvl3attrib):
    level3_mapper(file_in)
import os
import sys
file_in = sys.argv[1]
from xml.dom import minidom
def prettify(elem):
    """Return a pretty-printed XML string for the Element.

    The element is serialised as UTF-8 with ElementTree, re-parsed with
    minidom and rendered with a one-space indent per level.
    """
    import xml.etree.ElementTree as ET
    from xml.dom import minidom

    serialized = ET.tostring(elem, 'utf-8')
    document = minidom.parseString(serialized)
    return document.toprettyxml(indent=" ")
def indent(elem, level=0):
    """Insert indentation whitespace into *elem* and its subtree, in place.

    Blank (missing or whitespace-only) text and tail values are replaced by
    a line separator followed by one space per nesting level; text or tail
    holding real content is left untouched.

    elem  -- element whose subtree is re-indented.
    level -- nesting depth of *elem*; 0 for the outermost call.
    """
    pad = os.linesep + level * " "

    def is_blank(value):
        return not value or not value.strip()

    if not len(elem):
        # Leaf: only the tail needs padding, and never for the top element.
        if level and is_blank(elem.tail):
            elem.tail = pad
        return

    if is_blank(elem.text):
        elem.text = pad + " "
    if is_blank(elem.tail):
        elem.tail = pad
    last_child = None
    for child in elem:
        indent(child, level + 1)
        last_child = child
    # The last child's tail precedes this element's closing tag, so it is
    # pulled back to this element's own level.
    if is_blank(last_child.tail):
        last_child.tail = pad
try:
import xml.etree.ElementTree as ET
except Exception as e:
print(str(e))
tree = ET.parse(file_in)
root = tree.getroot()
# Per <item>: change the field text, change the 'name' attribute and add an
# extra 'name2' attribute.
for elem in root.iter('item'):
    elem.text = 'new text'
    elem.set('name', 'newitem')
    elem.set('name2', 'newitem2')
# Conditionally changing a field text
for elem in root.iter('item'):
    if elem.attrib['name'] == 'item2':
        elem.text = 'new text'
# Conditionally removing the attributes: assign a fresh, empty mapping
# (any other mapping could be assigned the same way).
for elem in root.iter('item'):
    if elem.text == 'new text':
        elem.attrib = {}
# Write the modified tree with tree.write
tree.write('newitems3.xml')

# Creating XML sub-elements with makeelement() and SubElement().
# Adding an element to the ROOT NODE takes three steps:
# 1. declare the attrib dictionary
attrib = {}
# 2. construct the element
element = root.makeelement('seconditems', attrib)
# 3. append it to root
root.append(element)
# Adding an element below the second child of root
attrib = {'name2': 'secondname2'}
# makeelement only builds the node; it is not attached anywhere here.
subelement = root[0][1].makeelement('seconditem', attrib)
second_group = root[1]
ET.SubElement(second_group, 'seconditem', attrib)
root[1][0].text = 'seconditemabc'
indent(root)
# create a new xml file with the new element
tree.write('newitems_v1.xml')

# Prettifying XML: via the helper function, then with the same steps inline.
print(prettify(root))
reparsed_document = minidom.parseString(ET.tostring(tree.getroot(), 'utf-8'))
print(reparsed_document.toprettyxml(indent=" "))

# Deleting from XML: an attribute, one sub-element, all sub-elements.
# Each variant starts again from a freshly parsed copy of the input.
tree = ET.parse(file_in)
root = tree.getroot()
# removing the name attribute from the first node below root's first child
root[0][0].attrib.pop('name', None)
tree.write('newitems_v2_del.xml')

tree = ET.parse(file_in)
root = tree.getroot()
first_group = root[0]
first_group.remove(first_group[0])
tree.write('newitems_v2_sub.xml')
# write with declaration
tree.write("newitems_v2_sub2.xml", encoding="utf-8", xml_declaration=True)

tree = ET.parse(file_in)
root = tree.getroot()
root[0].clear()
tree.write('newitems_v2_clr.xml')
def indent(elem, level=0):
    """Insert indentation whitespace into *elem* and its subtree, in place.

    Blank (missing or whitespace-only) text and tail values are replaced by
    a line separator followed by one space per nesting level; text or tail
    holding real content is left untouched.

    elem  -- element whose subtree is re-indented.
    level -- nesting depth of *elem*; 0 for the outermost call.
    """
    pad = os.linesep + level * " "

    def is_blank(value):
        return not value or not value.strip()

    if not len(elem):
        # Leaf: only the tail needs padding, and never for the top element.
        if level and is_blank(elem.tail):
            elem.tail = pad
        return

    if is_blank(elem.text):
        elem.text = pad + " "
    if is_blank(elem.tail):
        elem.tail = pad
    last_child = None
    for child in elem:
        indent(child, level + 1)
        last_child = child
    # The last child's tail precedes this element's closing tag, so it is
    # pulled back to this element's own level.
    if is_blank(last_child.tail):
        last_child.tail = pad
try:
import xml.etree.ElementTree as ET
except Exception as e:
print(str(e))
import os
import sys
try:
import xml.etree.ElementTree as ET
except Exception as e:
print(str(e))
else:
print(dir(ET))
# The XML document to explore is named by the first command-line argument.
file_in = sys.argv[1]
# Parse xml file
tree = ET.parse(file_in)
# Get root element
root = tree.getroot()
# Iterate over all items and print attribute pairs
# Example of how to read the 'name' attribute value
print("All item attributes")
for elem in root:
    print("Handling element: ", str(elem))
    for subelem in elem:
        print("Handling subelement: " + str(subelem))
        print("Printing subelement attributes" + str(subelem.attrib))
        # .get() yields None for nodes without a 'name' attribute, whereas
        # subelem.attrib['name'] would abort the walk with a KeyError.
        print(subelem.get('name'))
# Iterate over all items and print attribute pairs and text
print("\n\nAll item attributes - Data accessing")
for elem in root:
    print("Handling element: ", str(elem))
    for subelem in elem:
        print("Handling subelement: " + str(subelem))
        print("Printing subelement attributes: " + str(subelem.attrib))
        print("Printing subelement value: " + str(subelem.text))
        print(subelem.get('name'))
# Elements can be indexed like nested sequences: in root[x][y], x selects a
# child of the root and y selects one of that child's own children.
print(root[0][0].text)
print(root[0][1].text)
print(root[0][2].text)
print(root[0][0].attrib)
print(root[0][1].attrib)
print(root[0][2].attrib)
#############################################
#############################################
import os
import sys
# NOTE: the snippets below were written for Python 2. They now use the
# print() function, avoid Element.getchildren() and the cElementTree module
# (both removed in Python 3.9), and two syntax errors were repaired.

# Insert subelement
from xml.etree import ElementTree
root = ElementTree.parse("sample.xml").getroot()
c = ElementTree.Element("c")
c.text = "3"
root.insert(1, c)
ElementTree.dump(root)
# Remove SubElement
root = ElementTree.parse("sample.xml").getroot()
# Index the element directly instead of calling the removed getchildren().
b = root[1]
root.remove(b)
ElementTree.dump(root)
# Get the text
from xml.etree import ElementTree
tree = ElementTree.parse("sample.xml")
root = tree.getroot()
print(root.find("a").text)
# Get the attribute
from xml.etree import ElementTree
tree = ElementTree.parse("fruits.xml")
item = tree.getroot()[0]
print(item.get("color"))
# Get the root XML element of an 'ElementTree'
from xml.etree import ElementTree
tree = ElementTree.parse("sample.xml")
print(tree.getroot())
# Get all child elements of an XML element
from xml.etree import ElementTree
tree = ElementTree.parse("sample.xml")
root = tree.getroot()
# list(element) is the replacement for the removed getchildren().
children = list(root)
for child in children:
    ElementTree.dump(child)
# Get the tag name of an XML element
from xml.etree import ElementTree
element = ElementTree.Element("a")
print(element.tag)
# Get a descendant of an XML element using indices.
from xml.etree import ElementTree
tree = ElementTree.parse("fruits.xml")
root = tree.getroot()
print(root[0][1].text)
# Get the key/value pairs of the attrib dictionary (XML)
from xml.etree import ElementTree
tree = ElementTree.parse("fruits.xml")
item = tree.getroot()[0]
print(item.keys())
print(item.items())
# Retrieve the tag of an XML element with a namespace
from xml.etree import ElementTree
root = ElementTree.parse("namespaces.xml").getroot()
ElementTree.dump(root)
print(root.tag)
# Add element as last child of another XML element
from xml.etree import ElementTree
root = ElementTree.parse("sample.xml").getroot()
c = ElementTree.Element("c")
c.text = "3"
root.append(c)
ElementTree.dump(root)
# Load XML from string
from xml.etree import ElementTree
root = ElementTree.XML("<root><a>1</a></root>")
ElementTree.dump(root)
# Load XML from a string into an Element
from xml.etree import ElementTree
root = ElementTree.fromstring("<root><a>1</a></root>")
ElementTree.dump(root)
# Load XML file into ElementTree
from xml.etree import ElementTree
tree = ElementTree.parse("sample.xml")
ElementTree.dump(tree)
# Load XML file into an ElementTree more efficiently
# The separate cElementTree module no longer exists; the plain ElementTree
# module is used here in its place.
from xml.etree import ElementTree
tree = ElementTree.parse("sample.xml")
ElementTree.dump(tree)
# Clear an XML element
from xml.etree import ElementTree
root = ElementTree.parse("fruits.xml").getroot()
root.clear()
ElementTree.dump(root)
# Find all subelements of an xml element that match a condition
from xml.etree import ElementTree
root = ElementTree.parse("fruits.xml").getroot()
for item in root.findall("item"):
    ElementTree.dump(item)
# Find the first subelement of an xml element that matches a condition
from xml.etree import ElementTree
root = ElementTree.parse("fruits.xml").getroot()
name = root.find("item/name")
ElementTree.dump(name)
# Find all children of an XML element that match a tag
from xml.etree import ElementTree
root = ElementTree.parse("fruits.xml").getroot()
for name in root.iter("name"):
    ElementTree.dump(name)
# Find all descendants of an XML element that match a tag
from xml.etree import ElementTree
root = ElementTree.parse("fruits.xml").getroot()
for name in root.iter("name"):
    ElementTree.dump(name)
# Set attributes of an XML element
from xml.etree import ElementTree
item = ElementTree.Element("item")
item.set("color", "red")
item.set("flavor", "sweet")
ElementTree.dump(item)
# Check whether an object is an xml element
from xml.etree import ElementTree
tree = ElementTree.parse("sample.xml")
root = tree.getroot()
a = root[0]
print(ElementTree.iselement(root))
print(ElementTree.iselement(a))
print(ElementTree.iselement(tree))
print(ElementTree.iselement(1))
# Generate a string representation of an XML element
from xml.etree import ElementTree
a = ElementTree.Element("a")
a.text = "1"
print(ElementTree.tostring(a))
# Efficient ways to construct XML elements
# (formerly via cElementTree, which no longer exists as a separate module)
from xml.etree import ElementTree
a = ElementTree.Element("a")
a.text = "1"
ElementTree.dump(a)
import os
import sys
file_in = sys.argv[1]
from xml.dom import minidom
def prettify(elem):
    """Return a pretty-printed XML string for the Element.

    The element is serialised as UTF-8 with ElementTree, re-parsed with
    minidom and rendered with a one-space indent per level.
    """
    import xml.etree.ElementTree as ET
    from xml.dom import minidom

    serialized = ET.tostring(elem, 'utf-8')
    document = minidom.parseString(serialized)
    return document.toprettyxml(indent=" ")
def indent(elem, level=0):
    """Insert indentation whitespace into *elem* and its subtree, in place.

    Blank (missing or whitespace-only) text and tail values are replaced by
    a line separator followed by one space per nesting level; text or tail
    holding real content is left untouched.

    elem  -- element whose subtree is re-indented.
    level -- nesting depth of *elem*; 0 for the outermost call.
    """
    pad = os.linesep + level * " "

    def is_blank(value):
        return not value or not value.strip()

    if not len(elem):
        # Leaf: only the tail needs padding, and never for the top element.
        if level and is_blank(elem.tail):
            elem.tail = pad
        return

    if is_blank(elem.text):
        elem.text = pad + " "
    if is_blank(elem.tail):
        elem.tail = pad
    last_child = None
    for child in elem:
        indent(child, level + 1)
        last_child = child
    # The last child's tail precedes this element's closing tag, so it is
    # pulled back to this element's own level.
    if is_blank(last_child.tail):
        last_child.tail = pad
try:
import xml.etree.ElementTree as ET
except Exception as e:
print(str(e))
tree = ET.parse(file_in)
root = tree.getroot()
# Per <item>: change the field text, change the 'name' attribute and add an
# extra 'name2' attribute.
for elem in root.iter('item'):
    elem.text = 'new text'
    elem.set('name', 'newitem')
    elem.set('name2', 'newitem2')
# Conditionally changing a field text
for elem in root.iter('item'):
    if elem.attrib['name'] == 'item2':
        elem.text = 'new text'
# Conditionally removing the attributes: assign a fresh, empty mapping
# (any other mapping could be assigned the same way).
for elem in root.iter('item'):
    if elem.text == 'new text':
        elem.attrib = {}
# Write the modified tree with tree.write
tree.write('newitems3.xml')

# Creating XML sub-elements with makeelement() and SubElement().
# Adding an element to the ROOT NODE takes three steps:
# 1. declare the attrib dictionary
attrib = {}
# 2. construct the element
element = root.makeelement('seconditems', attrib)
# 3. append it to root
root.append(element)
# Adding an element below the second child of root
attrib = {'name2': 'secondname2'}
# makeelement only builds the node; it is not attached anywhere here.
subelement = root[0][1].makeelement('seconditem', attrib)
second_group = root[1]
ET.SubElement(second_group, 'seconditem', attrib)
root[1][0].text = 'seconditemabc'
indent(root)
# create a new xml file with the new element
tree.write('newitems_v1.xml')

# Prettifying XML: via the helper function, then with the same steps inline.
print(prettify(root))
reparsed_document = minidom.parseString(ET.tostring(tree.getroot(), 'utf-8'))
print(reparsed_document.toprettyxml(indent=" "))

# Deleting from XML: an attribute, one sub-element, all sub-elements.
# Each variant starts again from a freshly parsed copy of the input.
tree = ET.parse(file_in)
root = tree.getroot()
# removing the name attribute from the first node below root's first child
root[0][0].attrib.pop('name', None)
tree.write('newitems_v2_del.xml')

tree = ET.parse(file_in)
root = tree.getroot()
first_group = root[0]
first_group.remove(first_group[0])
tree.write('newitems_v2_sub.xml')
# write with declaration
tree.write("newitems_v2_sub2.xml", encoding="utf-8", xml_declaration=True)

tree = ET.parse(file_in)
root = tree.getroot()
root[0].clear()
tree.write('newitems_v2_clr.xml')