#XML #xml #py #python #xmlhandling #xmlparsing
# ##########################################################################
# NAME: CountXml.py
#
# # ------------------------------------------------------------------------
# AUTHOR: P.Doulgeridis
#
# # ------------------------------------------------------------------------
# PACKAGE: Utils
#
# # ------------------------------------------------------------------------
# LOCATION: C:\Users\p.doulgeridis\Desktop\MergingXml\CountXml.py
# PRODUCTION LOCATION: C:\stratik\COMPLETEDTHALIS
# # ------------------------------------------------------------------------
# INPUT: Folder Name *(No slash)
#
# # ------------------------------------------------------------------------
# OUTPUT: CountTotal.txt (Contains analytical count of files)
#
# # ------------------------------------------------------------------------
# USAGE: python CountXml.py InputXml
#
#
# # ------------------------------------------------------------------------
# FUNCTION: Parses xml files only found in folder parameter,
# counts total amount of records under parent node,
# and reports in new file.
# # ------------------------------------------------------------------------
# NOTES: Possible encoding problems due to unicode escapes.
# Ended up using lxml instead of ElementTree as it handles encoding
# better.
#
# lxml is another module for parsing xml files with python.
# It uses a similar interface and has inter-changeable commands.
# We can use:
#
# from xml.etree import ElementTree
# from lxml import etree as ElementTree
#
#
# # ------------------------------------------------------------------------
# REVISION HISTORY:
#
# v1: 20/07/2019
# v2: 21/07/2019
# - added lxml call instead of ElementTree (Solved encoding error)
# v3: 22/07/2019
# - added output to file "CountTotal.txt"
#
# # ------------------------------------------------------------------------
# TO DO:
#
#
# ############################################################################
# import glob
# from xml.etree import ElementTree
import os
import sys
def newRunRun(folder):
    """
    Function: newRunRun()
    Description: Counts the direct children of the root element of every
                 *.xml file found in a folder and writes a per-file report.
    Input: folder <string> - directory to scan (no trailing slash)
    Output: None. Writes "CountTotal.txt" in the current working directory
            with one "<path>,<count>" line per file, and prints progress,
            the per-file counts and the grand total to stdout.
    Usage: newRunRun("InputXml")
    Notes: lxml is preferred; xml.etree.ElementTree is used as a fallback
           when lxml is not installed. A file that cannot be parsed is
           reported and keeps a count of 0 instead of aborting the run.
    """
    import glob
    try:
        from lxml import etree as ElementTree
    except ImportError:
        # Same parse()/getroot() interface, available everywhere.
        from xml.etree import ElementTree
        print("Checking: lxml not available, using xml.etree.ElementTree")

    xml_files = []
    try:
        print("Checking: For xml files in folder: " + str(folder))
        # Sorted so the report order does not depend on directory order.
        xml_files = sorted(glob.glob(folder + "/*.xml"))
    except Exception:
        print("Failed to glob xml files. Possible bad regex")
    else:
        print("Checking: Xml files globbed successfully. Counting...")
        print("Xml files located in directory: " + str(len(xml_files)))

    lengths = {}
    bad_files = []
    for xml_file in xml_files:
        print("Processing file: " + str(xml_file))
        # Default that stays in the report if the file cannot be parsed.
        lengths[xml_file] = 0
        try:
            root = ElementTree.parse(xml_file).getroot()
        except (ElementTree.ParseError, EnvironmentError) as err:
            print("ATTENTION: Failed to parse file: " + str(xml_file)
                  + " (" + str(err) + ")")
            bad_files.append(xml_file)
        else:
            lengths[xml_file] = len(root)

    print("\n@@Performing integrity check: ")
    total = 0
    # The context manager closes the report even if a write fails.
    with open('CountTotal.txt', 'w') as outfile:
        for xml_file in xml_files:
            count = lengths[xml_file]
            print("File: " + str(xml_file) + " with total: " + str(count))
            outfile.write(str(xml_file) + "," + str(count) + "\n")
            total += count
    print("Checking: Total number of calculated entries: " + str(total) + "\n")

    for bad_file in bad_files:
        print("File: " + str(bad_file) + " failed to parse.")
# Script entry point: count the xml files of the folder named on the
# command line. Guarded so that importing this module does not start a run,
# and so a missing argument yields a usage hint instead of an IndexError.
lengths = {}
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("USAGE: python CountXml.py InputXml")
        sys.exit(1)
    folder = sys.argv[1]
    newRunRun(folder)
# ##########################################################################
# NAME: CountXml.py
#
# # ------------------------------------------------------------------------
# AUTHOR: P.Doulgeridis
#
# # ------------------------------------------------------------------------
# PACKAGE: Utils
#
# # ------------------------------------------------------------------------
# LOCATION: C:\Users\p.doulgeridis\Desktop\MergingXml\MergingXml4.py
# PRODUCTION LOCATION: C:\stratik\COMPLETEDTHALIS
# CALLED BY: MergeXml.bat
# # ------------------------------------------------------------------------
# INPUT: Folder Name *(No slash) ie InputXml
#
# # ------------------------------------------------------------------------
# OUTPUT: MergeXmlLog.20190730-152452.log (Contains analytical count of files)
#
# # ------------------------------------------------------------------------
# USAGE: python MergingXml4.py InputXml
# or
# MergeXml.bat
# # ------------------------------------------------------------------------
# FUNCTION: Parses xml files only found in folder parameter,
# counts total amount of records under parent node,
# and reports in new file, then it uses the first
# parent node it encounters and appends to it every other
# root node. This is used to concatenate xml files under one root node.
# # ------------------------------------------------------------------------
# NOTES: Possible encoding problems due to unicode escapes.
# Ended up using lxml instead of ElementTree as it handles encoding
# better.
#
# lxml is another module for parsing xml files with python.
# It uses a similar interface and has inter-changeable commands.
# We can use:
#
# from xml.etree import ElementTree
# from lxml import etree as ElementTree
#
#
# # ------------------------------------------------------------------------
# REVISION HISTORY:
#
# v1: 20/07/2019
# v2: 21/07/2019
# - added lxml call instead of ElementTree (Solved encoding error)
# v3: 22/07/2019
# - added output to file "MergeXmlLog.20190730-152452.log"
# v4: 23/07/2019
# - Fixed output name to be the name of the last file read.
#
# # ------------------------------------------------------------------------
# TO DO:
#
#
# ############################################################################
# import glob
# from xml.etree import ElementTree
import os
import sys
def getLineInfo():
    """
    Function: getLineInfo()
    Description: Prints the file name, line number and function name of
                 the code that called this helper.
    Input: None
    Output: None. One line is printed to stdout.
    Usage: getLineInfo()
    """
    import inspect
    # Record 0 describes getLineInfo itself; record 1 is its caller.
    caller_record = inspect.stack()[1]
    source_file = caller_record[1]
    line_number = caller_record[2]
    function_name = caller_record[3]
    print(source_file, ":", line_number, ":", function_name)
def autologfile(message, filename):
    """
    Function: autologfile()
    Description: Appends a message to a log file, tagged with the script,
                 function name and line number of the calling code.
    Input: message <string>  - text to record
           filename <string> - path of the log file (opened in append mode)
    Output: None. One entry is appended to the log file; nothing is
            printed to stdout.
    Usage: autologfile("This is a test message", "run.log")
    Notes: The previous frame on the stack is inspected, otherwise the
           entry would describe this function instead of its caller.
    """
    import inspect
    caller = inspect.currentframe().f_back
    code = caller.f_code
    # f_lineno is the line of the call itself; co_firstlineno would only
    # give the line where the calling function is defined.
    with open(filename, 'a') as f:
        f.write("\n[[Script: %s]]:[Method: %s]: Line:%i -> : %s" % (
            code.co_filename,
            code.co_name,
            caller.f_lineno,
            message
        ))
def autologprint(message, filename):
    """
    Function: autologprint()
    Description: Prints a message to stdout and appends it to a log file,
                 tagged with the script, function name and line number of
                 the calling code.
    Input: message <string>  - text to print and record
           filename <string> - path of the log file (opened in append mode)
    Output: None. The message is printed and one entry is appended to the
            log file.
    Usage: autologprint("This is a test message", "run.log")
    Notes: The previous frame on the stack is inspected, otherwise the
           entry would describe this function instead of its caller.
    """
    import inspect
    caller = inspect.currentframe().f_back
    code = caller.f_code
    print(str(message))
    # f_lineno is the line of the call itself; co_firstlineno would only
    # give the line where the calling function is defined.
    with open(filename, 'a') as f:
        f.write("\n[[Script: %s]]:[Method: %s]: Line:%i -> : %s" % (
            code.co_filename,
            code.co_name,
            caller.f_lineno,
            message
        ))
def newRunRun(folder):
    """
    Function: newRunRun()
    Description: Merges every *.xml file of a folder under a single root
                 node. The root of the first file that parses is kept and
                 the children of every other root are appended to it.
    Input: folder <string> - directory to scan (no trailing slash)
    Output: True on completion. Writes "NEW3.xmlout5.xml" into the folder
            (UTF-16 with BOM, no XML declaration) and a log file
            "MergeXmlLog.<timestamp>.log" in the current working directory.
            Exits the process with code 3 when nothing can be written.
    Usage: newRunRun("InputXml")
    Notes: The output file also matches "*.xml", so it is excluded from
           the inputs; otherwise a second run would merge the previous
           result back in and duplicate its records.
    """
    import glob
    import time
    from xml.etree import ElementTree
    #from lxml import etree as ElementTree

    timestr = time.strftime("%Y%m%d-%H%M%S")
    print(timestr)
    log_name = "MergeXmlLog." + str(timestr) + ".log"
    file_ot = os.path.join(folder, "NEW3.xmlout5.xml")
    output_key = os.path.normcase(os.path.abspath(file_ot))

    bad_files = []
    xml_files = []
    try:
        autologprint("Checking: For xml files in folder: " + str(folder), log_name)
        # Sorted so the base root and the record order are deterministic;
        # the output of an earlier run is never treated as an input.
        xml_files = sorted(
            path for path in glob.glob(folder + "/*.xml")
            if os.path.normcase(os.path.abspath(path)) != output_key
        )
    except Exception:
        autologprint("Failed to glob xml files. Possible bad regex", log_name)
    else:
        autologprint("Checking: Xml files globbed successfully. Counting...", log_name)
        print("Xml files located in directory: " + str(len(xml_files)))

    node = None
    lengths = {}
    for xml_file in xml_files:
        autologfile("Processing file: " + str(xml_file), log_name)
        print("Processing file: " + str(xml_file))
        # Default that stays in the report if the file cannot be parsed.
        lengths[xml_file] = 0
        try:
            root = ElementTree.parse(xml_file).getroot()
        except Exception as err:
            # Best effort: one bad file must not stop the merge, but the
            # reason is now recorded in the log.
            print("ATTENTION: Failed to parse file: " + str(xml_file))
            autologfile("ATTENTION: Failed to parse file: " + str(xml_file)
                        + " (" + str(err) + ")", log_name)
            bad_files.append(xml_file)
            continue
        autologfile("Successfully parsed file: " + str(xml_file), log_name)
        lengths[xml_file] = len(root)
        if node is None:
            node = root
        else:
            # Snapshot the children first so the source root is not
            # iterated while its elements are being re-parented.
            node.extend(list(root))

    if node is None:
        # Nothing parsed: report it explicitly, before an empty output file
        # is created, and keep the exit code used for write failures.
        autologprint("ERROR: No xml file could be parsed, nothing to merge.", log_name)
        print("Failed to write.")
        sys.exit(3)

    print("Output file: " + file_ot)
    # Binary mode: the payload is already UTF-16 encoded bytes. Text mode
    # would translate newline bytes on Windows and, on Python 3, str() of
    # the bytes would write their repr instead of the document.
    # NOTE(review): the author's notes suggest the downstream consumer (SAP)
    # is sensitive to encoding and XML declaration, so the format itself
    # (UTF-16 with BOM, no declaration) is left unchanged - confirm.
    with open(file_ot, 'wb') as f:
        try:
            f.write(ElementTree.tostring(node, encoding="utf-8", method="xml")
                    .decode('utf-8').encode('utf-16'))
        except Exception as err:
            autologfile("Failed to write: " + str(err), log_name)
            print("Failed to write.")
            sys.exit(3)
        else:
            print("Writing Successful")

    print("\n@@Performing integrity check: ")
    total = 0
    for xml_file in xml_files:
        # autologprint already echoes to stdout, so no separate print.
        autologprint("File: " + str(xml_file) + " with total: "
                     + str(lengths[xml_file]), log_name)
        total += lengths[xml_file]
    autologprint("Checking: Total number of calculated entries: " + str(total), log_name)
    autologprint("Checking: Expected number of entries: " + str(len(node)), log_name)
    if total != len(node):
        autologprint("Error : When matching produced totals", log_name)
    else:
        autologprint("Success: Produced totals match. ", log_name)

    print("Reporting on files that failed to parse: ")
    for bad_file in bad_files:
        autologprint("File: " + str(bad_file) + " failed to parse.", log_name)
    return True
# Script entry point: merge the xml files of the folder named on the
# command line. Stops early when the argument or the directory is missing
# instead of reporting the problem and running the merge anyway.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("USAGE: python MergingXml4.py InputXml")
        sys.exit(1)
    folder = sys.argv[1]
    if os.path.exists(folder):
        print("Input Directory located.")
    else:
        print("ERROR: Failed to detect input directory.")
        sys.exit(2)
    newRunRun(folder)
# try:
# newRunRun(folder)
# except Exception as e:
# print("Failure on function: newRunRun")
# import glob
# from xml.etree import ElementTree
import os
import sys
def newRunRun(folder):
    """
    Function: newRunRun()
    Description: Counts the direct children of the root element of every
                 *.xml file found in a folder and writes a per-file report.
    Input: folder <string> - directory to scan (no trailing slash)
    Output: None. Writes "CountTotal.txt" in the current working directory
            with one "<path>,<count>" line per file, and prints progress,
            the per-file counts and the grand total to stdout.
    Usage: newRunRun("InputXml")
    Notes: lxml is preferred; xml.etree.ElementTree is used as a fallback
           when lxml is not installed. A file that cannot be parsed is
           reported and keeps a count of 0 instead of aborting the run.
    """
    import glob
    try:
        from lxml import etree as ElementTree
    except ImportError:
        # Same parse()/getroot() interface, available everywhere.
        from xml.etree import ElementTree
        print("Checking: lxml not available, using xml.etree.ElementTree")

    xml_files = []
    try:
        print("Checking: For xml files in folder: " + str(folder))
        # Sorted so the report order does not depend on directory order.
        xml_files = sorted(glob.glob(folder + "/*.xml"))
    except Exception:
        print("Failed to glob xml files. Possible bad regex")
    else:
        print("Checking: Xml files globbed successfully. Counting...")
        print("Xml files located in directory: " + str(len(xml_files)))

    lengths = {}
    bad_files = []
    for xml_file in xml_files:
        print("Processing file: " + str(xml_file))
        # Default that stays in the report if the file cannot be parsed.
        lengths[xml_file] = 0
        try:
            root = ElementTree.parse(xml_file).getroot()
        except (ElementTree.ParseError, EnvironmentError) as err:
            print("ATTENTION: Failed to parse file: " + str(xml_file)
                  + " (" + str(err) + ")")
            bad_files.append(xml_file)
        else:
            lengths[xml_file] = len(root)

    print("\n@@Performing integrity check: ")
    total = 0
    # The context manager closes the report even if a write fails.
    with open('CountTotal.txt', 'w') as outfile:
        for xml_file in xml_files:
            count = lengths[xml_file]
            print("File: " + str(xml_file) + " with total: " + str(count))
            outfile.write(str(xml_file) + "," + str(count) + "\n")
            total += count
    print("Checking: Total number of calculated entries: " + str(total) + "\n")

    for bad_file in bad_files:
        print("File: " + str(bad_file) + " failed to parse.")
# Script entry point: count the xml files of the folder named on the
# command line. Guarded so that importing this module does not start a run,
# and so a missing argument yields a usage hint instead of an IndexError.
lengths = {}
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("USAGE: python CountXml.py InputXml")
        sys.exit(1)
    folder = sys.argv[1]
    newRunRun(folder)