onlyforbopi
7/31/2019 - 6:23 AM

Python.XML.ReadyApps

#XML #xml #py #python #xmlhandling #xmlparsing

# ##########################################################################
# NAME: CountXml.py
#
# # ------------------------------------------------------------------------
# AUTHOR: P.Doulgeridis
#
# # ------------------------------------------------------------------------
# PACKAGE: Utils
#
# # ------------------------------------------------------------------------ 
# LOCATION: C:\Users\p.doulgeridis\Desktop\MergingXml\CountXml.py
# PRODUCTION LOCATION: C:\stratik\COMPLETEDTHALIS
# # ------------------------------------------------------------------------ 
# INPUT: Folder Name *(No slash)
#
# # ------------------------------------------------------------------------ 
# OUTPUT: CountTotal.txt (Contains analytical count of files)
#
# # ------------------------------------------------------------------------
# USAGE: python CountXml.py InputXml
#                                   
#          
# # ------------------------------------------------------------------------
# FUNCTION: Parses xml files only found in folder parameter, 
#           counts total amount of records under parent node, 
#           and reports in new file.
# # ------------------------------------------------------------------------
# NOTES: Possible encoding problems due to unicode escapes. 
#        Ended up using lxml instead of ElementTree as it handles encoding
#        better. 
#          
#        lxml is another module for parsing xml files with python.    
#        It uses a similar interface and has inter-changeable commands. 
#        We can use: 
#
#        from xml.etree import ElementTree
#        from lxml import etree as ElementTree
#
#
# # ------------------------------------------------------------------------ 
# REVISION HISTORY:
#
#       v1: 20/07/2019
#       v2: 21/07/2019
#           - added lxml call instead of ElementTree (Solved encoding error)
#       v3: 22/07/2019
#           - added output to file "CountTotal.txt"
#
# # ------------------------------------------------------------------------
# TO DO:     
#
#
# ############################################################################



# import glob
# from xml.etree import ElementTree



import os
import sys

def _count_root_children(xml_path):
    """Return the number of direct children of the root element of *xml_path*.

    lxml is imported here, on first use, so it is only needed when there is
    actually a file to parse. The script header notes that lxml was chosen
    over xml.etree because it handled the input encodings better.

    Any error raised while importing lxml or parsing the file propagates to
    the caller.
    """
    from lxml import etree as ElementTree

    return len(ElementTree.parse(xml_path).getroot())


def newRunRun(folder):
    """Count the records of every ``*.xml`` file found directly in *folder*.

    A "record" is a direct child of a file's root element. One
    ``<path>,<count>`` line per file is written to ``CountTotal.txt`` in the
    current working directory, and the per-file and grand totals are printed.

    Args:
        folder: Directory to scan (non-recursive).

    Returns:
        None.

    Raises:
        Whatever the XML parser raises for an unreadable or malformed file;
        in that case no report file is written, because parsing finishes
        before the report is opened.
    """
    import glob

    xml_files = []
    try:
        print("Checking: For xml files in folder: " + str(folder))
        # Sorted so the report lists files in a stable order between runs.
        xml_files = sorted(glob.glob(os.path.join(folder, "*.xml")))
    except Exception:
        # Best effort: an unusable folder argument is reported and treated
        # as "no files" instead of aborting the run.
        print("Failed to glob xml files. Possible bad regex")
    else:
        print("Checking: Xml files globbed successfully. Counting...")
        print("Xml files located in directory: " + str(len(xml_files)))

    lengths = {}
    for xmlFile in xml_files:
        print("Processing file: " + str(xmlFile))
        lengths[xmlFile] = _count_root_children(xmlFile)

    total = 0
    # The context manager closes the report even if a write fails.
    with open('CountTotal.txt', 'w') as outfile:
        print("\n@@Performing integrity check: ")
        for item, count in lengths.items():
            print("File: " + str(item) + " with total: " + str(count))
            outfile.write(str(item) + "," + str(count) + "\n")
            total += count

        print("Checking: Total number of calculated entries: " + str(total) + "\n")

# Script entry: the folder to scan is the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an IndexError traceback.
    sys.exit("Usage: python CountXml.py <folder>")
folder = sys.argv[1]
newRunRun(folder)
# ##########################################################################
# NAME: MergingXml4.py
#
# # ------------------------------------------------------------------------
# AUTHOR: P.Doulgeridis
#
# # ------------------------------------------------------------------------
# PACKAGE: Utils
#
# # ------------------------------------------------------------------------ 
# LOCATION: C:\Users\p.doulgeridis\Desktop\MergingXml\MergingXml4.py
# PRODUCTION LOCATION: C:\stratik\COMPLETEDTHALIS
# CALLED BY: MergeXml.bat
# # ------------------------------------------------------------------------ 
# INPUT: Folder Name *(No slash) ie InputXml
#
# # ------------------------------------------------------------------------ 
# OUTPUT: MergeXmlLog.20190730-152452.log (Contains analytical count of files)
#
# # ------------------------------------------------------------------------
# USAGE: python MergingXml4.py InputXml
#        or                           
#        MergeXml.bat  
# # ------------------------------------------------------------------------
# FUNCTION: Parses xml files only found in folder parameter, 
#           counts total amount of records under parent node, 
#           and reports in new file, then it uses the first 
#           parent node it encounters and appends to it every other
#           root node. This is used to concatenate xml files under one root node.

# # ------------------------------------------------------------------------
# NOTES: Possible encoding problems due to unicode escapes. 
#        Ended up using lxml instead of ElementTree as it handles encoding
#        better. 
#          
#        lxml is another module for parsing xml files with python.    
#        It uses a similar interface and has inter-changeable commands. 
#        We can use: 
#
#        from xml.etree import ElementTree
#        from lxml import etree as ElementTree
#
#
# # ------------------------------------------------------------------------ 
# REVISION HISTORY:
#
#       v1: 20/07/2019
#       v2: 21/07/2019
#           - added lxml call instead of ElementTree (Solved encoding error)
#       v3: 22/07/2019
#           - added output to file "MergeXmlLog.20190730-152452.log"
#       v4: 23/07/2019
#           - Fixed output name to be the name of the last file read.
#
# # ------------------------------------------------------------------------
# TO DO:     
#
#
# ############################################################################







# import glob
# from xml.etree import ElementTree

import os
import sys

def getLineInfo():
    """Print the caller's file name, line number and function name.

    The values come from the stack entry one level above this function and
    are printed on one line with ":" between them.
    """
    from inspect import stack

    # Entry 0 is this function; entry 1 is whoever called it.
    caller_record = stack()[1]
    script_path, line_number, function_name = caller_record[1:4]
    print(script_path, ":", line_number, ":", function_name)

def autologfile(message, filename):
    """
    Function: autologfile()
    Description: Appends one record to a log file. The record carries the
                 caller's script path, the caller's function name, the line
                 on which that function's code starts, and the message.
    Input: message  - value to log (rendered with %s)
           filename - path of the log file, opened in append mode
    Output: None. Nothing is printed to stdout.
    Usage: autologfile("This is a test message", "run.log")
    Notes: The caller is read from the previous frame on the stack;
           the current frame would describe this function itself.
           The logged line is the first line of the calling function,
           not the line of the call.
    """
    import inspect

    caller_code = inspect.currentframe().f_back.f_code
    fields = (
        caller_code.co_filename,
        caller_code.co_name,
        caller_code.co_firstlineno,
        message,
    )

    # Each record starts with a newline, so the file begins with an empty line.
    with open(filename, 'a') as log_handle:
        log_handle.write("\n[[Script: %s]]:[Method: %s]: Line:%i -> : %s" % fields)


def autologprint(message, filename):
    """
    Function: autologprint()
    Description: Prints the message to stdout, then appends one record to a
                 log file. The record carries the caller's script path, the
                 caller's function name, the line on which that function's
                 code starts, and the message.
    Input: message  - value to print and log
           filename - path of the log file, opened in append mode
    Output: None. The message goes to stdout and the record to the file.
    Usage: autologprint("This is a test message", "run.log")
    Notes: The caller is read from the previous frame on the stack;
           the current frame would describe this function itself.
           The logged line is the first line of the calling function,
           not the line of the call.
    """
    import inspect

    caller_code = inspect.currentframe().f_back.f_code
    fields = (
        caller_code.co_filename,
        caller_code.co_name,
        caller_code.co_firstlineno,
        message,
    )

    # Console first, then the file, in the same order as before.
    print(str(message))

    with open(filename, 'a') as log_handle:
        log_handle.write("\n[[Script: %s]]:[Method: %s]: Line:%i -> : %s" % fields)




def newRunRun(folder):
    """Merge all ``*.xml`` files of *folder* under the root of the first one.

    Every file is parsed with xml.etree. The root element of the first file
    that parses becomes the merged root, and the direct children of every
    later root are appended to it. The result is written to
    ``<folder>/NEW3.xmlout5.xml`` as UTF-16 bytes. Progress and a per-file
    record count go to ``MergeXmlLog.<timestamp>.log`` in the current
    working directory via ``autologfile`` / ``autologprint``.

    Changes from the previous version:
      * ``print timestr`` was a Python 2 statement and a syntax error on
        Python 3.
      * A file that fails to parse is now skipped. Before, the loop went on
        to use an undefined ``root`` or the previous file's ``root``.
      * The output is written in binary mode. ``str(bytes)`` on Python 3
        wrote the ``b'...'`` representation into a text file.
      * The merged output from an earlier run lives in the input folder and
        matches ``*.xml``; it is now excluded from the inputs.
      * With no parsable input the run ends with an explicit message and
        exit code 3 instead of a misleading "Failed to write.".

    Args:
        folder: Directory holding the XML files (non-recursive).

    Returns:
        True after the merged file and the report have been produced.

    Exits:
        ``sys.exit(3)`` when nothing could be parsed or the output could not
        be written.
    """
    import glob
    import time
    from xml.etree import ElementTree
    #from lxml import etree as ElementTree

    timestr = time.strftime("%Y%m%d-%H%M%S")
    print(timestr)

    log_name = "MergeXmlLog." + str(timestr) + ".log"

    bad_files = []
    xml_files = []

    try:
        autologprint("Checking: For xml files in folder: " + str(folder), log_name)
        xml_files = glob.glob(folder+"/*.xml")
    except Exception:
        # Best effort: an unusable folder argument is reported and treated
        # as "no files".
        autologprint("Failed to glob xml files. Possible bad regex", log_name)
    else:
        autologprint("Checking: Xml files globbed successfully. Counting...", log_name)
        print("Xml files located in directory: " + str(len(xml_files)))

    file_ot = os.path.join(folder, "NEW3.xmlout5.xml")

    # Drop the output of an earlier run so its records are not merged twice.
    output_key = os.path.normcase(os.path.abspath(file_ot))
    xml_files = [
        candidate for candidate in xml_files
        if os.path.normcase(os.path.abspath(candidate)) != output_key
    ]

    node = None
    lengths = dict()

    for xmlFile in xml_files:
        autologfile("Processing file: " + str(xmlFile), log_name)
        print("Processing file: " + str(xmlFile))
        # A file that fails to parse stays in the report with 0 records.
        lengths[xmlFile] = 0
        try:
            root = ElementTree.parse(xmlFile).getroot()
        except Exception as exc:
            # Any failure on one file (malformed XML, encoding, I/O) must
            # not stop the merge of the remaining files.
            print("ATTENTION: Failed to parse file: " + str(xmlFile))
            autologfile("ATTENTION: Failed to parse file: " + str(xmlFile), log_name)
            autologfile("Reason: " + str(exc), log_name)
            bad_files.append(xmlFile)
            continue

        autologfile("Successfully parsed file: " + str(xmlFile), log_name)
        lengths[xmlFile] = len(root)

        if node is None:
            # The first parsed root becomes the parent of everything else.
            node = root
        else:
            # Snapshot the children first: with lxml, append() moves an
            # element out of its old parent while it is being iterated.
            node.extend(list(root))

    if node is None:
        autologprint("ERROR: No xml file could be parsed; nothing to merge.", log_name)
    else:
        print("Output file: " + file_ot)

        # The merged tree is serialized as UTF-8 and re-encoded as UTF-16.
        # NOTE(review): the original author's notes (transliterated Greek)
        # say this variant "works but emits a title", and that other
        # utf-16 / utf-16-le / utf-16-be variants either failed or broke the
        # SAP import. "Title" presumably means the BOM or XML declaration;
        # confirm with the consumer before changing the encoding steps.
        try:
            payload = ElementTree.tostring(
                node, encoding="utf-8", method="xml"
            ).decode('utf-8').encode('utf-16')
            with open(file_ot, 'wb') as f:
                f.write(payload)
        except Exception as exc:
            print("Failed to write.")
            autologfile("Failed to write output file: " + str(exc), log_name)
            sys.exit(3)
        else:
            print("Writing Successful")

        print("\n@@Performing integrity check: ")
        total = 0
        for item, count in lengths.items():
            # autologprint already echoes to stdout; no separate print.
            autologprint("File: " + str(item) + " with total: " + str(count), log_name)
            total += count

        autologprint("Checking: Total number of calculated entries: " + str(total), log_name)
        autologprint("Checking: Expected number of entries: " + str(len(node)), log_name)

        # The merged root must hold exactly the sum of the per-file counts.
        if total != len(node):
            autologprint("Error : When matching produced totals", log_name)
        else:
            autologprint("Success: Produced totals match. ", log_name)

    print("Reporting on files that failed to parse: ")
    for badfile in bad_files:
        autologprint("File: " + str(badfile) + " failed to parse.", log_name)

    if node is None:
        # Same exit code the run used to end with when nothing was merged.
        sys.exit(3)

    return True


if __name__ == '__main__':

    # The input folder is the only command-line argument.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("Usage: python MergingXml4.py <folder>")

    folder = sys.argv[1]

    if os.path.isdir(folder):
        print("Input Directory located.")
    else:
        print("ERROR: Failed to detect input directory.")
        # Stop here: without the directory there is nothing to merge and
        # the output file could not be created inside it.
        sys.exit(1)

    newRunRun(folder)
        
    # try:    
        # newRunRun(folder)
    # except Exception as e:
        # print("Failure on function: newRunRun")
# import glob
# from xml.etree import ElementTree

import os
import sys

def _count_root_children(xml_path):
    """Return the number of direct children of the root element of *xml_path*.

    lxml is imported here, on first use, so it is only needed when there is
    actually a file to parse.

    Any error raised while importing lxml or parsing the file propagates to
    the caller.
    """
    from lxml import etree as ElementTree

    return len(ElementTree.parse(xml_path).getroot())


def newRunRun(folder):
    """Count the records of every ``*.xml`` file found directly in *folder*.

    A "record" is a direct child of a file's root element. One
    ``<path>,<count>`` line per file is written to ``CountTotal.txt`` in the
    current working directory, and the per-file and grand totals are printed.

    Args:
        folder: Directory to scan (non-recursive).

    Returns:
        None.

    Raises:
        Whatever the XML parser raises for an unreadable or malformed file;
        in that case no report file is written, because parsing finishes
        before the report is opened.
    """
    import glob

    xml_files = []
    try:
        print("Checking: For xml files in folder: " + str(folder))
        # Sorted so the report lists files in a stable order between runs.
        xml_files = sorted(glob.glob(os.path.join(folder, "*.xml")))
    except Exception:
        # Best effort: an unusable folder argument is reported and treated
        # as "no files" instead of aborting the run.
        print("Failed to glob xml files. Possible bad regex")
    else:
        print("Checking: Xml files globbed successfully. Counting...")
        print("Xml files located in directory: " + str(len(xml_files)))

    lengths = {}
    for xmlFile in xml_files:
        print("Processing file: " + str(xmlFile))
        lengths[xmlFile] = _count_root_children(xmlFile)

    total = 0
    # The context manager closes the report even if a write fails.
    with open('CountTotal.txt', 'w') as outfile:
        print("\n@@Performing integrity check: ")
        for item, count in lengths.items():
            print("File: " + str(item) + " with total: " + str(count))
            outfile.write(str(item) + "," + str(count) + "\n")
            total += count

        print("Checking: Total number of calculated entries: " + str(total) + "\n")

# Script entry: the folder to scan is the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an IndexError traceback.
    script_name = sys.argv[0] if sys.argv else "script.py"
    sys.exit("Usage: python " + script_name + " <folder>")
folder = sys.argv[1]
newRunRun(folder)