joelondon
5/15/2017 - 2:42 PM

xml-to-dataframe - with option to speed up processing by providing structure upfront

xml-to-dataframe - with option to speed up processing by providing structure upfront

# Source: https://hopstat.wordpress.com/2014/01/14/faster-xml-conversion-to-data-frames/
#
# Usage:
# doc = xmlParse("xmlFile.xml")
## `xpath` is an XPath expression selecting the dataset you want. For example, to extract dataset1:
# xmlToDF(doc, xpath = "/export/dataset1")
## Alternatively, set isXML = FALSE and pass the XML file name as a character string; the file is then parsed for you.
# xmlToDF("xmlFile.xml", xpath = "/export/dataset1", isXML = FALSE)
require(XML)
## Convert the XML nodes selected by an XPath expression into a data.frame.
##
## Every node returned by getNodeSet(doc, xpath) becomes one row, and every
## distinct child name seen across those nodes becomes one column. A record
## that does not contain a given field keeps NA in that column.
##
## Arguments:
##   doc      - a parsed XML document, or, when isXML = FALSE, a value that
##              is handed to xmlParse() first (e.g. a file name).
##   xpath    - XPath expression selecting the record nodes.
##   isXML    - TRUE when `doc` is already parsed; FALSE to parse it here.
##   usewhich - TRUE to address rows by integer index (via which()),
##              FALSE to address them with the logical mask directly.
##   verbose  - TRUE to print each field name while it is being extracted.
##
## Returns:
##   A data.frame with one row per matched node and one column per field.
##   When no fields are found the result has zero columns (and one row per
##   matched node, i.e. zero rows when nothing matched).
##
## NOTE: values are paired with records purely by position, so this assumes
## each record contains a given field at most once.
xmlToDF <- function(doc, xpath, isXML = TRUE, usewhich = TRUE, verbose = TRUE) {

    if (!isXML) {
        doc <- xmlParse(doc)
    }

    ## records matched by the XPath expression
    nodeset <- getNodeSet(doc, xpath)

    ## child names per record, and the union of all names in first-seen order
    var.names <- lapply(nodeset, names)
    fields <- unique(unlist(var.names))

    n.records <- length(var.names)
    n.fields <- length(fields)

    ## nothing to extract: return an empty frame instead of failing below
    if (n.fields == 0L) {
        return(data.frame(matrix(NA, nrow = n.records, ncol = 0L)))
    }

    ## extract the values of every field in one XPath query per field
    dl <- lapply(fields, function(x) {
        if (verbose) {
            print(paste0("  ", x))
        }
        xpathSApply(doc, paste0(xpath, "/", x), xmlValue)
    })

    ## logical matrix (records x fields): does record i contain field j?
    ## Built with explicit dimensions so that a single field still yields an
    ## n x 1 matrix (t(sapply(...)) would collapse it to 1 x n).
    name.mat <- matrix(
        unlist(lapply(var.names, function(x) fields %in% x)),
        nrow = n.records, ncol = n.fields, byrow = TRUE
    )

    df <- data.frame(matrix(NA, nrow = n.records, ncol = n.fields))
    names(df) <- fields

    ## fill each column at the rows whose record actually has that field
    for (icol in seq_len(n.fields)) {
        rep.rows <- name.mat[, icol]
        if (usewhich) {
            rep.rows <- which(rep.rows)
        }
        df[rep.rows, icol] <- dl[[icol]]
    }

    df
}