joewiz
3/18/2012 - 7:50 PM

Wrangling plain text with XQuery

Wrangling plain text with XQuery

(: Pipeline: raw text -> indented <line> elements -> sibling <group>s
   -> fully nested <group>s -> nested <list>/<item> markup. :)
let $parsed-lines := local:text-to-lines($text)
let $sibling-groups := local:group-lines($parsed-lines)
let $nested-group := local:process-groups($sibling-groups)
return
    local:groups-to-list($nested-group)
(:~
 : Splits plain text into <line> elements and records each line's indentation.
 :
 : @param $text the raw text, one outline entry per line
 : @return one <line level="N"> element per non-blank line, where N is the
 :         number of leading whitespace characters and the element content is
 :         the line with that leading whitespace removed
 :)
declare function local:text-to-lines($text as xs:string) {
    (: Split on LF or CRLF so a trailing carriage return never ends up in the
       content (or prevents the indentation from being stripped).
       Lines with no non-whitespace character carry no entry and are skipped;
       otherwise a trailing newline would yield a spurious empty level-0 line. :)
    for $line in tokenize($text, '\r?\n')[matches(., '\S')]
    let $content := replace($line, '^\s+', '')
    (: The indentation depth is however many characters the strip removed. :)
    let $level := string-length($line) - string-length($content)
    return
        <line level="{$level}">{$content}</line>
};
<line level="0">The President left at 8:48 am</line>
<line level="1">-Administration recommendations on Capitol Hill</line>
<line level="1">-Improvements</line>
<line level="1">-Richardson’s trip to New York</line>
<line level="1">-Health programs</line>
<line level="2">-Goals</line>
<line level="2">-Problems in present system</line>
<line level="2">-Approach</line>
<line level="2">-Emphasis on quality</line>
<line level="1">-Improvements in United States’ health care</line>
<line level="2">-Maternal deaths</line>
<line level="3">-Rate</line>
<line level="3">-Decline</line>
<line level="3">-United States’ rate compared to other nations</line>
<line level="4">-Reporting system</line>
<line level="2">-Data on health</line>
<line level="3">-Differences in reporting system</line>
<line level="2">-Low-income people</line>
<line level="3">-Whites</line>
<line level="3">-Non-whites</line>
<line level="2">-Mortality rates</line>
<line level="3">-Figures</line>
<line level="1">-Resource allocation</line>
<line level="2">-Rural areas</line>
<line level="3">-Availability of care</line>
<line level="2">-Catastrophic care costs</line>
<line level="2">-Prevention</line>
<line level="2">-Problems</line>
<list>
    <item>The President left at 8:48 am
        <list>
            <item>Administration recommendations on Capitol Hill</item>
            <item>Improvements</item>
            <item>Richardson’s trip to New York</item>
            <item>Health programs
                <list>
                    <item>Goals</item>
                    <item>Problems in present system</item>
                    <item>Approach</item>
                    <item>Emphasis on quality</item>
                </list>
            </item>
            <item>Improvements in United States’ health care
                <list>
                    <item>Maternal deaths
                        <list>
                            <item>Rate</item>
                            <item>Decline</item>
                            <item>United States’ rate compared to other nations
                                <list>
                                    <item>Reporting system</item>
                                </list>
                            </item>
                        </list>
                    </item>
                    <item>Data on health
                        <list>
                            <item>Differences in reporting system</item>
                        </list>
                    </item>
                    <item>Low-income people
                        <list>
                            <item>Whites</item>
                            <item>Non-whites</item>
                        </list>
                    </item>
                    <item>Mortality rates
                        <list>
                            <item>Figures</item>
                        </list>
                    </item>
                </list>
            </item>
            <item>Resource allocation
                <list>
                    <item>Rural areas
                        <list>
                            <item>Availability of care</item>
                        </list>
                    </item>
                    <item>Catastrophic care costs</item>
                    <item>Prevention</item>
                    <item>Problems</item>
                </list>
            </item>
        </list>
    </item>
</list>
(:~
 : Nests the lines of one or more sibling groups by indentation level.
 :
 : @param $groups one or more <group> elements as produced by local:group-lines
 : @return a single <group>: the nested form of the lone input group, or a
 :         wrapper <group> holding the nested form of every input group
 :)
declare function local:process-groups($groups as element(group)+) {
    if (count($groups) eq 1) then
        local:apply-levels($groups)
    else
        <group>{ for $g in $groups return local:apply-levels($g) }</group>
};

(:~
 : Rebuilds a flat group as a nested one: the first line stays at the top and
 : the remaining lines are regrouped (and, when there are several, recursively
 : nested) beneath it.
 :
 : @param $group a <group> whose direct children are <line> elements
 : @return a <group> holding the first line followed by the nested remainder
 :)
declare function local:apply-levels($group as element(group)) {
    let $head := $group/line[1]
    let $rest := subsequence($group/line, 2)
    let $nested :=
        if (empty($rest)) then
            ()
        else if (count($rest) eq 1) then
            (: A single remaining line needs no further nesting. :)
            local:group-lines($rest)
        else
            <group>{
                for $sub-group in local:group-lines($rest)
                return
                    local:apply-levels($sub-group)
            }</group>
    return
        <group>{$head}{$nested}</group>
};
The President left at 8:48 am
        -Administration recommendations on Capitol Hill
        -Improvements
        -Richardson’s trip to New York
        -Health programs
                -Goals
                -Problems in present system
                -Approach
                -Emphasis on quality
        -Improvements in United States’ health care
                -Maternal deaths
                        -Rate
                        -Decline
                        -United States’ rate compared to other nations
                                -Reporting system
                -Data on health
                        -Differences in reporting system
                -Low-income people
                        -Whites
                        -Non-whites
                -Mortality rates
                        -Figures
        -Resource allocation
                -Rural areas
                        -Availability of care
                -Catastrophic care costs
                -Prevention
                -Problems
(:~
 : Wraps the items generated from a nested group in a top-level <list>.
 :
 : @param $group the nested <group> to convert
 : @return a <list> element containing the generated <item> elements
 :)
declare function local:groups-to-list($group as element(group)) {
    element list { local:inner-groups-to-list($group) }
};

(:~
 : Converts a nested group into <item> elements.
 :
 : Each <line> child becomes an <item> holding the line's text; when the line
 : is followed by a sibling <group>, that group is converted recursively and
 : placed inside the item as a nested <list>. A group with no <line> children
 : is treated as a pure wrapper and its child groups are converted in turn.
 :
 : @param $group the <group> to convert
 : @return a sequence of <item> elements (possibly empty)
 :)
declare function local:inner-groups-to-list($group as element(group)) {
    let $entries := $group/line
    return
        if (empty($entries)) then
            (: wrapper group: no lines of its own, only child groups :)
            for $child in $group/group
            return
                local:inner-groups-to-list($child)
        else
            for $entry in $entries
            let $nested := $entry/following-sibling::group
            return
                <item>{
                    $entry/text()
                    ,
                    if (exists($nested)) then
                        <list>{local:inner-groups-to-list($nested)}</list>
                    else
                        ()
                }</item>
};
(:~
 : Partitions a run of lines into sibling groups. A new group starts at every
 : line whose level equals the level of the first line; each group holds that
 : line plus all lines up to (not including) the next line at the same level.
 :
 : @param $lines one or more <line level="N"> elements in document order
 : @return one <group> per line found at the first line's level, each
 :         containing its slice of the input lines
 :)
declare function local:group-lines($lines as element(line)+) {
    let $level := $lines[1]/@level
    (: Locate the next line at the same level by POSITION. index-of() would
       atomize the nodes and compare their string values, so two lines with
       identical text would produce several positions and break the
       arithmetic below (or split at the wrong line). :)
    let $next-pos :=
        (
        for $i in 2 to count($lines)
        return
            if ($lines[$i]/@level eq $level) then $i else ()
        )[1]
    return
        if (exists($next-pos)) then
            (
            <group>{subsequence($lines, 1, $next-pos - 1)}</group>
            ,
            (: Continue with the remainder, starting at the next sibling. :)
            local:group-lines(subsequence($lines, $next-pos))
            )
        else
            <group>{$lines}</group>
};
<group>
    <line level="0">The President left at 8:48 am</line>
    <group>
        <group>
            <line level="1">-Administration recommendations on Capitol Hill</line>
        </group>
        <group>
            <line level="1">-Improvements</line>
        </group>
        <group>
            <line level="1">-Richardson’s trip to New York</line>
        </group>
        <group>
            <line level="1">-Health programs</line>
            <group>
                <group>
                    <line level="2">-Goals</line>
                </group>
                <group>
                    <line level="2">-Problems in present system</line>
                </group>
                <group>
                    <line level="2">-Approach</line>
                </group>
                <group>
                    <line level="2">-Emphasis on quality</line>
                </group>
            </group>
        </group>
        <group>
            <line level="1">-Improvements in United States’ health care</line>
            <group>
                <group>
                    <line level="2">-Maternal deaths</line>
                    <group>
                        <group>
                            <line level="3">-Rate</line>
                        </group>
                        <group>
                            <line level="3">-Decline</line>
                        </group>
                        <group>
                            <line level="3">-United States’ rate compared to other nations</line>
                            <group>
                                <line level="4">-Reporting system</line>
                            </group>
                        </group>
                    </group>
                </group>
                <group>
                    <line level="2">-Data on health</line>
                    <group>
                        <line level="3">-Differences in reporting system</line>
                    </group>
                </group>
                <group>
                    <line level="2">-Low-income people</line>
                    <group>
                        <group>
                            <line level="3">-Whites</line>
                        </group>
                        <group>
                            <line level="3">-Non-whites</line>
                        </group>
                    </group>
                </group>
                <group>
                    <line level="2">-Mortality rates</line>
                    <group>
                        <line level="3">-Figures</line>
                    </group>
                </group>
            </group>
        </group>
        <group>
            <line level="1">-Resource allocation</line>
            <group>
                <group>
                    <line level="2">-Rural areas</line>
                    <group>
                        <line level="3">-Availability of care</line>
                    </group>
                </group>
                <group>
                    <line level="2">-Catastrophic care costs</line>
                </group>
                <group>
                    <line level="2">-Prevention</line>
                </group>
                <group>
                    <line level="2">-Problems</line>
                </group>
            </group>
        </group>
    </group>
</group>