joewiz
9/25/2014 - 4:56 PM

Generate a Culturomics Bookworm file from FRUS TEI XML data, using XQuery and eXist

Generate a Culturomics Bookworm file from FRUS TEI XML data, using XQuery and eXist

xquery version "3.0";

(: Transform volumes from the FRUS series (TEI XML) into a zip file containing JSON and text files 
   formatted for Bookworm http://bookworm.culturomics.org/. The resulting data can be accessed at: 
     http://static.history.state.gov/temp/frus-all.zip (140 MB)
   A link to the running demo of the resulting data awaits fixes to the bookworm server, 
   but @bmschmidt kindly posted one volume's worth of data at 
     http://benschmidt.org/joewiz/ 
:)

import module namespace frus="http://history.state.gov/xquery/frus" at "xmldb:exist:///db/history/modules/frus.xq";

import module namespace xqjson="http://xqilla.sourceforge.net/lib/xqjson";

declare namespace tei="http://www.tei-c.org/ns/1.0";

(:~
 : Build the zip entry for metadata/field_descriptions.json.
 :
 : Every field-description child of $field-descriptions becomes one JSON object
 : in a top-level array. Each child element of a field-description becomes a
 : name/value pair named after the element:
 :   - an element named "unique" is typed as a JSON boolean;
 :   - an element with child elements is typed as a JSON array holding one
 :     single-pair object per child element;
 :   - anything else is typed as a JSON string.
 :
 : The intermediate XML is the format consumed by xqjson:serialize-json
 : (see https://github.com/joewiz/xqjson), e.g.:
 :
 :     <json type="array">
 :         <item type="object">
 :             <pair name="field"    type="string">date</pair>
 :             <pair name="datatype" type="string">time</pair>
 :             <pair name="type"     type="string">numeric</pair>
 :             <pair name="unique"   type="boolean">true</pair>
 :             <pair name="derived"  type="array">
 :                 <item type="object">
 :                     <pair name="resolution" type="string">month</pair>
 :                 </item>
 :             </pair>
 :         </item>
 :         <item type="object">
 :             <pair name="field"    type="string">persons</pair>
 :             <pair name="datatype" type="string">categorical</pair>
 :             <pair name="type"     type="string">text</pair>
 :             <pair name="unique"   type="boolean">false</pair>
 :         </item>
 :     </json>
 :
 : @param $field-descriptions wrapper element holding field-description children
 : @return an entry element (name, type="text") whose content is the serialized JSON
 :)
declare function local:field-descriptions-entry($field-descriptions as element(field-descriptions)) {
    let $pre-json := 
        <json type="array">
            {
            for $field in $field-descriptions/field-description
            return
                <item type="object">
                    {
                    for $pair in $field/*
                    let $name := name($pair)
                    let $nested := $pair/*
                    let $type := 
                        if ($name = 'unique') then 'boolean'
                        else if (exists($nested)) then 'array'
                        else 'string'
                    return
                        <pair name="{$name}" type="{$type}">{
                            if (exists($nested)) then
                                (: One object per nested element, so several children no longer 
                                   collapse into a single pair with space-joined name and value.
                                   NOTE(review): assumes a multi-valued "derived" should serialize 
                                   as an array of single-pair objects; confirm against Bookworm. :)
                                for $child in $nested
                                return
                                    <item type="object">
                                        <pair name="{name($child)}" type="string">{string($child)}</pair>
                                    </item>
                            else 
                                string($pair)
                        }</pair>
                    }
                </item>
            }
        </json>
    let $json := xqjson:serialize-json($pre-json)
    return
        <entry name="metadata/field_descriptions.json" type="text">{$json}</entry>
};

(:~
 : Compute the text file name for a document div: the name reported by
 : util:document-name with everything from '.xml' onward removed, then an
 : underscore, the div's xml:id, and the '.txt' extension.
 :
 : @param $doc the document div
 : @return the file name as a string
 :)
declare function local:doc-filename($doc as element(tei:div)+) {
    concat(
        substring-before(util:document-name($doc), '.xml'),
        '_',
        $doc/@xml:id,
        '.txt'
    )
};

(:~
 : Build the zip entry for metadata/jsoncatalog.txt: one serialized JSON object
 : per document, joined with newlines.
 :
 : For each document the intermediate XML passed to xqjson:serialize-json looks
 : like this:
 :
 :     <json type="object">
 :         <pair name="filename" type="string">77-80v02d1</pair>
 :         <pair name="date" type="string">1977-01-05</pair>
 :         <pair name="searchstring" type="string">1. Memorandum From ...</pair>
 :         <pair name="series" type="array">
 :             <item type="string">1977-1980</item>
 :         </pair>
 :         <pair name="persons" type="array">
 :             <item type="string">Atherton, Alfred L., Jr.</item>
 :             <item type="string">Ford, Gerald R.</item>
 :         </pair>
 :     </json>
 :
 : @param $docs the document divs to catalog
 : @return an entry element (name, type="text") holding the newline-joined JSON lines
 :)
declare function local:json-catalog-entry($docs as element(tei:div)+) {
    let $date-pattern := '^\d{4}-\d{2}-\d{2}'
    let $hashmaps := 
        for $doc in $docs
        let $filename :=
            (: bookworm docs say filenames shouldn't have .txt in the json-catalog-entry :)
            substring-before(local:doc-filename($doc), '.txt')
        let $date :=
            (: strip out time; use the first @when in the dateline that actually carries a 
               full date, so an undated first tei:date no longer yields an empty date :)
            substring(($doc/tei:dateline/tei:date/@when[matches(., $date-pattern)])[1], 1, 10)
        let $searchstring := 
            (: heading text with notes stripped out :)
            normalize-space(string-join($doc/tei:head//text()[not(ancestor::tei:note)], ' '))
        let $series :=
            (: grab the series from the volume-id for now; the "-YY" part is optional so an 
               id without a year range yields its year instead of the whole filename.
               NOTE(review): assumes single-year volume ids such as frus1948v01 exist; confirm :)
            replace($filename, '^frus(\d{4}(-\d{2})?).*$', '$1')
        let $persons := 
            (: resolve the distinct @corresp pointers on persName elements in this document 
               against the persName entries under the element with id "persons" in the same volume :)
            let $corresp-ids := distinct-values($doc//tei:persName/@corresp) ! substring-after(., '#')
            let $persons-list-entries := root($doc)/id('persons')//tei:persName
            return
                $persons-list-entries[@xml:id = $corresp-ids] ! normalize-space(.)
        let $pre-json :=
            <json type="object">
                <pair name="filename" type="string">{$filename}</pair>
                <pair name="date" type="string">{$date}</pair>
                <pair name="searchstring" type="string">{$searchstring}</pair>
                <pair name="series" type="array">
                    <item type="string">{$series}</item>
                </pair>
                <pair name="persons" type="array">
                    {
                    for $person in $persons
                    return
                        <item type="string">{$person}</item>
                    }
                </pair>
            </json>
        return
            xqjson:serialize-json($pre-json)
    let $json-catalog := string-join($hashmaps, '&#10;')
    return
        <entry name="metadata/jsoncatalog.txt" type="text">{$json-catalog}</entry>
};

(:~
 : Build one zip entry per document under texts/raw/, containing the document's
 : text with headings, datelines, and notes removed.
 :
 : @param $docs the document divs to export
 : @return a sequence of entry elements (name, type="text"), one per document
 :)
declare function local:raw-text-entries($docs as element(tei:div)+) {
    for $doc in $docs
    (: reuse the shared helper so this name always agrees with the one the catalog derives :)
    let $filename := local:doc-filename($doc)
    let $raw-text := 
        (: strip document heading, dateline, and footnotes :)
        string-join($doc//text()[not(ancestor::tei:note | ancestor::tei:head | ancestor::tei:dateline)], ' ') 
    let $clean-up := 
        (: replace each newline plus the whitespace run that follows it with a single newline, 
           dropping the extraneous spaces at the beginning of each line :)
        string-join(tokenize($raw-text, '\n\s+'), '&#10;')
    return
        <entry name="texts/raw/{$filename}" type="text">{$clean-up}</entry>
};

(:~
 : Package the given documents as a zip and stream it to the HTTP response.
 :
 : The zip holds, in order, the field descriptions entry, the JSON catalog entry,
 : and one raw text entry per document. The download is named after the project
 : with a '.zip' extension.
 :
 : @param $docs the document divs to package
 : @param $field-descriptions field definitions passed to local:field-descriptions-entry
 : @param $project-name base name (without extension) of the zip file
 : @return the results of response:set-header followed by response:stream-binary
 :)
declare function local:docs-to-bookworm($docs as element(tei:div)+, $field-descriptions as element(field-descriptions), $project-name as xs:string) {
    let $zip-name := concat($project-name, '.zip')
    let $entries := 
        (
        local:field-descriptions-entry($field-descriptions),
        local:json-catalog-entry($docs),
        local:raw-text-entries($docs)
        )
    let $archive := compression:zip($entries, true())
    let $disposition := concat("attachment; filename=", $zip-name)
    return
        (
        response:set-header("Content-Disposition", $disposition),
        response:stream-binary($archive, 'application/zip', $zip-name)
        )
};

(:~
 : Collect the dated documents from the given volumes and hand them to
 : local:docs-to-bookworm.
 :
 : A div qualifies when it has an xml:id, has type "document", and has at least
 : one dateline date whose @when starts with a full YYYY-MM-DD date.
 :
 : @param $vol-ids volume ids; each is resolved to '<id>.xml' in the frus-volumes collection
 : @param $field-descriptions field definitions forwarded unchanged
 : @param $project-name base name of the resulting zip, forwarded unchanged
 : @return whatever local:docs-to-bookworm returns
 :)
declare function local:vols-to-bookworm($vol-ids as xs:string+, $field-descriptions as element(field-descriptions), $project-name as xs:string) {
    let $docs := 
        for $vol-id in $vol-ids
        let $vol := doc(concat('/db/cms/apps/tei-content/data/frus-volumes/', $vol-id, '.xml'))
        (: only include dated documents; excludes editorial notes and undated documents.
           Each @when is tested individually inside the predicate: passing several @when 
           attributes to matches() at once is a cardinality type error, which would abort 
           the run on any document with more than one dated tei:date. :)
        return
            $vol//tei:div[@xml:id and @type='document' and tei:dateline/tei:date/@when[matches(., '^\d{4}-\d{2}-\d{2}')]] 
    return
        local:docs-to-bookworm($docs, $field-descriptions, $project-name)
};

(: Main query body: describe the catalog fields, ask the frus module for the 
   volume ids, and export everything as a single project named 'frus-all'. :)
let $project-name := 'frus-all'
let $field-descriptions := 
    <field-descriptions>
        <field-description>
            <field>date</field>
            <datatype>time</datatype>
            <type>numeric</type>
            <unique>true</unique>
            <derived>
                <resolution>month</resolution>
            </derived>
        </field-description>
        <field-description>
            <field>searchstring</field>
            <datatype>searchstring</datatype>
            <type>text</type>
            <unique>true</unique>
        </field-description>
        <field-description>
            <field>series</field>
            <datatype>categorical</datatype>
            <type>text</type>
            <unique>false</unique>
        </field-description>
        <field-description>
            <field>persons</field>
            <datatype>categorical</datatype>
            <type>text</type>
            <unique>false</unique>
        </field-description>
    </field-descriptions>
let $vol-ids := frus:fulltext-volumes-in-db()
return 
    local:vols-to-bookworm($vol-ids, $field-descriptions, $project-name)