joewiz
10/7/2016 - 10:51 AM

Find FRUS documents whose English dates do not match the date metadata

Find FRUS documents whose English dates do not match the date metadata

<report>
    <doc>
        <vol-id>frus1969-76v17</vol-id>
        <div-id>d142</div-id>
        <supplied-english-date>July11, 1971, midnight–1:40 a.m. and 9:50–10:35 a.m.</supplied-english-date>
        <supplied-when-attribute>1971-07-11T10:35:00</supplied-when-attribute>
        <parsed-english-date/>
        <parsed-iso-date>parseDate had problems with July11, 1971, midnight–1:40 a.m. and 9:50–10:35 a.m.</parsed-iso-date>
        <matches>false</matches>
    </doc>
    <doc>
        <vol-id>frus1969-76v17</vol-id>
        <div-id>d159</div-id>
        <supplied-english-date>October 15, 1971</supplied-english-date>
        <supplied-when-attribute>1971-10-05</supplied-when-attribute>
        <parsed-english-date>October 15, 1971</parsed-english-date>
        <parsed-iso-date>1971-10-15</parsed-iso-date>
        <matches>false</matches>
    </doc>
    <doc>
        <vol-id>frus1969-76v17</vol-id>
        <div-id>d164</div-id>
        <supplied-english-date>November 1971</supplied-english-date>
        <supplied-when-attribute>1971-11</supplied-when-attribute>
        <parsed-english-date/>
        <parsed-iso-date>
            <date resolution="month">
                <range>
                    <start>1971-11-01</start>
                    <end>1971-11-30</end>
                </range>
                <value>1971-11-01</value>
            </date>
        </parsed-iso-date>
        <matches>false</matches>
    </doc>
</report>
xquery version "3.1";

(:
Find cases such as this:
    <dateline>
        <placeName>Washington</placeName>, 
        <date when="1971-10-05">October 15, 1971</date>.
    </dateline>
... where the supplied English date, October 15, 1971, does not match the
supplied machine-readable date, 1971-10-05.
:)

declare namespace tei="http://www.tei-c.org/ns/1.0";

import module namespace dates="http://xqdev.com/dateparser" at "/db/apps/twitter/modules/date-parser.xqm";

(:
 : Main query body.
 :
 : For every tei:dateline that contains a tei:date with a @when attribute,
 : compare the English text of the first such date against its @when value.
 : The English text is parsed with dates:parseDate(); datelines whose parsed
 : date does not agree with @when are collected into a <report> element, one
 : <doc> per mismatch, and the report is stored as /db/report.xml.
 :
 : Returns whatever xmldb:store() returns for the stored report.
 :)
let $vols := 
    doc('/db/apps/frus/volumes/frus1969-76v17.xml')
(:    collection('/db/apps/frus/volumes'):)
let $datelines := $vols//tei:dateline[.//tei:date/@when]
(: "Month D, YYYY" - loop-invariant, so defined once outside the loop. :)
let $english-date-regex := '[A-Z][a-z]+\s+\d{1,2},\s+\d{4}'
let $report := 
    element report {
        for $dateline in $datelines
        (: nearest enclosing div that carries an @xml:id :)
        let $div-id := $dateline/ancestor::tei:div[@xml:id][1]/@xml:id
        let $vol-id := util:document-name($dateline) ! substring-before(., '.xml')
        (: Parenthesised so that [1] selects the first dated tei:date in the
           whole dateline. Without the parentheses the predicate applies per
           parent and can yield several nodes, which makes normalize-space()
           fail outside the try/catch. :)
        let $supplied-english-date := ($dateline//tei:date[@when])[1]
        let $supplied-when-attribute := $supplied-english-date/@when
        let $english-text := normalize-space($supplied-english-date)
        (: Only the first full "Month D, YYYY" match is used, so text that
           contains more than one date still yields a single value to parse. :)
        let $parsed-english-date := (analyze-string($english-text, $english-date-regex)//fn:match)[1]
        let $parsed-iso-date := 
            try { 
                if ($parsed-english-date) then 
                    dates:parseDate($parsed-english-date)/string() 
                else 
                    (: no full date found: let the parser try the raw text;
                       its structured result is kept so it appears in the report :)
                    dates:parseDate($english-text)
            } catch * { 
                "parseDate had problems with " || $english-text
            }
        (: @when may carry a time component; only the date part is compared. :)
        let $when-prefix := substring($supplied-when-attribute, 1, 10)
        (: NOTE(review): the shape <date><range>...</range><value>...</value></date>
           for coarse (e.g. month-level) results is inferred from observed
           report output, not from the parser's source - confirm against
           date-parser.xqm. :)
        let $coarse-value := ($parsed-iso-date[. instance of element()][*:range]/*:value)[1]/string()
        let $matches := 
            if (exists($coarse-value)) then
                (: A coarse result such as "November 1971" agrees with
                   @when="1971-11" when @when is a prefix of the representative
                   value. Comparing against the element's full string value
                   (start + end + value) could never match. :)
                $when-prefix ne '' and starts-with($coarse-value, $when-prefix)
            else
                $when-prefix = $parsed-iso-date
        where not($matches)
        return
            element doc {
                element vol-id { $vol-id },
                element div-id { $div-id/string() },
                element supplied-english-date { $english-text },
                element supplied-when-attribute { $supplied-when-attribute/string() },
                element parsed-english-date { $parsed-english-date/string() },
                element parsed-iso-date { $parsed-iso-date },
                element matches { $matches }
            }
    }
return
    xmldb:store('/db', 'report.xml', $report)