joewiz
8/6/2013 - 9:12 PM

Find the shortest and longest article in a collection of TEI XML articles by word count, and calculate the average word count, using XQuery

Find the shortest and longest article in a collection of TEI XML articles by word count, and calculate the average word count, using XQuery

<results>
    <article-count>162</article-count>
    <shortest-article>1899-1913/DollarDiplo: 172 words</shortest-article>
    <longest-article>1945-1952/KoreanWar2: 1662 words</longest-article>
    <average-length>739 words</average-length>
</results>
xquery version "3.0";

(: Find the shortest and longest article, and the average word count, of a
   collection of TEI XML articles. Returns a <results> element containing
   <article-count> and, when at least one article was found,
   <shortest-article>, <longest-article> and <average-length>. :)

declare namespace tei="http://www.tei-c.org/ns/1.0";

(: In our case, 'articles' are TEI divs that have an @xml:id attribute and no
   descendant divs; the div whose @xml:id is 'foreword' is filtered out since
   it is not a full article. :)
let $milestone-articles := collection('/db/cms/apps/tei-content/data/milestones')//tei:div[@xml:id and not(.//tei:div)][@xml:id ne 'foreword']
let $article-infos := 
    for $article in $milestone-articles
    (: Join the text nodes with a space so words in adjacent elements do not
       fuse together, then normalize whitespace: without normalize-space(),
       leading/trailing whitespace makes tokenize() return zero-length tokens
       that would be counted as words. :)
    let $text := normalize-space(string-join($article//text(), ' '))
    (: tokenize() returns the empty sequence for a zero-length string, so an
       article with no text gets a word count of 0. :)
    let $words := tokenize($text, ' ')
    let $word-count := count($words)
    order by $word-count
    return 
        <article>
            <url>{concat(substring-after(substring-before(base-uri($article), '.xml'), 'milestones/'), '/', $article/@xml:id)}</url>
            <word-count>{$word-count}</word-count>
        </article>
(: $article-infos is sorted by ascending word count, so the first entry is the
   shortest article and the last entry is the longest. :)
let $shortest := $article-infos[1]
let $longest := $article-infos[last()]
return
    element results {
        element article-count { count($milestone-articles) },
        (: Only report statistics when there is something to report: with no
           articles the average would be a division by zero, and the
           shortest/longest entries would be meaningless. :)
        if (exists($article-infos)) then (
            element shortest-article { concat($shortest/url, ': ', $shortest/word-count, ' words') },
            element longest-article { concat($longest/url, ': ', $longest/word-count, ' words') },
            element average-length { concat(round(sum($article-infos/word-count) div count($article-infos)), ' words') }
        ) else ()
    }