joewiz
8/22/2013 - 9:21 PM

Dehyphenate text suffering from improper hyphenation, using XQuery

Dehyphenate text suffering from improper hyphenation, using XQuery

<results>
    <result>
        <source>face-to-face</source>
        <dehyph>face-to-face</dehyph>
    </result>
    <result>
        <source>vis-a-vis</source>
        <dehyph>vis-a-vis</dehyph>
    </result>
    <result>
        <source>accom-panied</source>
        <dehyph>accompanied</dehyph>
    </result>
    <result>
        <source>con-vinced</source>
        <dehyph>convinced</dehyph>
    </result>
    <result>
        <source>confi-dential</source>
        <dehyph>confidential</dehyph>
    </result>
    <result>
        <source>es-tablish</source>
        <dehyph>establish</dehyph>
    </result>
    <result>
        <source>mili-tary</source>
        <dehyph>military</dehyph>
    </result>
    <result>
        <source>condi-tions</source>
        <dehyph>conditions</dehyph>
    </result>
    <result>
        <source>ef-fective</source>
        <dehyph>effective</dehyph>
    </result>
    <result>
        <source>cease-fire</source>
        <dehyph>ceasefire</dehyph>
    </result>
</results>
<!-- 
    An eXist-db collection.xconf file that defines a range index on <word> elements
    (namespace http://marklogic.com/xdmp/spell) in a dictionary file taken from
    https://github.com/marklogic/dictionaries/tree/master/dictionaries 
-->

<collection xmlns="http://exist-db.org/collection-config/1.0">
    <!-- The spell prefix is bound here so that the qname used in <create> below resolves. -->
    <index xmlns:spell="http://marklogic.com/xdmp/spell">
        <!-- NOTE(review): presumably switches off eXist-db's default full-text indexing
             for this collection; confirm against the eXist-db index documentation. -->
        <fulltext default="none" attributes="false"/>
        
        <!-- Range index configuration: index spell:word elements, with values typed as xs:string -->
        <create qname="spell:word" type="xs:string"/>
        
    </index>
</collection>
xquery version "3.0";

(: Functions to dehyphenate a word or a paragraph suffering from improper hyphenation.
   Uses a dictionary (a list of known words), such as those available at:

     https://github.com/marklogic/dictionaries/tree/master/dictionaries
:)

declare namespace fn="http://www.w3.org/2005/xpath-functions";
declare namespace spell="http://marklogic.com/xdmp/spell";

(: Decides whether the hyphens in a single word should be dropped.

   Every '-' is removed from $candidate; if the resulting string equals at least one
   entry in $dictionary, that hyphen-free string is returned. Otherwise $candidate is
   returned exactly as it was passed in.

   $candidate  : the word as found in the text, possibly containing hyphens
   $dictionary : the known words, one per spell:word element
:)
declare function local:dehyphenate-word($candidate as xs:string, $dictionary as element(spell:word)+) as xs:string {
    let $stripped := replace($candidate, '-', '')
    (: general comparison: true when any dictionary entry equals the stripped word :)
    let $is-known-word := ($stripped = $dictionary)
    return
        if (not($is-known-word)) then
            $candidate
        else
            $stripped
};

(: Dehyphenates every hyphenated word in a block of text.

   The paragraph is split with analyze-string() into stretches that match the
   hyphenated-word pattern and stretches that do not. Matching stretches are passed
   through local:dehyphenate-word; everything else is copied verbatim. The pieces are
   then concatenated back into one string. When nothing in the paragraph matches the
   pattern, the paragraph is returned as-is.

   $paragraph  : the text to clean up
   $dictionary : the known words, one per spell:word element

   NOTE(review): '\b' is not part of the XPath/XQuery regular expression grammar;
   this presumably relies on the target processor accepting it, so confirm before
   running on another engine.
:)
declare function local:dehyphenate-paragraph($paragraph as xs:string, $dictionary as element(spell:word)+) as xs:string* {
    let $hyphenated-word := '\b[A-Za-z]+[a-z]-[a-z-]+\b'
    let $pieces := analyze-string($paragraph, $hyphenated-word)/*
    return 
        if (empty($pieces[self::fn:match])) then
            $paragraph
        else
            string-join(
                for $piece in $pieces
                return
                    if ($piece/self::fn:match) then
                        (: a hyphenated candidate: let the dictionary decide :)
                        local:dehyphenate-word($piece, $dictionary)
                    else
                        $piece/string()
            , '')
};

(:
let $paragraph :=
    'Kissinger said that in connection with our message President Nixon had, from the
    very out-set, been prepared to endorse a de facto halt to the escalation of
    military operations. He had con-vinced the Israeli Government to do so as well.
    The President presumed that our message was accom-panied by a concurrent
    proposal to renew a confi-dential, bilateral exchange of views on a political
    settlement, i.e., the cease-fire would, as it were, es-tablish a favorable
    atmosphere for further efforts to reach a political settlement. However, he
    contin-ued, in the view of President Nixon and the Israeli Government, the
    information regarding shipments of Soviet missiles of a more advanced type to
    Cairo creates a new situation. That situation could well be interpreted as
    follows: Israel now agrees to a de facto halt to its air raids, while Nasser
    uses this time to make all kinds of improvements in his mili-tary machine, in
    particular his air defenses, with no interference whatsoever. Then at some point
    chosen by the UAR President himself, military op-erations will again resume, but
    now under condi-tions far worse for Israel, because by that time the UAR will
    have an air defense system with SAM–3 missiles, which U.S. experts describe as
    “fairly ef-fective.” Given this prospect, it is hard to persuade Israel to
    totally stop the air raids at this time.'
:)
(: Query body: runs each sample word through local:dehyphenate-word and reports the
   input next to the outcome as
   <results><result><source/><dehyph/></result>...</results>. :)
let $words := 
    (
    'face-to-face',
    'vis-a-vis',
    'accom-panied',
    'con-vinced',
    'confi-dential',
    'es-tablish',
    'mili-tary',
    'condi-tions',
    'ef-fective',
    'cease-fire'
    )
(: every spell:word element found in the dictionary document :)
let $dictionary := doc('/db/apps/dictionaries/data/large-dictionary.xml')//spell:word
return
    element results {
        for $word in $words 
        let $dehyphenated := local:dehyphenate-word($word, $dictionary)
        return
            element result {
                (: the word exactly as supplied in $words :)
                element source {$word},
                (: the same word after the dictionary check :)
                element dehyph {$dehyphenated}
            }
    }