Dehyphenate text suffering from improper hyphenation, using XQuery
<results>
<result>
<source>face-to-face</source>
<dehyph>face-to-face</dehyph>
</result>
<result>
<source>vis-a-vis</source>
<dehyph>vis-a-vis</dehyph>
</result>
<result>
<source>accom-panied</source>
<dehyph>accompanied</dehyph>
</result>
<result>
<source>con-vinced</source>
<dehyph>convinced</dehyph>
</result>
<result>
<source>confi-dential</source>
<dehyph>confidential</dehyph>
</result>
<result>
<source>es-tablish</source>
<dehyph>establish</dehyph>
</result>
<result>
<source>mili-tary</source>
<dehyph>military</dehyph>
</result>
<result>
<source>condi-tions</source>
<dehyph>conditions</dehyph>
</result>
<result>
<source>ef-fective</source>
<dehyph>effective</dehyph>
</result>
<result>
<source>cease-fire</source>
<dehyph>ceasefire</dehyph>
</result>
</results>
<!--
an eXist-db collection.xconf file that sets a range index on <word> elements in
a dictionary file from https://github.com/marklogic/dictionaries/tree/master/dictionaries
-->
<collection xmlns="http://exist-db.org/collection-config/1.0">
<!-- The spell prefix is bound on <index> so that the qname used in the index definition below resolves. -->
<index xmlns:spell="http://marklogic.com/xdmp/spell">
<!-- NOTE(review): presumably turns off default full-text indexing of elements and attributes
     for this collection; confirm against the eXist-db version in use. -->
<fulltext default="none" attributes="false"/>
<!-- Range index on spell:word elements, with their values indexed as xs:string. -->
<create qname="spell:word" type="xs:string"/>
</index>
</collection>
xquery version "3.0";
(: Functions to dehyphenate a word or a paragraph suffering from improper hyphenation.
Uses a dictionary (a list of known words), such as those available at:
https://github.com/marklogic/dictionaries/tree/master/dictionaries
:)
declare namespace fn="http://www.w3.org/2005/xpath-functions";
declare namespace spell="http://marklogic.com/xdmp/spell";
(:~
 : Strips every hyphen from a single word, but only when the hyphen-free
 : spelling is a known dictionary entry.
 :
 : @param $candidate  the word as found in the text, possibly containing hyphens
 : @param $dictionary one or more spell:word elements whose string values are the known words
 : @return the hyphen-free spelling if it equals some dictionary entry, otherwise
 :         $candidate unchanged
 :)
declare function local:dehyphenate-word($candidate as xs:string, $dictionary as element(spell:word)+) as xs:string {
    let $joined := replace($candidate, '-', '')
    return
        (: The predicate keeps $joined only when it matches a dictionary entry;
           otherwise the sequence collapses to the untouched candidate. :)
        ($joined[. = $dictionary], $candidate)[1]
};
(:~
 : Dehyphenates every hyphenated word found in a paragraph of running text.
 :
 : The paragraph is split with analyze-string(); each matched hyphenated token
 : is passed through local:dehyphenate-word(), all other text is copied
 : verbatim, and the pieces are concatenated back together in order.
 :
 : @param $paragraph  the text to clean up
 : @param $dictionary one or more spell:word elements, handed on to local:dehyphenate-word()
 : @return the reassembled text, or $paragraph itself when it contains no
 :         hyphenated token
 :)
declare function local:dehyphenate-paragraph($paragraph as xs:string, $dictionary as element(spell:word)+) as xs:string* {
    (: NOTE(review): \b is not part of the XPath/XML Schema regular expression
       grammar, so this pattern depends on the processor accepting it; confirm
       on the target engine. :)
    let $parts := analyze-string($paragraph, '\b[A-Za-z]+[a-z]-[a-z-]+\b')/*
    return
        if (empty($parts/self::fn:match)) then
            (: nothing hyphenated: hand the input back untouched :)
            $paragraph
        else
            string-join(
                for $part in $parts
                return
                    typeswitch ($part)
                        case element(fn:non-match) return
                            string($part)
                        default return
                            (: fn:match, a hyphenated token :)
                            local:dehyphenate-word($part, $dictionary)
                , '')
};
(:
let $paragraph :=
'Kissinger said that in connection with our message President Nixon had, from the
very out-set, been prepared to endorse a de facto halt to the escalation of
military operations. He had con-vinced the Israeli Government to do so as well.
The President presumed that our message was accom-panied by a concurrent
proposal to renew a confi-dential, bilateral exchange of views on a political
settlement, i.e., the cease-fire would, as it were, es-tablish a favorable
atmosphere for further efforts to reach a political settlement. However, he
contin-ued, in the view of President Nixon and the Israeli Government, the
information regarding shipments of Soviet missiles of a more advanced type to
Cairo creates a new situation. That situation could well be interpreted as
follows: Israel now agrees to a de facto halt to its air raids, while Nasser
uses this time to make all kinds of improvements in his mili-tary machine, in
particular his air defenses, with no interference whatsoever. Then at some point
chosen by the UAR President himself, military op-erations will again resume, but
now under condi-tions far worse for Israel, because by that time the UAR will
have an air defense system with SAM–3 missiles, which U.S. experts describe as
“fairly ef-fective.” Given this prospect, it is hard to persuade Israel to
totally stop the air raids at this time.'
:)
(: Query body: runs local:dehyphenate-word() over a fixed list of sample words
   and reports each input next to its dehyphenated form. The list mixes
   hyphenated compounds with words split by a stray hyphen. :)
let $words :=
    (
        'face-to-face',
        'vis-a-vis',
        'accom-panied',
        'con-vinced',
        'confi-dential',
        'es-tablish',
        'mili-tary',
        'condi-tions',
        'ef-fective',
        'cease-fire'
    )
(: The dictionary is the set of spell:word elements in the stored word list. :)
let $dictionary := doc('/db/apps/dictionaries/data/large-dictionary.xml')//spell:word
return
    element results {
        for $word in $words
        let $dehyphenated := local:dehyphenate-word($word, $dictionary)
        return
            element result {
                (: Fixed: this previously referenced $source, which is never
                   declared (static error XPST0008); the input is bound to $word. :)
                element source {$word},
                element dehyph {$dehyphenated}
            }
    }