ettorerizza
7/9/2017 - 2:06 PM

Naive Jython method for detecting the names of Belgian municipalities in OpenRefine, based on a gazetteer

Naive Jython method for detecting the names of Belgian municipalities in OpenRefine, based on a gazetteer

import sys
sys.path.append(r'D:\jython2.7.0\Lib\site-packages')
from unidecode import unidecode

#TEST
# Sample input used when the script is run on its own.
# NOTE(review): presumably stands in for the cell `value` that OpenRefine
# hands to a Jython expression -- confirm before pasting into OpenRefine.
value = "carette leuven"

# Gazetteer file; every line of it is taken as one place name.
GAZETTEER_PATH = r"C:\Users\Boulot\Desktop\communes.tsv"

# Linking words used to assemble multi-word names: a token found in this
# list is glued to its neighbours before being looked up in the gazetteer.
joint_locations = ["le", "la", "les", "lez", "saint", "s", "t"]


def normalize(text):
    """Return *text* stripped, lower-cased, de-hyphenated and transliterated.

    The gazetteer entries and the input value must go through the very same
    normalisation, otherwise a hyphenated name in the input can never match
    its space-separated gazetteer form.
    """
    return unidecode(text.strip().lower().replace("-", " "))


def load_gazetteer(path):
    """Read the gazetteer at *path* and return its normalised names as a set.

    ``io.open`` is used instead of the built-in ``open`` because the latter
    rejects the ``encoding`` keyword on Python 2.7 / Jython 2.7.  Blank lines
    are skipped so the empty string never becomes a "place".

    Raises:
        IOError / OSError: if the file cannot be opened.
    """
    import io

    with io.open(path, "r", encoding="utf8") as f:
        # A set makes every later membership test O(1) instead of a scan.
        return {normalize(name) for name in f if name.strip()}


def find_places(tokens, places, joiners=None):
    """Return the place names detected in *tokens*, in order of appearance.

    For each token exactly one candidate is looked up in *places*:

    * the token itself, if it is a known place;
    * otherwise, if the token is a linking word, "token next";
    * otherwise, if the next token is a linking word, "token next next-next".

    Candidates that would need tokens beyond the end of the list are
    dropped.  Each detected name is reported once.

    Args:
        tokens: list of already-normalised words.
        places: container of normalised place names (ideally a set).
        joiners: linking words; defaults to ``joint_locations``.

    Returns:
        list of matched names without duplicates.
    """
    if joiners is None:
        joiners = joint_locations

    found = []
    for i, token in enumerate(tokens):
        if token in places:
            candidate = token
        else:
            # Up to two look-ahead tokens; the slice is simply shorter near
            # the end of the list, so no IndexError handling is required.
            following = tokens[i + 1:i + 3]
            if token in joiners:
                if not following:
                    continue
                candidate = token + " " + following[0]
            elif len(following) == 2 and following[0] in joiners:
                candidate = " ".join([token] + following)
            else:
                continue
        if candidate in places and candidate not in found:
            found.append(candidate)
    return found


if __name__ == "__main__":
    lieux = load_gazetteer(GAZETTEER_PATH)
    # split() without argument collapses runs of whitespace, so no empty
    # tokens are produced.
    valeurs = normalize(value).split()
    liste = find_places(valeurs, lieux)
    print("||".join(liste))