ettorerizza
7/8/2017 - 8:44 AM

Naive Jython method to detect potential person names in OpenRefine, based on a list of first names

Naive Jython method to detect potential person names in OpenRefine, based on a list of first names

from unidecode import unidecode

# Reference first names, one per line of the text file. Each entry is stripped
# and lower-cased so it compares equal to the lower-cased tokens of the value.
# Blank lines are skipped so an empty string can never count as a first name,
# and a set is used because the collection is only probed for membership.
with open(r"C:\Users\Boulot\Desktop\prenoms.txt", 'r') as f:
    prenoms = set(name.strip().lower() for name in f if name.strip())

# Characters that survive the clean-up of the input value: letters, hyphen and
# space. Anything else (digits, punctuation, ...) is dropped. Accented capitals
# are not listed here, so they are dropped as well.
CHARS = "abcdefghijklmnopqrstuvwxyzéèàçüûùABCDEFGHIJKLMNOPQRSTUVWXYZ- "

# Linking words: when the word following a first name is one of these, the
# word after it is collected too.
# NOTE(review): the apostrophe is not in CHARS, so "d'" can apparently never
# equal a cleaned token -- confirm whether that entry is still needed.
family_joint = ["d'", "de", "du", "der", "den", "vander", "vanden", "van", "le"]

#TEST
value = "mexico pierre françois van pip test"


def _collect_name_tokens(tokens, first_names, joints):
    """Return the tokens that look like parts of a person's name.

    For every token found in ``first_names`` the token itself and the word
    after it are collected. While the word just collected is in ``joints``,
    the next word is collected as well (at most two extra words), unless it
    was already collected. If the token list ends before that forward scan
    is complete, the word *before* the first name is taken instead, together
    with any joint word found two or three positions before it.

    The result may contain duplicates; the caller removes them. Lists with
    fewer than two tokens yield an empty result.
    """
    found = []
    count = len(tokens)
    if count < 2:
        return found

    for i, token in enumerate(tokens):
        if token not in first_names:
            continue
        found.append(token)

        # Forward scan: the next word is always taken, further words only
        # while the previously taken word is a joint.
        ran_out = i + 1 >= count
        if not ran_out:
            found.append(tokens[i + 1])
            for offset in (2, 3):
                if tokens[i + offset - 1] not in joints:
                    break
                if i + offset >= count:
                    ran_out = True
                    break
                if tokens[i + offset] in found:
                    break
                found.append(tokens[i + offset])

        # Backward fallback, only when the forward scan hit the end of the
        # tokens. Explicit bounds are used because a negative index would
        # silently wrap around to the end of the list instead of failing.
        if ran_out and i >= 1 and tokens[i - 1] not in found:
            # Position 1 puts the word right after the first collected token.
            found.insert(1, tokens[i - 1])
            for back in (2, 3):
                earlier = i - back
                if earlier < 0:
                    continue
                if tokens[earlier] in joints and tokens[earlier] not in found:
                    found.insert(1, tokens[earlier])
    return found


# Keep only the allowed characters, lower-case and transliterate them, then
# split on runs of whitespace. split() without an argument is deliberate:
# a removed character between two spaces (e.g. "pierre & dupont") must not
# leave an empty token that would be mistaken for the following word.
valeurs = "".join(unidecode(c.lower()) for c in value if c in CHARS).split()
liste = _collect_name_tokens(valeurs, prenoms, family_joint)

# De-duplicated list, keeping the first occurrence of each word in order.
seen = set()
unique = []
for word in liste:
    if word not in seen:
        seen.add(word)
        unique.append(word)
liste = unique
print(" ".join(liste))