ettorerizza
7/2/2017 - 5:05 PM

OpenRefine/Jython SPARQL query (find possible locations and persons in tokens)

OpenRefine/Jython SPARQL query (find possible locations and persons in tokens)

import sys
sys.path.append(r'D:\jython2.7.0\Lib\site-packages')
from SPARQLWrapper import SPARQLWrapper, JSON
from langdetect import detect

# Default (international) DBpedia SPARQL endpoint.
dbpedia_version = "http://dbpedia.org/sparql"

# Sample token, used when running the script by hand.
value = "comptoir"

# Guess the token's language (the author notes this is of little use on
# short tokens).
lang_query = detect(value)

# French and Dutch tokens go to their own DBpedia chapter; every other
# detected language falls back to the international endpoint.
dbpedia = {
    "fr": "http://fr.dbpedia.org/sparql",
    "nl": "http://nl.dbpedia.org/sparql",
}.get(lang_query, "http://dbpedia.org/sparql")


def get_sparql_label(value, dbpedia_version):
    """Look up places and agents whose label matches *value* on DBpedia.

    The query keeps entities that have an rdfs:label or skos:prefLabel
    matching *value* through Virtuoso's ``bif:contains`` full-text search,
    that also have a dbo:abstract, and that are typed dbo:Place or
    dbo:Agent. Label and abstract must both be in the language given by
    the module-level ``lang_query`` variable, which is read at call time.
    At most five rows are returned, best full-text score first.

    Args:
        value: Token to search for; it is rendered with ``"%s"`` formatting.
        dbpedia_version: URL of the SPARQL endpoint to send the query to.

    Returns:
        The object produced by ``sparql.query().convert()`` with the JSON
        return format; callers read ``["results"]["bindings"]`` from it.
    """
    # Render the token exactly as "%s" would, then escape the backslash and
    # the double quote so the token cannot terminate or corrupt the
    # double-quoted SPARQL string literal that wraps the search phrase.
    # NOTE(review): a single quote inside the token still ends the
    # single-quoted full-text phrase early; how Virtuoso expects it to be
    # escaped is not confirmed here.
    term = ("%s" % (value,)).replace("\\", "\\\\").replace('"', '\\"')

    # Use the endpoint supplied by the caller. The previous code replaced it
    # with the global ``dbpedia``, so the argument was silently ignored.
    sparql = SPARQLWrapper(dbpedia_version)
    sparql.setQuery("""
SELECT DISTINCT ?entity ?score1 ?type
    WHERE{
        ?entity ?p ?label.
                ?entity ?q ?abstract.
                Filter langMatches(lang(?label),"%s").
                Filter langMatches(lang(?abstract),"%s").
        ?label <bif:contains> "'%s'" OPTION(score ?score1).
        FILTER (?p=<http://www.w3.org/2000/01/rdf-schema#label> ||
                ?p=<http://www.w3.org/2004/02/skos/core#prefLabel>).
                FILTER (?q=<http://dbpedia.org/ontology/abstract>).
        ?entity a ?type.
        FILTER (?type IN (<http://dbpedia.org/ontology/Place>,
                           <http://dbpedia.org/ontology/Agent>)).
        FILTER isIRI(?entity).
    } ORDER BY desc(?score1) LIMIT 5
""" % (lang_query, lang_query, term))

    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

def _format_bindings(sparql_results):
    """Return one "<type>||<entity>" string per binding of a SPARQL JSON result."""
    return [
        binding["type"]["value"] + "||" + binding["entity"]["value"]
        for binding in sparql_results["results"]["bindings"]
    ]


# First attempt: query the endpoint chosen from the detected language.
results = get_sparql_label(value, dbpedia)
liste = _format_bindings(results)

# Fallback: nothing matched, so retry as a Dutch token on the Dutch chapter.
if not liste:
    # If the first attempt was already Dutch, the retry would send the same
    # query again (only the case of the language tag differs, and
    # langMatches ignores case), so the extra round-trip is skipped.
    already_dutch = lang_query.lower() == "nl"
    dbpedia_version = "http://nl.dbpedia.org/sparql"
    lang_query = "NL"
    if not already_dutch:
        results = get_sparql_label(value, dbpedia_version)
        liste.extend(_format_bindings(results))

print(liste)