From aefcc72236fe106f76413efec31ca161980c2f13 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 15 Mar 2023 23:37:48 +0100 Subject: [PATCH] Implement an identifier mapping service (#41) References https://github.com/biopragmatics/bioregistry/issues/686. This pull request implements the identifier mapping service described in [SPARQL-enabled identifier conversion with Identifiers.org](https://pubmed.ncbi.nlm.nih.gov/25638809/). The goal of such a service is to act as an interoperability layer in SPARQL queries that federate data from multiple places and potentially use different IRIs for the same things. --- docs/source/index.rst | 5 + setup.cfg | 3 +- src/curies/mapping_service/__init__.py | 215 ++++++++++++++++++++ src/curies/mapping_service/rdflib_custom.py | 83 ++++++++ tests/test_mapping_service.py | 181 ++++++++++++++++ tests/test_web.py | 4 +- 6 files changed, 488 insertions(+), 3 deletions(-) create mode 100644 src/curies/mapping_service/__init__.py create mode 100644 src/curies/mapping_service/rdflib_custom.py create mode 100644 tests/test_mapping_service.py diff --git a/docs/source/index.rst b/docs/source/index.rst index c69c3fc..2bf9e45 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -83,3 +83,8 @@ for the `records` argument and prefixes can be added one at a time converter = curies.Converter(records=[]) converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:") + +Identifier Mapping Service +-------------------------- +.. 
automodapi:: curies.mapping_service + :no-inheritance-diagram: diff --git a/setup.cfg b/setup.cfg index 2a7468f..f6feefd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -118,7 +118,7 @@ show_missing = True exclude_lines = pragma: no cover raise NotImplementedError - if __name__ == __main__: + if __name__ == "__main__": if TYPE_CHECKING: def __str__ def __repr__ @@ -142,6 +142,7 @@ ignore = S603 W503 # Line break before binary operator (flake8 is wrong) E203 # whitespace before ':' + S113 # Requests call without timeout exclude = .tox, .git, diff --git a/src/curies/mapping_service/__init__.py b/src/curies/mapping_service/__init__.py new file mode 100644 index 0000000..8588572 --- /dev/null +++ b/src/curies/mapping_service/__init__.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- + +"""Identifier mappings service. + +This contains an implementation of the service described in `SPARQL-enabled identifier +conversion with Identifiers.org <https://pubmed.ncbi.nlm.nih.gov/25638809/>`_. +The idea here is that you can write a SPARQL query like the following: + +.. code-block:: sparql + + PREFIX biomodel: + PREFIX bqbio: + PREFIX sbmlrdf: + PREFIX up: + + SELECT DISTINCT ?protein ?protein_domain + WHERE { + # The first part of this query extracts the proteins appearing in an RDF serialization + # of the BioModels database (see https://www.ebi.ac.uk/biomodels/BIOMD0000000372) on + # insulin/glucose feedback. Note that modelers call entities appearing in compartmental + # models "species", and this does not refer to taxa. + biomodel:BIOMD0000000372 sbmlrdf:species/bqbio:isVersionOf ?biomodels_protein . + + # The second part of this query maps BioModels protein IRIs to UniProt protein IRIs + # using service XXX - that's what we're implementing here. + SERVICE { + ?biomodels_protein owl:sameAs ?uniprot_protein. + } + + # The third part of this query gets links between UniProt proteins and their + # domains. 
Since the service maps between the BioModels query, this only gets + # us relevant protein domains to the insulin/glucose model. + SERVICE { + ?uniprot_protein a up:Protein; + up:organism taxon:9606; + rdfs:seeAlso ?protein_domain. + } + } + +The SPARQL endpoint running at the web address XXX takes in the bound values for `?biomodels_protein` +one at a time and dynamically generates triples with `owl:sameAs` as the predicate mapping and other +equivalent IRIs (based on the definition of the converter) as the objects. This allows for gluing +together multiple services that use different URIs for the same entities - in this example, there +are two ways of referring to UniProt Proteins: + +1. The BioModels database example represents a SBML model on insulin-glucose feedback and uses legacy + Identifiers.org URIs for proteins such as http://identifiers.org/uniprot/P01308. +2. The first-party UniProt database uses its own PURLs such as https://purl.uniprot.org/uniprot/P01308. + +.. seealso:: + + - Jerven Bolleman's implementation of this service in Java: https://github.com/JervenBolleman/sparql-identifiers + - Vincent Emonet's `SPARQL endpoint for RDFLib generator `_ +""" + +import itertools as itt +from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Set, Tuple, Union, cast + +from rdflib import OWL, Graph, URIRef + +from .rdflib_custom import JervenSPARQLProcessor # type: ignore +from ..api import Converter + +if TYPE_CHECKING: + import flask + +__all__ = [ + "CURIEServiceGraph", + "get_flask_mapping_blueprint", + "get_flask_mapping_app", +] + + +def _prepare_predicates(predicates: Union[None, str, Collection[str]] = None) -> Set[URIRef]: + if predicates is None: + return {OWL.sameAs} + if isinstance(predicates, str): + return {URIRef(predicates)} + return {URIRef(predicate) for predicate in predicates} + + +class CURIEServiceGraph(Graph): # type:ignore + """A service that implements identifier mapping based on a converter.""" + + converter: Converter + 
predicates: Set[URIRef] + + def __init__( + self, + *args: Any, + converter: Converter, + predicates: Union[None, str, List[str]] = None, + **kwargs: Any, + ) -> None: + """Instantiate the graph. + + :param args: Positional arguments to pass to :meth:`rdflib.Graph.__init__` + :param converter: A converter object + :param predicates: A predicate or set of predicates. If not given, this service + will use `owl:sameAs` as a predicate for mapping IRIs. + :param kwargs: Keyword arguments to pass to :meth:`rdflib.Graph.__init__` + + In the following example, a service graph is instantiated using a small example + converter, then an example SPARQL query is made directly to show how it makes + results: + + .. code-block:: python + + from curies import Converter + from curies.mapping_service import CURIEServiceGraph + + converter = Converter.from_priority_prefix_map( + { + "CHEBI": [ + "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=", + "http://identifiers.org/chebi/", + "http://purl.obolibrary.org/obo/CHEBI_", + ], + "GO": ["http://purl.obolibrary.org/obo/GO_"], + "OBO": ["http://purl.obolibrary.org/obo/"], + ..., + } + ) + graph = CURIEServiceGraph(converter=converter) + + res = graph.query(''' + SELECT ?o WHERE { + VALUES ?s { + + } + ?s owl:sameAs ?o + } + ''') + + + The results of this are: + + ====================================== ================================================= + subject object + -------------------------------------- ------------------------------------------------- + http://purl.obolibrary.org/obo/CHEBI_1 http://purl.obolibrary.org/obo/CHEBI_1 + http://purl.obolibrary.org/obo/CHEBI_1 http://identifiers.org/chebi/1 + http://purl.obolibrary.org/obo/CHEBI_1 https://www.ebi.ac.uk/chebi/searchId.do?chebiId=1 + ====================================== ================================================= + """ + self.converter = converter + self.predicates = _prepare_predicates(predicates) + super().__init__(*args, **kwargs) + + def triples( + self, 
triple: Tuple[URIRef, URIRef, URIRef] + ) -> Iterable[Tuple[URIRef, URIRef, URIRef]]: + """Generate triples, overridden to dynamically generate mappings based on this graph's converter.""" + subj_query, pred_query, obj_query = triple + if pred_query not in self.predicates: + return + if subj_query is None and obj_query is None: + return # can't generate based on this + if subj_query is None and obj_query is not None: + prefix, identifier = self.converter.parse_uri(obj_query) + if prefix is None or identifier is None: + return + subjects = [ + URIRef(sub) + for sub in cast(Collection[str], self.converter.expand_pair_all(prefix, identifier)) + ] + for subj, pred in itt.product(subjects, self.predicates): + yield subj, pred, obj_query + elif subj_query is not None and obj_query is None: + prefix, identifier = self.converter.parse_uri(subj_query) + if prefix is None or identifier is None: + return + objects = [ + URIRef(obj) + for obj in cast(Collection[str], self.converter.expand_pair_all(prefix, identifier)) + ] + for obj, pred in itt.product(objects, self.predicates): + yield subj_query, pred, obj + else: # subj_query is not None and obj_query is not None + return # too much specification? maybe just return one triple then? + + +def get_flask_mapping_blueprint(converter: Converter, **kwargs: Any) -> "flask.Blueprint": + """Get a blueprint for :class:`flask.Flask`. 
+ + :param converter: A converter + :param kwargs: Keyword arguments passed through to :class:`flask.Blueprint` + :return: A blueprint + """ + from flask import Blueprint, Response, request + + blueprint = Blueprint("mapping", __name__, **kwargs) + graph = CURIEServiceGraph(converter=converter) + processor = JervenSPARQLProcessor(graph=graph) + + @blueprint.route("/sparql", methods=["GET", "POST"]) # type:ignore + def serve_sparql() -> "Response": + """Run a SPARQL query and serve the results.""" + sparql = (request.args if request.method == "GET" else request.json).get("query") + if not sparql: + return Response("Missing parameter query", 400) + results = graph.query(sparql, processor=processor).serialize(format="json") + return Response(results) + + return blueprint + + +def get_flask_mapping_app(converter: Converter) -> "flask.Flask": + """Get a Flask app for the mapping service.""" + from flask import Flask + + blueprint = get_flask_mapping_blueprint(converter) + app = Flask(__name__) + app.register_blueprint(blueprint) + return app diff --git a/src/curies/mapping_service/rdflib_custom.py b/src/curies/mapping_service/rdflib_custom.py new file mode 100644 index 0000000..fb8abe3 --- /dev/null +++ b/src/curies/mapping_service/rdflib_custom.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +# type: ignore + +"""A custom SPARQL processor that optimizes the query based on https://github.com/RDFLib/rdflib/pull/2257.""" + +from typing import Union + +from rdflib.plugins.sparql.algebra import translateQuery +from rdflib.plugins.sparql.evaluate import evalQuery +from rdflib.plugins.sparql.parser import parseQuery +from rdflib.plugins.sparql.parserutils import CompValue +from rdflib.plugins.sparql.processor import SPARQLProcessor +from rdflib.plugins.sparql.sparql import Query + +__all__ = ["JervenSPARQLProcessor"] + + +class JervenSPARQLProcessor(SPARQLProcessor): + """A custom SPARQL processor that optimizes the query based on https://github.com/RDFLib/rdflib/pull/2257. 
+ + Why is this necessary? Ideally, we get queries like + + .. code-block:: sparql + + SELECT * WHERE { + VALUES ?s { :a :b ... } + ?s owl:sameAs ?o + } + + This is fine since, given the way that RDFLib parses and constructs an abstract syntax tree, the values + for ``?s`` get bound properly when calling a custom :func:`rdflib.Graph.triples`. However, it's also + valid SPARQL to have the ``VALUES`` clause outside of the ``WHERE`` clause like + + .. code-block:: sparql + + SELECT * WHERE { + ?s owl:sameAs ?o + } + VALUES ?s { :a :b ... } + + Unfortunately, this trips up RDFLib since it doesn't know to bind the values before calling ``triples()``, + therefore thwarting our custom implementation that dynamically generates triples based on the bound values + themselves. + + This processor, originally by Jerven Bolleman in https://github.com/RDFLib/rdflib/pull/2257, + adds some additional logic between parsing + constructing the abstract syntax tree and evaluation + of the syntax tree. Basically, the abstract syntax tree has nodes with two or more children. Jerven's + clever code (see :func:`_optimize_node` below) finds *Join* nodes that have a ``VALUES`` clause in the + second of their two arguments, then flips them around. It does this recursively for the whole tree. + This gets us to the goal of having the ``VALUES`` clauses appear first, therefore making sure that their + bound values are available to the ``triples`` function. 
+ """ + + def query( + self, + query: Union[str, Query], + initBindings=None, # noqa:N803 + initNs=None, # noqa:N803 + base=None, + DEBUG=False, + ): + """Evaluate a SPARQL query on this processor's graph.""" + if isinstance(query, str): + parse_tree = parseQuery(query) + query = translateQuery(parse_tree, base, initNs) + return self.query(query, initBindings=initBindings, base=base) + + query.algebra = _optimize_node(query.algebra) + return evalQuery(self.graph, query, initBindings or {}, base) + + +# From Jerven's PR to RDFLib (https://github.com/RDFLib/rdflib/pull/2257) +def _optimize_node(comp_value: CompValue) -> CompValue: + if ( + comp_value.name == "Join" + and comp_value.p1.name != "ToMultiSet" + and comp_value.p2.name == "ToMultiSet" + ): + comp_value.update(p1=comp_value.p2, p2=comp_value.p1) + for inner_comp_value in comp_value.values(): + if isinstance(inner_comp_value, CompValue): + _optimize_node(inner_comp_value) + return comp_value diff --git a/tests/test_mapping_service.py b/tests/test_mapping_service.py new file mode 100644 index 0000000..10e1659 --- /dev/null +++ b/tests/test_mapping_service.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- + +"""Tests for the identifier mapping service.""" + +import json +import unittest +from typing import Iterable, Set, Tuple +from urllib.parse import quote + +from rdflib import OWL, SKOS +from rdflib.query import ResultRow + +from curies import Converter +from curies.mapping_service import CURIEServiceGraph, _prepare_predicates, get_flask_mapping_app +from curies.mapping_service.rdflib_custom import JervenSPARQLProcessor + +PREFIX_MAP = { + "CHEBI": [ + "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=", + "http://identifiers.org/chebi/", + "http://purl.obolibrary.org/obo/CHEBI_", + ], + "GO": ["http://purl.obolibrary.org/obo/GO_"], + "OBO": ["http://purl.obolibrary.org/obo/"], +} + +SPARQL_SIMPLE = """\ +SELECT DISTINCT ?s ?o WHERE { + VALUES ?s { + + + } + ?s owl:sameAs ?o +} +""".rstrip() + 
+SPARQL_SIMPLE_BACKWARDS = """\ +SELECT DISTINCT ?s ?o WHERE { + VALUES ?o { + + + } + ?s owl:sameAs ?o +} +""".rstrip() + +#: This represents a SPARQL query that happens when a service generates it +SPARQL_FROM_SERVICE = """\ +SELECT REDUCED * WHERE { + ?s owl:sameAs ?o . +} +VALUES (?s) { + () + () +} +""" + +EXPECTED = { + ( + "http://purl.obolibrary.org/obo/CHEBI_1", + "http://purl.obolibrary.org/obo/CHEBI_1", + ), + ("http://purl.obolibrary.org/obo/CHEBI_1", "http://identifiers.org/chebi/1"), + ( + "http://purl.obolibrary.org/obo/CHEBI_1", + "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=1", + ), + ( + "http://purl.obolibrary.org/obo/CHEBI_2", + "http://purl.obolibrary.org/obo/CHEBI_2", + ), + ("http://purl.obolibrary.org/obo/CHEBI_2", "http://identifiers.org/chebi/2"), + ( + "http://purl.obolibrary.org/obo/CHEBI_2", + "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=2", + ), +} + + +def _stm(rows: Iterable[ResultRow]) -> Set[Tuple[str, str]]: + return {(str(row.s), str(row.o)) for row in rows} + + +class TestMappingService(unittest.TestCase): + """Test the identifier mapping service.""" + + def setUp(self) -> None: + """Set up the converter.""" + self.converter = Converter.from_priority_prefix_map(PREFIX_MAP) + self.graph = CURIEServiceGraph(converter=self.converter) + self.processor = JervenSPARQLProcessor(self.graph) + + def test_prepare_predicates(self): + """Test preparation of predicates.""" + self.assertEqual({OWL.sameAs}, _prepare_predicates()) + self.assertEqual({OWL.sameAs}, _prepare_predicates(OWL.sameAs)) + self.assertEqual( + {OWL.sameAs, SKOS.exactMatch}, _prepare_predicates({OWL.sameAs, SKOS.exactMatch}) + ) + + def test_errors(self): + """Test errors.""" + for sparql in [ + # errors because of unbound subject + "SELECT ?s ?o WHERE { ?s owl:sameAs ?o }", + # errors because of bad predicate + "SELECT ?o WHERE { rdfs:seeAlso ?o }", + "SELECT ?s WHERE { ?s rdfs:seeAlso }", + # errors because of unknown URI + "SELECT ?o WHERE { owl:sameAs ?o }", + 
"SELECT ?s WHERE { ?s owl:sameAs }", + # errors because predicate is given + "SELECT * WHERE { " + "owl:sameAs }", + ]: + with self.subTest(sparql=sparql): + self.assertEqual([], list(self.graph.query(sparql, processor=self.processor))) + + def test_sparql(self): + """Test a sparql query on the graph.""" + rows = _stm(self.graph.query(SPARQL_SIMPLE, processor=self.processor)) + self.assertNotEqual(0, len(rows), msg="No results were returned") + self.assertEqual(EXPECTED, rows) + + def test_sparql_backwards(self): + """Test a sparql query on the graph.""" + rows = _stm(self.graph.query(SPARQL_SIMPLE_BACKWARDS, processor=self.processor)) + self.assertNotEqual(0, len(rows), msg="No results were returned") + expected = {(o, s) for s, o in EXPECTED} + self.assertEqual(expected, rows) + + def test_service_sparql(self): + """Test the SPARQL that gets sent when using this as a service.""" + rows = _stm(self.graph.query(SPARQL_FROM_SERVICE, processor=self.processor)) + self.assertNotEqual(0, len(rows), msg="No results were returned") + self.assertEqual(EXPECTED, rows) + + def test_missing(self): + """Test a sparql query on the graph where the URIs can't be parsed.""" + sparql = """\ + SELECT ?s ?o WHERE { + VALUES ?s { } + ?s owl:sameAs ?o + } + """ + self.assertEqual([], list(self.graph.query(sparql, processor=self.processor))) + + +class TestMappingWeb(unittest.TestCase): + """Test the web component of the mapping service.""" + + def setUp(self) -> None: + """Set up the test case with a converter and app.""" + self.converter = Converter.from_priority_prefix_map(PREFIX_MAP) + self.app = get_flask_mapping_app(self.converter) + + def test_query(self): + """Test querying the app.""" + with self.app.test_client() as client: + self.assert_sparql_results(client, SPARQL_SIMPLE) + + def test_service_query(self): + """Test sparql generated by a service (that has values outside of where clause).""" + with self.app.test_client() as client: + self.assert_sparql_results(client, 
SPARQL_FROM_SERVICE) + + def test_missing_query(self): + """Test error on missing query parameter.""" + with self.app.test_client() as client: + res = client.get("/sparql") + self.assertEqual(400, res.status_code, msg=f"Response: {res}") + + def assert_sparql_results(self, client, sparql): + """Test a sparql query returns expected values.""" + res = client.get(f"/sparql?query={quote(sparql)}") + self.assertEqual(200, res.status_code, msg=f"Response: {res}") + records = { + (record["s"]["value"], record["o"]["value"]) + for record in json.loads(res.text)["results"]["bindings"] + } + self.assertEqual(EXPECTED, records) diff --git a/tests/test_web.py b/tests/test_web.py index c0d3cb1..f54afdf 100644 --- a/tests/test_web.py +++ b/tests/test_web.py @@ -37,12 +37,12 @@ def setUp(self) -> None: def test_resolve_success(self): """Test resolution for a valid CURIE redirects properly.""" - res = self.client.get("/GO:1234567", follow_redirects=False) + res = self.client.get("/GO:1234567", allow_redirects=False) self.assertEqual(302, res.status_code, msg=res.text) def test_resolve_failure(self): """Test resolution for an invalid CURIE aborts with 404.""" - res = self.client.get("/NOPREFIX:NOIDENTIFIER", follow_redirects=False) + res = self.client.get("/NOPREFIX:NOIDENTIFIER", allow_redirects=False) self.assertEqual(FAILURE_CODE, res.status_code, msg=res.text)