Implement an identifier mapping service (#41)
References biopragmatics/bioregistry#686.

This pull request implements the identifier mapping service described in
[SPARQL-enabled identifier conversion with
Identifiers.org](https://pubmed.ncbi.nlm.nih.gov/25638809/). The goal of
such a service is to act as an interoperability layer in SPARQL queries that
federate data from multiple places that potentially use different IRIs
for the same entities.
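
For intuition: the converter underlying the service already knows every equivalent IRI
expansion for a given entity, and the service turns those expansions into owl:sameAs
triples on the fly. A minimal sketch using a small, hypothetical prefix map (the
expand_pair_all call is the same one the service uses internally):

from curies import Converter

converter = Converter.from_priority_prefix_map(
    {
        "CHEBI": [
            "http://purl.obolibrary.org/obo/CHEBI_",
            "http://identifiers.org/chebi/",
        ],
    }
)

# All IRIs that denote CHEBI:1; the service links these with owl:sameAs
for iri in converter.expand_pair_all("CHEBI", "1"):
    print(iri)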
cthoyt committed Mar 15, 2023
1 parent 4233832 commit aefcc72
Showing 6 changed files with 488 additions and 3 deletions.
5 changes: 5 additions & 0 deletions docs/source/index.rst
@@ -83,3 +83,8 @@ for the `records` argument and prefixes can be added one at a time
converter = curies.Converter(records=[])
converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:")
Identifier Mapping Service
--------------------------
.. automodapi:: curies.mapping_service
:no-inheritance-diagram:
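
The module documented above also provides a Flask app factory (added later in this
commit). A minimal sketch of standing up the service locally, assuming a small,
hypothetical prefix map:

import curies
from curies.mapping_service import get_flask_mapping_app

converter = curies.Converter.from_priority_prefix_map(
    {
        "CHEBI": [
            "http://purl.obolibrary.org/obo/CHEBI_",
            "http://identifiers.org/chebi/",
        ],
    }
)
app = get_flask_mapping_app(converter)

if __name__ == "__main__":
    app.run(port=8000)  # serves SPARQL at http://localhost:8000/sparql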
3 changes: 2 additions & 1 deletion setup.cfg
@@ -118,7 +118,7 @@ show_missing = True
exclude_lines =
pragma: no cover
raise NotImplementedError
if __name__ == __main__:
if __name__ == "__main__":
if TYPE_CHECKING:
def __str__
def __repr__
@@ -142,6 +142,7 @@ ignore =
S603
W503 # Line break before binary operator (flake8 is wrong)
E203 # whitespace before ':'
S113 # Requests call without timeout
exclude =
.tox,
.git,
215 changes: 215 additions & 0 deletions src/curies/mapping_service/__init__.py
@@ -0,0 +1,215 @@
# -*- coding: utf-8 -*-

"""Identifier mappings service.
This contains an implementation of the service described in `SPARQL-enabled identifier
conversion with Identifiers.org <https://pubmed.ncbi.nlm.nih.gov/25638809/>`_.
The idea here is that you can write a SPARQL query like the following:
.. code-block:: sparql
PREFIX biomodel: <http://identifiers.org/biomodels.db/>
PREFIX bqbio: <http://biomodels.net/biology-qualifiers#>
PREFIX sbmlrdf: <http://identifiers.org/biomodels.vocabulary#>
PREFIX up: <http://purl.uniprot.org/core/>
SELECT DISTINCT ?protein ?protein_domain
WHERE {
# The first part of this query extracts the proteins appearing in an RDF serialization
# of the BioModels database (see https://www.ebi.ac.uk/biomodels/BIOMD0000000372) on
# insulin/glucose feedback. Note that modelers call entities appearing in compartmental
# models "species", and this does not refer to taxa.
biomodel:BIOMD0000000372 sbmlrdf:species/bqbio:isVersionOf ?biomodels_protein .
# The second part of this query maps BioModels protein IRIs to UniProt protein IRIs
# using service XXX - that's what we're implementing here.
SERVICE <XXX> {
?biomodels_protein owl:sameAs ?uniprot_protein.
}
# The third part of this query gets links between UniProt proteins and their
# domains. Since the service maps the proteins bound by the BioModels query,
# this only retrieves protein domains relevant to the insulin/glucose model.
SERVICE <http://beta.sparql.uniprot.org/sparql> {
?uniprot_protein a up:Protein;
up:organism taxon:9606;
rdfs:seeAlso ?protein_domain.
}
}
The SPARQL endpoint running at the web address XXX takes in the bound values for `?biomodels_protein`
one at a time and dynamically generates triples with `owl:sameAs` as the predicate and the other
equivalent IRIs (based on the definition of the converter) as the objects. This allows for gluing
together multiple services that use different URIs for the same entities - in this example, there
are two ways of referring to UniProt proteins:
1. The BioModels database example represents an SBML model of insulin-glucose feedback and uses legacy
Identifiers.org URIs for proteins such as http://identifiers.org/uniprot/P01308.
2. The first-party UniProt database uses its own PURLs such as https://purl.uniprot.org/uniprot/P01308.
.. seealso::
- Jerven Bolleman's implementation of this service in Java: https://github.com/JervenBolleman/sparql-identifiers
- Vincent Emonet's `SPARQL endpoint for RDFLib generator <https://github.com/vemonet/rdflib-endpoint>`_
"""

import itertools as itt
from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Set, Tuple, Union, cast

from rdflib import OWL, Graph, URIRef

from .rdflib_custom import JervenSPARQLProcessor # type: ignore
from ..api import Converter

if TYPE_CHECKING:
import flask

__all__ = [
"CURIEServiceGraph",
"get_flask_mapping_blueprint",
"get_flask_mapping_app",
]


def _prepare_predicates(predicates: Union[None, str, Collection[str]] = None) -> Set[URIRef]:
if predicates is None:
return {OWL.sameAs}
if isinstance(predicates, str):
return {URIRef(predicates)}
return {URIRef(predicate) for predicate in predicates}


class CURIEServiceGraph(Graph): # type:ignore
"""A service that implements identifier mapping based on a converter."""

converter: Converter
predicates: Set[URIRef]

def __init__(
self,
*args: Any,
converter: Converter,
predicates: Union[None, str, List[str]] = None,
**kwargs: Any,
) -> None:
"""Instantiate the graph.
:param args: Positional arguments to pass to :meth:`rdflib.Graph.__init__`
:param converter: A converter object
:param predicates: A predicate or set of predicates. If not given, this service
will use `owl:sameAs` as a predicate for mapping IRIs.
:param kwargs: Keyword arguments to pass to :meth:`rdflib.Graph.__init__`
In the following example, a service graph is instantiated using a small example
converter, then an example SPARQL query is run against it directly to show how it
produces results:
.. code-block:: python
from curies import Converter
from curies.mapping_service import CURIEServiceGraph
converter = Converter.from_priority_prefix_map(
{
"CHEBI": [
"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=",
"http://identifiers.org/chebi/",
"http://purl.obolibrary.org/obo/CHEBI_",
],
"GO": ["http://purl.obolibrary.org/obo/GO_"],
"OBO": ["http://purl.obolibrary.org/obo/"],
...,
}
)
graph = CURIEServiceGraph(converter=converter)
res = graph.query('''
SELECT ?o WHERE {
VALUES ?s {
<http://purl.obolibrary.org/obo/CHEBI_1>
}
?s owl:sameAs ?o
}
''')
The results of this are:
====================================== =================================================
subject object
-------------------------------------- -------------------------------------------------
http://purl.obolibrary.org/obo/CHEBI_1 http://purl.obolibrary.org/obo/CHEBI_1
http://purl.obolibrary.org/obo/CHEBI_1 http://identifiers.org/chebi/1
http://purl.obolibrary.org/obo/CHEBI_1 https://www.ebi.ac.uk/chebi/searchId.do?chebiId=1
====================================== =================================================
"""
self.converter = converter
self.predicates = _prepare_predicates(predicates)
super().__init__(*args, **kwargs)

def triples(
self, triple: Tuple[URIRef, URIRef, URIRef]
) -> Iterable[Tuple[URIRef, URIRef, URIRef]]:
"""Generate triples, overriden to dynamically generate mappings based on this graph's converter."""
subj_query, pred_query, obj_query = triple
if pred_query not in self.predicates:
return
if subj_query is None and obj_query is None:
return # can't generate based on this
if subj_query is None and obj_query is not None:
prefix, identifier = self.converter.parse_uri(obj_query)
if prefix is None or identifier is None:
return
subjects = [
URIRef(sub)
for sub in cast(Collection[str], self.converter.expand_pair_all(prefix, identifier))
]
for subj, pred in itt.product(subjects, self.predicates):
yield subj, pred, obj_query
elif subj_query is not None and obj_query is None:
prefix, identifier = self.converter.parse_uri(subj_query)
if prefix is None or identifier is None:
return
objects = [
URIRef(obj)
for obj in cast(Collection[str], self.converter.expand_pair_all(prefix, identifier))
]
for obj, pred in itt.product(objects, self.predicates):
yield subj_query, pred, obj
else: # subj_query is not None and obj_query is not None
return # too much specification? maybe just return one triple then?


def get_flask_mapping_blueprint(converter: Converter, **kwargs: Any) -> "flask.Blueprint":
"""Get a blueprint for :class:`flask.Flask`.
:param converter: A converter
:param kwargs: Keyword arguments passed through to :class:`flask.Blueprint`
:return: A blueprint
"""
from flask import Blueprint, Response, request

blueprint = Blueprint("mapping", __name__, **kwargs)
graph = CURIEServiceGraph(converter=converter)
processor = JervenSPARQLProcessor(graph=graph)

@blueprint.route("/sparql", methods=["GET", "POST"]) # type:ignore
def serve_sparql() -> "Response":
"""Run a SPARQL query and serve the results."""
sparql = (request.args if request.method == "GET" else request.json).get("query")
if not sparql:
return Response("Missing parameter query", 400)
results = graph.query(sparql, processor=processor).serialize(format="json")
return Response(results)

return blueprint


def get_flask_mapping_app(converter: Converter) -> "flask.Flask":
"""Get a Flask app for the mapping service."""
from flask import Flask

blueprint = get_flask_mapping_blueprint(converter)
app = Flask(__name__)
app.register_blueprint(blueprint)
return app
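
For reference, a minimal sketch of exercising the /sparql route defined above without
starting a server, using Flask's built-in test client (the prefix map is a hypothetical
example; the full owl:sameAs IRI avoids relying on any default prefix bindings):

from curies import Converter
from curies.mapping_service import get_flask_mapping_app

converter = Converter.from_priority_prefix_map(
    {
        "CHEBI": [
            "http://purl.obolibrary.org/obo/CHEBI_",
            "http://identifiers.org/chebi/",
        ],
    }
)
app = get_flask_mapping_app(converter)
client = app.test_client()

sparql = """
    SELECT ?o WHERE {
        <http://purl.obolibrary.org/obo/CHEBI_1>
            <http://www.w3.org/2002/07/owl#sameAs> ?o .
    }
"""
# GET passes the query as a URL parameter; POST sends it as JSON under "query"
response = client.get("/sparql", query_string={"query": sparql})
print(response.status_code)  # expect 200
print(response.data)  # SPARQL results serialized as JSON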
83 changes: 83 additions & 0 deletions src/curies/mapping_service/rdflib_custom.py
@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
# type: ignore

"""A custom SPARQL processor that optimizes the query based on https://github.com/RDFLib/rdflib/pull/2257."""

from typing import Union

from rdflib.plugins.sparql.algebra import translateQuery
from rdflib.plugins.sparql.evaluate import evalQuery
from rdflib.plugins.sparql.parser import parseQuery
from rdflib.plugins.sparql.parserutils import CompValue
from rdflib.plugins.sparql.processor import SPARQLProcessor
from rdflib.plugins.sparql.sparql import Query

__all__ = ["JervenSPARQLProcessor"]


class JervenSPARQLProcessor(SPARQLProcessor):
"""A custom SPARQL processor that optimizes the query based on https://github.com/RDFLib/rdflib/pull/2257.
Why is this necessary? Ideally, we get queries like
.. code-block:: sparql
SELECT * WHERE {
VALUES ?s { :a :b ... }
?s owl:sameAs ?o
}
This is fine, since the way that RDFLib parses and constructs an abstract syntax tree, the values
for ``?s`` get bound properly when calling a custom :func:`rdflib.Graph.triples`. However, it's also
valid SPARQL to have the ``VALUES`` clause outside of the ``WHERE`` clause like
.. code-block:: sparql
SELECT * WHERE {
?s owl:sameAs ?o
}
VALUES ?s { :a :b ... }
Unfortunately, this trips up RDFLib since it doesn't know to bind the values before calling ``triples()``,
thereby thwarting our custom implementation that dynamically generates triples based on the bound values
themselves.
This processor, originally by Jerven Bolleman in https://github.com/RDFLib/rdflib/pull/2257,
adds some additional logic after the abstract syntax tree is parsed and constructed but before it is
evaluated. Basically, the abstract syntax tree has nodes with two or more children. Jerven's
clever code (see :func:`_optimize_node` below) finds *Join* nodes that have a ``VALUES`` clause as the
second of their two arguments, then flips the two arguments around. It does this recursively for the
whole tree. This gets us to the goal of having the ``VALUES`` clauses appear first, thereby making sure
that their bound values are available to the ``triples`` function.
"""

def query(
self,
query: Union[str, Query],
initBindings=None, # noqa:N803
initNs=None, # noqa:N803
base=None,
DEBUG=False,
):
"""Evaluate a SPARQL query on this processor's graph."""
if isinstance(query, str):
parse_tree = parseQuery(query)
query = translateQuery(parse_tree, base, initNs)
return self.query(query, initBindings=initBindings, base=base)

query.algebra = _optimize_node(query.algebra)
return evalQuery(self.graph, query, initBindings or {}, base)


# From Jerven's PR to RDFLib (https://github.com/RDFLib/rdflib/pull/2257)
def _optimize_node(comp_value: CompValue) -> CompValue:
if (
comp_value.name == "Join"
and comp_value.p1.name != "ToMultiSet"
and comp_value.p2.name == "ToMultiSet"
):
comp_value.update(p1=comp_value.p2, p2=comp_value.p1)
for inner_comp_value in comp_value.values():
if isinstance(inner_comp_value, CompValue):
_optimize_node(inner_comp_value)
return comp_value
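
A minimal sketch of the case this processor exists to handle (a VALUES clause written
after the WHERE block), using the classes added elsewhere in this commit and a
hypothetical prefix map:

from curies import Converter
from curies.mapping_service import CURIEServiceGraph
from curies.mapping_service.rdflib_custom import JervenSPARQLProcessor

converter = Converter.from_priority_prefix_map(
    {
        "CHEBI": [
            "http://purl.obolibrary.org/obo/CHEBI_",
            "http://identifiers.org/chebi/",
        ],
    }
)
graph = CURIEServiceGraph(converter=converter)
processor = JervenSPARQLProcessor(graph=graph)

# The VALUES clause follows the WHERE block; without the Join reordering above,
# ?s would be unbound when triples() is called and no mappings would be generated.
sparql = """
    SELECT ?s ?o WHERE {
        ?s <http://www.w3.org/2002/07/owl#sameAs> ?o .
    }
    VALUES ?s { <http://purl.obolibrary.org/obo/CHEBI_1> }
"""
for subject, obj in graph.query(sparql, processor=processor):
    print(subject, obj)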