From aefcc72236fe106f76413efec31ca161980c2f13 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Wed, 15 Mar 2023 23:37:48 +0100
Subject: [PATCH] Implement an identifier mapping service (#41)

References https://github.com/biopragmatics/bioregistry/issues/686.

This pull request implements the identifier mapping service described in
[SPARQL-enabled identifier conversion with
Identifiers.org](https://pubmed.ncbi.nlm.nih.gov/25638809/). The goal of
such a service is to act as an interoperability layer in SPARQL queries that
federate data from multiple places and potentially use different IRIs
for the same things.
---
 docs/source/index.rst                       |   5 +
 setup.cfg                                   |   3 +-
 src/curies/mapping_service/__init__.py      | 215 ++++++++++++++++++++
 src/curies/mapping_service/rdflib_custom.py |  83 ++++++++
 tests/test_mapping_service.py               | 181 ++++++++++++++++
 tests/test_web.py                           |   4 +-
 6 files changed, 488 insertions(+), 3 deletions(-)
 create mode 100644 src/curies/mapping_service/__init__.py
 create mode 100644 src/curies/mapping_service/rdflib_custom.py
 create mode 100644 tests/test_mapping_service.py
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c69c3fc..2bf9e45 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -83,3 +83,8 @@ for the `records` argument and prefixes can be added one at a time
 
     converter = curies.Converter(records=[])
     converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:")
+
+Identifier Mapping Service
+--------------------------
+.. automodapi:: curies.mapping_service
+   :no-inheritance-diagram:
diff --git a/setup.cfg b/setup.cfg
index 2a7468f..f6feefd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -118,7 +118,7 @@ show_missing = True
 exclude_lines =
     pragma: no cover
     raise NotImplementedError
-    if __name__ == __main__:
+    if __name__ == "__main__":
     if TYPE_CHECKING:
     def __str__
     def __repr__
@@ -142,6 +142,7 @@ ignore =
     S603
     W503 # Line break before binary operator (flake8 is wrong)
     E203  # whitespace before ':'
+    S113 # Requests call without timeout
 exclude =
     .tox,
     .git,
diff --git a/src/curies/mapping_service/__init__.py b/src/curies/mapping_service/__init__.py
new file mode 100644
index 0000000..8588572
--- /dev/null
+++ b/src/curies/mapping_service/__init__.py
@@ -0,0 +1,215 @@
+# -*- coding: utf-8 -*-
+
+"""Identifier mappings service.
+
+This contains an implementation of the service described in `SPARQL-enabled identifier
+conversion with Identifiers.org <https://pubmed.ncbi.nlm.nih.gov/25638809/>`_.
+The idea here is that you can write a SPARQL query like the following:
+
+.. code-block:: sparql
+
+    PREFIX biomodel: <http://identifiers.org/biomodels.db/>
+    PREFIX bqbio: <http://biomodels.net/biology-qualifiers#>
+    PREFIX sbmlrdf: <http://identifiers.org/biomodels.vocabulary#>
+    PREFIX up: <http://purl.uniprot.org/core/>
+
+    SELECT DISTINCT ?protein ?protein_domain
+    WHERE {
+        # The first part of this query extracts the proteins appearing in an RDF serialization
+        # of the BioModels database (see https://www.ebi.ac.uk/biomodels/BIOMD0000000372) on
+        # insulin/glucose feedback. Note that modelers call entities appearing in compartmental
+        # models "species", and this does not refer to taxa.
+        biomodel:BIOMD0000000372 sbmlrdf:species/bqbio:isVersionOf ?biomodels_protein .
+
+        # The second part of this query maps BioModels protein IRIs to UniProt protein IRIs
+        # using service XXX - that's what we're implementing here.
+        SERVICE <XXX> {
+            ?biomodels_protein owl:sameAs ?uniprot_protein.
+        }
+
+        # The third part of this query gets links between UniProt proteins and their
+        # domains. Since the mapping service translated the IRIs from BioModels, this only
+        # gets us the protein domains relevant to the insulin/glucose model.
+        SERVICE <http://beta.sparql.uniprot.org/sparql> {
+            ?uniprot_protein a up:Protein;
+                up:organism taxon:9606;
+                rdfs:seeAlso ?protein_domain.
+        }
+    }
+
+The SPARQL endpoint running at the web address XXX takes in the bound values for `?biomodels_protein`
+one at a time and dynamically generates triples with `owl:sameAs` as the predicate mapping and other
+equivalent IRIs (based on the definition of the converter) as the objects. This allows for gluing
+together multiple services that use different URIs for the same entities - in this example, there
+are two ways of referring to UniProt Proteins:
+
+1. The BioModels database example represents a SBML model on insulin-glucose feedback and uses legacy
+   Identifiers.org URIs for proteins such as http://identifiers.org/uniprot/P01308.
+2. The first-party UniProt database uses its own PURLs such as https://purl.uniprot.org/uniprot/P01308.
+
+.. seealso::
+
+    - Jerven Bolleman's implementation of this service in Java: https://github.com/JervenBolleman/sparql-identifiers
+    - Vincent Emonet's `SPARQL endpoint for RDFLib generator <https://github.com/vemonet/rdflib-endpoint>`_
+"""
+
+import itertools as itt
+from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Set, Tuple, Union, cast
+
+from rdflib import OWL, Graph, URIRef
+
+from .rdflib_custom import JervenSPARQLProcessor  # type: ignore
+from ..api import Converter
+
+if TYPE_CHECKING:
+    import flask
+
+__all__ = [
+    "CURIEServiceGraph",
+    "get_flask_mapping_blueprint",
+    "get_flask_mapping_app",
+]
+
+
+def _prepare_predicates(predicates: Union[None, str, Collection[str]] = None) -> Set[URIRef]:
+    if predicates is None:
+        return {OWL.sameAs}
+    if isinstance(predicates, str):
+        return {URIRef(predicates)}
+    return {URIRef(predicate) for predicate in predicates}
+
+
+class CURIEServiceGraph(Graph):  # type:ignore
+    """A service that implements identifier mapping based on a converter."""
+
+    converter: Converter
+    predicates: Set[URIRef]
+
+    def __init__(
+        self,
+        *args: Any,
+        converter: Converter,
+        predicates: Union[None, str, List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Instantiate the graph.
+
+        :param args: Positional arguments to pass to :meth:`rdflib.Graph.__init__`
+        :param converter: A converter object
+        :param predicates: A predicate or set of predicates. If not given, this service
+            will use `owl:sameAs` as a predicate for mapping IRIs.
+        :param kwargs: Keyword arguments to pass to :meth:`rdflib.Graph.__init__`
+
+        In the following example, a service graph is instantiated using a small example
+        converter, then an example SPARQL query is made directly to show how it makes
+        results:
+
+        .. code-block:: python
+
+            from curies import Converter
+            from curies.mapping_service import CURIEServiceGraph
+
+            converter = Converter.from_priority_prefix_map(
+                {
+                    "CHEBI": [
+                        "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=",
+                        "http://identifiers.org/chebi/",
+                        "http://purl.obolibrary.org/obo/CHEBI_",
+                    ],
+                    "GO": ["http://purl.obolibrary.org/obo/GO_"],
+                    "OBO": ["http://purl.obolibrary.org/obo/"],
+                    ...,
+                }
+            )
+            graph = CURIEServiceGraph(converter=converter)
+
+            res = graph.query('''
+                SELECT ?o WHERE {
+                    VALUES ?s {
+                        <http://purl.obolibrary.org/obo/CHEBI_1>
+                    }
+                    ?s owl:sameAs ?o
+                }
+            ''')
+
+
+        The results of this are:
+
+        ======================================  =================================================
+        subject                                 object
+        --------------------------------------  -------------------------------------------------
+        http://purl.obolibrary.org/obo/CHEBI_1  http://purl.obolibrary.org/obo/CHEBI_1
+        http://purl.obolibrary.org/obo/CHEBI_1  http://identifiers.org/chebi/1
+        http://purl.obolibrary.org/obo/CHEBI_1  https://www.ebi.ac.uk/chebi/searchId.do?chebiId=1
+        ======================================  =================================================
+        """
+        self.converter = converter
+        self.predicates = _prepare_predicates(predicates)
+        super().__init__(*args, **kwargs)
+
+    def triples(
+        self, triple: Tuple[URIRef, URIRef, URIRef]
+    ) -> Iterable[Tuple[URIRef, URIRef, URIRef]]:
+        """Generate triples, overridden to dynamically generate mappings based on this graph's converter."""
+        subj_query, pred_query, obj_query = triple
+        if pred_query not in self.predicates:
+            return
+        if subj_query is None and obj_query is None:
+            return  # can't generate based on this
+        if subj_query is None and obj_query is not None:
+            prefix, identifier = self.converter.parse_uri(obj_query)
+            if prefix is None or identifier is None:
+                return
+            subjects = [
+                URIRef(sub)
+                for sub in cast(Collection[str], self.converter.expand_pair_all(prefix, identifier))
+            ]
+            for subj, pred in itt.product(subjects, self.predicates):
+                yield subj, pred, obj_query
+        elif subj_query is not None and obj_query is None:
+            prefix, identifier = self.converter.parse_uri(subj_query)
+            if prefix is None or identifier is None:
+                return
+            objects = [
+                URIRef(obj)
+                for obj in cast(Collection[str], self.converter.expand_pair_all(prefix, identifier))
+            ]
+            for obj, pred in itt.product(objects, self.predicates):
+                yield subj_query, pred, obj
+        else:  # subj_query is not None and obj_query is not None
+            return  # too much specification? maybe just return one triple then?
+
+
+def get_flask_mapping_blueprint(converter: Converter, **kwargs: Any) -> "flask.Blueprint":
+    """Get a blueprint for :class:`flask.Flask`.
+
+    :param converter: A converter
+    :param kwargs: Keyword arguments passed through to :class:`flask.Blueprint`
+    :return: A blueprint
+    """
+    from flask import Blueprint, Response, request
+
+    blueprint = Blueprint("mapping", __name__, **kwargs)
+    graph = CURIEServiceGraph(converter=converter)
+    processor = JervenSPARQLProcessor(graph=graph)
+
+    @blueprint.route("/sparql", methods=["GET", "POST"])  # type:ignore
+    def serve_sparql() -> "Response":
+        """Run a SPARQL query and serve the results."""
+        sparql = (request.args if request.method == "GET" else request.json).get("query")
+        if not sparql:
+            return Response("Missing parameter query", 400)
+        results = graph.query(sparql, processor=processor).serialize(format="json")
+        return Response(results)
+
+    return blueprint
+
+
+def get_flask_mapping_app(converter: Converter) -> "flask.Flask":
+    """Get a Flask app for the mapping service."""
+    from flask import Flask
+
+    blueprint = get_flask_mapping_blueprint(converter)
+    app = Flask(__name__)
+    app.register_blueprint(blueprint)
+    return app
diff --git a/src/curies/mapping_service/rdflib_custom.py b/src/curies/mapping_service/rdflib_custom.py
new file mode 100644
index 0000000..fb8abe3
--- /dev/null
+++ b/src/curies/mapping_service/rdflib_custom.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+# type: ignore
+
+"""A custom SPARQL processor that optimizes the query based on https://github.com/RDFLib/rdflib/pull/2257."""
+
+from typing import Union
+
+from rdflib.plugins.sparql.algebra import translateQuery
+from rdflib.plugins.sparql.evaluate import evalQuery
+from rdflib.plugins.sparql.parser import parseQuery
+from rdflib.plugins.sparql.parserutils import CompValue
+from rdflib.plugins.sparql.processor import SPARQLProcessor
+from rdflib.plugins.sparql.sparql import Query
+
+__all__ = ["JervenSPARQLProcessor"]
+
+
+class JervenSPARQLProcessor(SPARQLProcessor):
+    """A custom SPARQL processor that optimizes the query based on https://github.com/RDFLib/rdflib/pull/2257.
+
+    Why is this necessary? Ideally, we get queries like
+
+    .. code-block:: sparql
+
+        SELECT * WHERE {
+            VALUES ?s { :a :b ... }
+            ?s owl:sameAs ?o
+        }
+
+    This is fine, since the way that RDFLib parses and constructs an abstract syntax tree, the values
+    for ``?s`` get bound properly when calling a custom :func:`rdflib.Graph.triples`. However, it's also
+    valid SPARQL to have the ``VALUES`` clause outside of the ``WHERE`` clause like
+
+    .. code-block:: sparql
+
+        SELECT * WHERE {
+            ?s owl:sameAs ?o
+        }
+        VALUES ?s { :a :b ... }
+
+    Unfortunately, this trips up RDFLib since it doesn't know to bind the values before calling ``triples()``,
+    therefore thwarting our custom implementation that dynamically generates triples based on the bound values
+    themselves.
+
+    This processor, originally by Jerven Bolleman in https://github.com/RDFLib/rdflib/pull/2257,
+    adds some additional logic between parsing + constructing the abstract syntax tree and evaluation
+    of the syntax tree. Basically, the abstract syntax tree has nodes with two or more children. Jerven's
+    clever code (see :func:`_optimize_node` below) finds *Join* nodes that have a ``VALUES`` clause in the
+    second of their two arguments, then flips them around. It does this recursively for the whole tree.
+    This gets us to the goal of having the ``VALUES`` clauses appear first, therefore making sure that their
+    bound values are available to the ``triples`` function.
+    """
+
+    def query(
+        self,
+        query: Union[str, Query],
+        initBindings=None,  # noqa:N803
+        initNs=None,  # noqa:N803
+        base=None,
+        DEBUG=False,
+    ):
+        """Evaluate a SPARQL query on this processor's graph."""
+        if isinstance(query, str):
+            parse_tree = parseQuery(query)
+            query = translateQuery(parse_tree, base, initNs)
+            return self.query(query, initBindings=initBindings, base=base)
+
+        query.algebra = _optimize_node(query.algebra)
+        return evalQuery(self.graph, query, initBindings or {}, base)
+
+
+# From Jerven's PR to RDFLib (https://github.com/RDFLib/rdflib/pull/2257)
+def _optimize_node(comp_value: CompValue) -> CompValue:
+    if (
+        comp_value.name == "Join"
+        and comp_value.p1.name != "ToMultiSet"
+        and comp_value.p2.name == "ToMultiSet"
+    ):
+        comp_value.update(p1=comp_value.p2, p2=comp_value.p1)
+    for inner_comp_value in comp_value.values():
+        if isinstance(inner_comp_value, CompValue):
+            _optimize_node(inner_comp_value)
+    return comp_value
diff --git a/tests/test_mapping_service.py b/tests/test_mapping_service.py
new file mode 100644
index 0000000..10e1659
--- /dev/null
+++ b/tests/test_mapping_service.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+
+"""Tests for the identifier mapping service."""
+
+import json
+import unittest
+from typing import Iterable, Set, Tuple
+from urllib.parse import quote
+
+from rdflib import OWL, SKOS
+from rdflib.query import ResultRow
+
+from curies import Converter
+from curies.mapping_service import CURIEServiceGraph, _prepare_predicates, get_flask_mapping_app
+from curies.mapping_service.rdflib_custom import JervenSPARQLProcessor
+
+PREFIX_MAP = {
+    "CHEBI": [
+        "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=",
+        "http://identifiers.org/chebi/",
+        "http://purl.obolibrary.org/obo/CHEBI_",
+    ],
+    "GO": ["http://purl.obolibrary.org/obo/GO_"],
+    "OBO": ["http://purl.obolibrary.org/obo/"],
+}
+
+SPARQL_SIMPLE = """\
+SELECT DISTINCT ?s ?o WHERE {
+    VALUES ?s {
+        <http://purl.obolibrary.org/obo/CHEBI_1>
+        <http://purl.obolibrary.org/obo/CHEBI_2>
+    }
+    ?s owl:sameAs ?o
+}
+""".rstrip()
+
+SPARQL_SIMPLE_BACKWARDS = """\
+SELECT DISTINCT ?s ?o WHERE {
+    VALUES ?o {
+        <http://purl.obolibrary.org/obo/CHEBI_1>
+        <http://purl.obolibrary.org/obo/CHEBI_2>
+    }
+    ?s owl:sameAs ?o
+}
+""".rstrip()
+
+#: This represents a SPARQL query that happens when a service generates it
+SPARQL_FROM_SERVICE = """\
+SELECT REDUCED * WHERE {
+    ?s owl:sameAs ?o .
+}
+VALUES (?s) {
+    (<http://purl.obolibrary.org/obo/CHEBI_1>)
+    (<http://purl.obolibrary.org/obo/CHEBI_2>)
+}
+"""
+
+EXPECTED = {
+    (
+        "http://purl.obolibrary.org/obo/CHEBI_1",
+        "http://purl.obolibrary.org/obo/CHEBI_1",
+    ),
+    ("http://purl.obolibrary.org/obo/CHEBI_1", "http://identifiers.org/chebi/1"),
+    (
+        "http://purl.obolibrary.org/obo/CHEBI_1",
+        "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=1",
+    ),
+    (
+        "http://purl.obolibrary.org/obo/CHEBI_2",
+        "http://purl.obolibrary.org/obo/CHEBI_2",
+    ),
+    ("http://purl.obolibrary.org/obo/CHEBI_2", "http://identifiers.org/chebi/2"),
+    (
+        "http://purl.obolibrary.org/obo/CHEBI_2",
+        "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=2",
+    ),
+}
+
+
+def _stm(rows: Iterable[ResultRow]) -> Set[Tuple[str, str]]:
+    return {(str(row.s), str(row.o)) for row in rows}
+
+
+class TestMappingService(unittest.TestCase):
+    """Test the identifier mapping service."""
+
+    def setUp(self) -> None:
+        """Set up the converter."""
+        self.converter = Converter.from_priority_prefix_map(PREFIX_MAP)
+        self.graph = CURIEServiceGraph(converter=self.converter)
+        self.processor = JervenSPARQLProcessor(self.graph)
+
+    def test_prepare_predicates(self):
+        """Test preparation of predicates."""
+        self.assertEqual({OWL.sameAs}, _prepare_predicates())
+        self.assertEqual({OWL.sameAs}, _prepare_predicates(OWL.sameAs))
+        self.assertEqual(
+            {OWL.sameAs, SKOS.exactMatch}, _prepare_predicates({OWL.sameAs, SKOS.exactMatch})
+        )
+
+    def test_errors(self):
+        """Test errors."""
+        for sparql in [
+            # errors because of unbound subject and object
+            "SELECT ?s ?o WHERE { ?s owl:sameAs ?o }",
+            # errors because of bad predicate
+            "SELECT ?o WHERE { <http://purl.obolibrary.org/obo/CHEBI_1> rdfs:seeAlso ?o }",
+            "SELECT ?s WHERE { ?s rdfs:seeAlso <http://purl.obolibrary.org/obo/CHEBI_1> }",
+            # errors because of unknown URI
+            "SELECT ?o WHERE { <http://example.com/1> owl:sameAs ?o }",
+            "SELECT ?s WHERE { ?s owl:sameAs <http://example.com/1> }",
+            # errors because both subject and object are given
+            "SELECT * WHERE { <http://purl.obolibrary.org/obo/CHEBI_1> "
+            "owl:sameAs <http://purl.obolibrary.org/obo/CHEBI_1> }",
+        ]:
+            with self.subTest(sparql=sparql):
+                self.assertEqual([], list(self.graph.query(sparql, processor=self.processor)))
+
+    def test_sparql(self):
+        """Test a sparql query on the graph."""
+        rows = _stm(self.graph.query(SPARQL_SIMPLE, processor=self.processor))
+        self.assertNotEqual(0, len(rows), msg="No results were returned")
+        self.assertEqual(EXPECTED, rows)
+
+    def test_sparql_backwards(self):
+        """Test a sparql query on the graph where the object is bound instead of the subject."""
+        rows = _stm(self.graph.query(SPARQL_SIMPLE_BACKWARDS, processor=self.processor))
+        self.assertNotEqual(0, len(rows), msg="No results were returned")
+        expected = {(o, s) for s, o in EXPECTED}
+        self.assertEqual(expected, rows)
+
+    def test_service_sparql(self):
+        """Test the SPARQL that gets sent when using this as a service."""
+        rows = _stm(self.graph.query(SPARQL_FROM_SERVICE, processor=self.processor))
+        self.assertNotEqual(0, len(rows), msg="No results were returned")
+        self.assertEqual(EXPECTED, rows)
+
+    def test_missing(self):
+        """Test a sparql query on the graph where the URIs can't be parsed."""
+        sparql = """\
+            SELECT ?s ?o WHERE {
+                VALUES ?s { <http://example.org/1> <http://example.org/1> }
+                ?s owl:sameAs ?o
+            }
+        """
+        self.assertEqual([], list(self.graph.query(sparql, processor=self.processor)))
+
+
+class TestMappingWeb(unittest.TestCase):
+    """Test the web component of the mapping service."""
+
+    def setUp(self) -> None:
+        """Set up the test case with a converter and app."""
+        self.converter = Converter.from_priority_prefix_map(PREFIX_MAP)
+        self.app = get_flask_mapping_app(self.converter)
+
+    def test_query(self):
+        """Test querying the app."""
+        with self.app.test_client() as client:
+            self.assert_sparql_results(client, SPARQL_SIMPLE)
+
+    def test_service_query(self):
+        """Test sparql generated by a service (that has values outside of where clause)."""
+        with self.app.test_client() as client:
+            self.assert_sparql_results(client, SPARQL_FROM_SERVICE)
+
+    def test_missing_query(self):
+        """Test error on missing query parameter."""
+        with self.app.test_client() as client:
+            res = client.get("/sparql")
+            self.assertEqual(400, res.status_code, msg=f"Response: {res}")
+
+    def assert_sparql_results(self, client, sparql):
+        """Test a sparql query returns expected values."""
+        res = client.get(f"/sparql?query={quote(sparql)}")
+        self.assertEqual(200, res.status_code, msg=f"Response: {res}")
+        records = {
+            (record["s"]["value"], record["o"]["value"])
+            for record in json.loads(res.text)["results"]["bindings"]
+        }
+        self.assertEqual(EXPECTED, records)
diff --git a/tests/test_web.py b/tests/test_web.py
index c0d3cb1..f54afdf 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -37,12 +37,12 @@ def setUp(self) -> None:
 
     def test_resolve_success(self):
         """Test resolution for a valid CURIE redirects properly."""
-        res = self.client.get("/GO:1234567", follow_redirects=False)
+        res = self.client.get("/GO:1234567", allow_redirects=False)
         self.assertEqual(302, res.status_code, msg=res.text)
 
     def test_resolve_failure(self):
         """Test resolution for an invalid CURIE aborts with 404."""
-        res = self.client.get("/NOPREFIX:NOIDENTIFIER", follow_redirects=False)
+        res = self.client.get("/NOPREFIX:NOIDENTIFIER", allow_redirects=False)
         self.assertEqual(FAILURE_CODE, res.status_code, msg=res.text)