Add JSON schema for extended prefix map (#109)

biopragmatics · Apr 22, 2024 · 470f71a · 470f71a
1 parent af3bd03
commit 470f71a
Show file tree

Hide file tree

Showing 7 changed files with 184 additions and 17 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -16,3 +16,4 @@ global-exclude *.py[cod] __pycache__ *.so *.dylib .DS_Store *.gpickle
 
 include README.md LICENSE
 exclude tox.ini .flake8 .bumpversion.cfg .readthedocs.yml codecov.yml
+exclude docs/make_schema.py docs/schema.json
diff --git a/docs/make_schema.py b/docs/make_schema.py
@@ -0,0 +1,54 @@
+"""Generate a JSON schema for extended prefix maps."""
+
+import json
+from pathlib import Path
+
+from curies import Records
+from curies._pydantic_compat import PYDANTIC_V1
+
+HERE = Path(__file__).parent.resolve()
+PATH = HERE.joinpath("schema.json")
+TITLE = "Extended Prefix Map"
+DESCRIPTION = (
+    """\
+An extended prefix map is a generalization of a prefix map that
+includes synonyms for URI prefixes and CURIE prefixes.
+""".strip()
+    .replace("\n", " ")
+    .replace("  ", " ")
+)
+URL = "https://w3id.org/biopragmatics/schema/epm.json"
+
+
+def main() -> None:
+    """Generate a JSON schema for extended prefix maps."""
+    rv = {
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "$id": URL,
+    }
+
+    if PYDANTIC_V1:
+        import pydantic.schema
+
+        # see https://docs.pydantic.dev/latest/usage/json_schema/#general-notes-on-json-schema-generation
+
+        schema_dict = pydantic.schema.schema(
+            [Records],
+            title=TITLE,
+            description=DESCRIPTION,
+        )
+    else:
+        from pydantic.json_schema import models_json_schema
+
+        _, schema_dict = models_json_schema(
+            [(Records, "validation")],
+            title=TITLE,
+            description=DESCRIPTION,
+        )
+
+    rv.update(schema_dict)
+    PATH.write_text(json.dumps(rv, indent=2) + "\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/schema.json b/docs/schema.json
@@ -0,0 +1,64 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://w3id.org/biopragmatics/schema/epm.json",
+  "$defs": {
+    "Record": {
+      "description": "A record of some prefixes and their associated URI prefixes.\n\n.. seealso:: https://github.com/cthoyt/curies/issues/70",
+      "properties": {
+        "prefix": {
+          "description": "The canonical CURIE prefix, used in the reverse prefix map",
+          "title": "CURIE prefix",
+          "type": "string"
+        },
+        "uri_prefix": {
+          "description": "The canonical URI prefix, used in the forward prefix map",
+          "title": "URI prefix",
+          "type": "string"
+        },
+        "prefix_synonyms": {
+          "items": {
+            "type": "string"
+          },
+          "title": "CURIE prefix synonyms",
+          "type": "array"
+        },
+        "uri_prefix_synonyms": {
+          "items": {
+            "type": "string"
+          },
+          "title": "URI prefix synonyms",
+          "type": "array"
+        },
+        "pattern": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "The regular expression pattern for entries in this semantic space. Warning: this is an experimental feature.",
+          "title": "Pattern"
+        }
+      },
+      "required": [
+        "prefix",
+        "uri_prefix"
+      ],
+      "title": "Record",
+      "type": "object"
+    },
+    "Records": {
+      "description": "A list of records.",
+      "items": {
+        "$ref": "#/$defs/Record"
+      },
+      "title": "Records",
+      "type": "array"
+    }
+  },
+  "title": "Extended Prefix Map",
+  "description": "An extended prefix map is a generalization of a prefix map that includes synonyms for URI prefixes and CURIE prefixes."
+}
diff --git a/docs/source/struct.rst b/docs/source/struct.rst
@@ -108,7 +108,8 @@ containing an entry for ChEBI) looks like:
        }
    ]
 
-An EPM is simply a list of records (see :class:`curies.Record`). EPMs have the benefit that they are still
+An EPM is simply a list of records (see :class:`curies.Record` and :class:`curies.Records`).
+EPMs have the benefit that they are still
 encoded in JSON and can easily be encoded in YAML, TOML, RDF, and other schemata. Further, prefix maps can be
 automatically upgraded into EPMs (with some caveats) using :func:`curies.upgrade_prefix_map`.
 
@@ -118,3 +119,6 @@ automatically upgraded into EPMs (with some caveats) using :func:`curies.upgrade
     can be loaded using :meth:`curies.Converter.from_extended_prefix_map`.
     We provide a Pydantic model representing it. Later, we hope to have an external, stable definition
     of this data schema.
+
+A JSON schema for EPMs is available at https://w3id.org/biopragmatics/schema/epm.json.
+It can be regenerated by running the script at https://github.com/biopragmatics/curies/tree/main/docs/make_schema.py.
diff --git a/src/curies/__init__.py b/src/curies/__init__.py
@@ -8,6 +8,7 @@
     DuplicateURIPrefixes,
     DuplicateValueError,
     Record,
+    Records,
     Reference,
     ReferenceTuple,
     chain,
@@ -35,6 +36,7 @@
 __all__ = [
     "Converter",
     "Record",
+    "Records",
     "ReferenceTuple",
     "Reference",
     "DuplicateValueError",

diff --git a/src/curies/api.py b/src/curies/api.py
@@ -50,6 +50,7 @@
     "Reference",
     "ReferenceTuple",
     "Record",
+    "Records",
     "DuplicateValueError",
     "DuplicatePrefixes",
     "DuplicateURIPrefixes",
@@ -252,26 +253,21 @@ def from_curie(cls, curie: str, sep: str = ":") -> "Reference":
 class Record(BaseModel):  # type:ignore
     """A record of some prefixes and their associated URI prefixes.
 
-    A list of records can be annotated in a FastAPI setting with the following:
-
-    .. code-block:: python
-
-        from typing import List
-        from curies import Record
-        from pydantic import BaseModel
-
-        class Records(BaseModel):
-            __root__ = List[Record]
-
     .. seealso:: https://github.com/cthoyt/curies/issues/70
     """
 
-    prefix: str = Field(..., description="The canonical prefix, used in the reverse prefix map")
+    prefix: str = Field(
+        ...,
+        title="CURIE prefix",
+        description="The canonical CURIE prefix, used in the reverse prefix map",
+    )
     uri_prefix: str = Field(
-        ..., description="The canonical URI prefix, used in the forward prefix map"
+        ...,
+        title="URI prefix",
+        description="The canonical URI prefix, used in the forward prefix map",
     )
-    prefix_synonyms: List[str] = Field(default_factory=list)
-    uri_prefix_synonyms: List[str] = Field(default_factory=list)
+    prefix_synonyms: List[str] = Field(default_factory=list, title="CURIE prefix synonyms")
+    uri_prefix_synonyms: List[str] = Field(default_factory=list, title="URI prefix synonyms")
     pattern: Optional[str] = Field(
         default=None,
         description="The regular expression pattern for entries in this semantic space. "
@@ -315,6 +311,40 @@ def _key(self) -> RecordKey:
         )
 
 
+if PYDANTIC_V1:
+    # An explanation of custom root types (``__root__``) in Pydantic V1 can be found on
+    # https://docs.pydantic.dev/1.10/usage/models/#custom-root-types
+
+    from pydantic import BaseModel
+
+    class Records(BaseModel):  # type:ignore
+        """A list of records."""
+
+        class Config:
+            """Configuration for the records."""
+
+            arbitrary_types_allowed = True
+
+        __root__: List[Record]
+
+        def __iter__(self) -> Iterable[Record]:
+            """Iterate over records."""
+            return cast(Iterable[Record], iter(self.__root__))
+
+else:
+    # An explanation of RootModels in Pydantic V2 can be found on
+    # https://docs.pydantic.dev/latest/concepts/models/#rootmodel-and-custom-root-types
+
+    from pydantic import RootModel
+
+    class Records(RootModel[List[Record]]):  # type:ignore
+        """A list of records."""
+
+        def __iter__(self) -> Iterable[Record]:
+            """Iterate over records."""
+            return cast(Iterable[Record], iter(self.root))
+
+
 class DuplicateSummary(NamedTuple):
     """A triple representing two records that are duplicated, either based on a CURIE or URI prefix."""
 
@@ -548,7 +578,8 @@ def add_record(self, record: Record, case_sensitive: bool = True, merge: bool =
         """Append a record to the converter."""
         matched = self._match_record(record, case_sensitive=case_sensitive)
         if len(matched) > 1:
-            raise ValueError(f"new record has duplicates: {matched}")
+            msg = "".join(f"\n  {m} -> {v}" for m, v in matched.items())
+            raise ValueError(f"new record has duplicates:{msg}")
         if len(matched) == 1:
             if not merge:
                 raise ValueError(f"new record already exists and merge=False: {matched}")

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -21,6 +21,7 @@
     ExpansionError,
     PrefixStandardizationError,
     Record,
+    Records,
     Reference,
     ReferenceTuple,
     URIStandardizationError,
@@ -41,6 +42,16 @@
 GO_URI_PREFIX = "http://purl.obolibrary.org/obo/GO_"
 
 
+class TestStruct(unittest.TestCase):
+    """Test the data structures."""
+
+    def test_records(self):
+        """Test a list of records."""
+        records = Records.parse_obj([{"prefix": "chebi", "uri_prefix": CHEBI_URI_PREFIX}])
+        converter = Converter(records=records)
+        self.assertEqual({"chebi"}, converter.get_prefixes())
+
+
 class TestAddRecord(unittest.TestCase):
     """Test adding records."""