dataclass instantiation tests fixed

Cellular-Semantics · Jul 31, 2024 · e73b2d9 · e73b2d9
1 parent 918a2a4
commit e73b2d9
Show file tree

Hide file tree

Showing 13 changed files with 2,201 additions and 116 deletions.
diff --git a/project.Makefile b/project.Makefile
@@ -13,9 +13,10 @@ merged_schemas:
 	gen-linkml $(SCHEMA_FOLDER)/CAP/CAP_schema.yaml --output $(BUILD_FOLDER)/CAP_schema.yaml --mergeimports --format yaml --no-materialize-attributes
 
 classes: merged_schemas
-	gen-python $(BUILD_FOLDER)/general_schema.yaml --no-slots > $(PYMODEL)/cell_annotation_schema.py
-	gen-python $(BUILD_FOLDER)/BICAN_schema.yaml --no-slots > $(PYMODEL)/bican/cell_annotation_schema.py
-	gen-python $(BUILD_FOLDER)/CAP_schema.yaml --no-slots > $(PYMODEL)/cap/cell_annotation_schema.py
+# Data classes are produced by the project's own generator script instead of plain gen-python;
+# the script's module docstring describes the normalisation problem this works around.
+	poetry run python src/cell_annotation_schema/generator/dataclassgen.py
 
+# Aggregate target: depends on the schema merge and the data class generation, then prints a completion message.
 build: merged_schemas classes
 	echo "Release products generated."

diff --git a/src/cell_annotation_schema/datamodel/bican/cell_annotation_schema.py b/src/cell_annotation_schema/datamodel/bican/cell_annotation_schema.py
diff --git a/src/cell_annotation_schema/datamodel/cap/cell_annotation_schema.py b/src/cell_annotation_schema/datamodel/cap/cell_annotation_schema.py
diff --git a/src/cell_annotation_schema/datamodel/cell_annotation_schema.py b/src/cell_annotation_schema/datamodel/cell_annotation_schema.py
@@ -1,5 +1,5 @@
-# Auto generated from general_schema.yaml by pythongen.py version: 0.0.1
-# Generation date: 2024-07-23T16:17:25
+# Auto generated from None by pythongen.py version: 0.0.1
+# Generation date: 2024-07-31T16:21:54
 # Schema: General_Cell_Annotation_Open_Standard
 #
 # id: https://cellular-semantics.sanger.ac.uk/ontology/CAS
@@ -327,4 +327,116 @@ class CellTypeEnum(EnumDefinitionImpl):
     )
 
 # Slots
+class slots:
+    # Intentionally empty namespace: the Slot descriptors are attached as class attributes
+    # by the assignments that follow this definition.
+    pass
 
+# Each statement below attaches one Slot descriptor to the `slots` namespace class, recording the
+# slot's URI, name, CURIE, model URI and Python range type. No domain is set on any of them.
+slots.datestamp = Slot(uri=CELL_ANNOTATION_SCHEMA.datestamp, name="datestamp", curie=CELL_ANNOTATION_SCHEMA.curie('datestamp'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.datestamp, domain=None, range=str)
+
+slots.reviewer = Slot(uri=CELL_ANNOTATION_SCHEMA.reviewer, name="reviewer", curie=CELL_ANNOTATION_SCHEMA.curie('reviewer'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.reviewer, domain=None, range=Optional[str])
+
+slots.review = Slot(uri=CELL_ANNOTATION_SCHEMA.review, name="review", curie=CELL_ANNOTATION_SCHEMA.curie('review'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.review, domain=None, range=Optional[Union[str, "ReviewOptions"]])
+
+slots.explanation = Slot(uri=IAO['0000115'], name="explanation", curie=IAO.curie('0000115'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.explanation, domain=None, range=Optional[str])
+
+slots.name = Slot(uri=RDFS.label, name="name", curie=RDFS.curie('label'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.name, domain=None, range=str)
+
+slots.description = Slot(uri=IAO['0000115'], name="description", curie=IAO.curie('0000115'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.description, domain=None, range=Optional[str])
+
+slots.annotation_method = Slot(uri=CELL_ANNOTATION_SCHEMA.annotation_method, name="annotation_method", curie=CELL_ANNOTATION_SCHEMA.curie('annotation_method'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.annotation_method, domain=None, range=Optional[Union[str, "AnnotationMethodOptions"]])
+
+slots.automated_annotation = Slot(uri=CELL_ANNOTATION_SCHEMA.automated_annotation, name="automated_annotation", curie=CELL_ANNOTATION_SCHEMA.curie('automated_annotation'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.automated_annotation, domain=None, range=Optional[Union[dict, AutomatedAnnotation]])
+
+slots.algorithm_name = Slot(uri=CELL_ANNOTATION_SCHEMA.algorithm_name, name="algorithm_name", curie=CELL_ANNOTATION_SCHEMA.curie('algorithm_name'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.algorithm_name, domain=None, range=str)
+
+slots.algorithm_version = Slot(uri=CELL_ANNOTATION_SCHEMA.algorithm_version, name="algorithm_version", curie=CELL_ANNOTATION_SCHEMA.curie('algorithm_version'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.algorithm_version, domain=None, range=str)
+
+slots.algorithm_repo_url = Slot(uri=CELL_ANNOTATION_SCHEMA.algorithm_repo_url, name="algorithm_repo_url", curie=CELL_ANNOTATION_SCHEMA.curie('algorithm_repo_url'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.algorithm_repo_url, domain=None, range=str)
+
+slots.reference_location = Slot(uri=CELL_ANNOTATION_SCHEMA.reference_location, name="reference_location", curie=CELL_ANNOTATION_SCHEMA.curie('reference_location'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.reference_location, domain=None, range=Optional[str])
+
+slots.labelset = Slot(uri=CAS.has_labelset, name="labelset", curie=CAS.curie('has_labelset'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.labelset, domain=None, range=str)
+
+slots.cell_label = Slot(uri=RDFS.label, name="cell_label", curie=RDFS.curie('label'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cell_label, domain=None, range=str)
+
+# NOTE(review): SKOS spells this term `prefLabel`; the lower-case `preflabel` used here presumably comes
+# from the source schema - confirm and fix it there, then regenerate (this module is auto generated).
+slots.cell_fullname = Slot(uri=SKOS.preflabel, name="cell_fullname", curie=SKOS.curie('preflabel'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cell_fullname, domain=None, range=Optional[str])
+
+slots.cell_ontology_term_id = Slot(uri=RO['0002473'], name="cell_ontology_term_id", curie=RO.curie('0002473'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cell_ontology_term_id, domain=None, range=Optional[Union[str, "CellTypeEnum"]])
+
+slots.cell_ontology_term = Slot(uri=CELL_ANNOTATION_SCHEMA.cell_ontology_term, name="cell_ontology_term", curie=CELL_ANNOTATION_SCHEMA.curie('cell_ontology_term'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cell_ontology_term, domain=None, range=Optional[str])
+
+slots.cell_ids = Slot(uri=CAS.has_cellid, name="cell_ids", curie=CAS.curie('has_cellid'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cell_ids, domain=None, range=Optional[Union[str, List[str]]])
+
+slots.rationale = Slot(uri=CELL_ANNOTATION_SCHEMA.rationale, name="rationale", curie=CELL_ANNOTATION_SCHEMA.curie('rationale'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.rationale, domain=None, range=Optional[str])
+
+slots.rationale_dois = Slot(uri=CELL_ANNOTATION_SCHEMA.rationale_dois, name="rationale_dois", curie=CELL_ANNOTATION_SCHEMA.curie('rationale_dois'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.rationale_dois, domain=None, range=Optional[Union[str, List[str]]])
+
+slots.marker_gene_evidence = Slot(uri=CELL_ANNOTATION_SCHEMA.marker_gene_evidence, name="marker_gene_evidence", curie=CELL_ANNOTATION_SCHEMA.curie('marker_gene_evidence'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.marker_gene_evidence, domain=None, range=Optional[Union[str, List[str]]])
+
+slots.synonyms = Slot(uri=CELL_ANNOTATION_SCHEMA.synonyms, name="synonyms", curie=CELL_ANNOTATION_SCHEMA.curie('synonyms'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.synonyms, domain=None, range=Optional[Union[str, List[str]]])
+
+slots.reviews = Slot(uri=CELL_ANNOTATION_SCHEMA.reviews, name="reviews", curie=CELL_ANNOTATION_SCHEMA.curie('reviews'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.reviews, domain=None, range=Optional[Union[Union[dict, Review], List[Union[dict, Review]]]])
+
+slots.author_annotation_fields = Slot(uri=CELL_ANNOTATION_SCHEMA.author_annotation_fields, name="author_annotation_fields", curie=CELL_ANNOTATION_SCHEMA.curie('author_annotation_fields'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.author_annotation_fields, domain=None, range=Optional[str])
+
+slots.matrix_file_id = Slot(uri=CELL_ANNOTATION_SCHEMA.matrix_file_id, name="matrix_file_id", curie=CELL_ANNOTATION_SCHEMA.curie('matrix_file_id'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.matrix_file_id, domain=None, range=Optional[str])
+
+slots.title = Slot(uri=CELL_ANNOTATION_SCHEMA.title, name="title", curie=CELL_ANNOTATION_SCHEMA.curie('title'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.title, domain=None, range=str)
+
+slots.cellannotation_schema_version = Slot(uri=CELL_ANNOTATION_SCHEMA.cellannotation_schema_version, name="cellannotation_schema_version", curie=CELL_ANNOTATION_SCHEMA.curie('cellannotation_schema_version'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cellannotation_schema_version, domain=None, range=Optional[str])
+
+slots.cellannotation_timestamp = Slot(uri=CELL_ANNOTATION_SCHEMA.cellannotation_timestamp, name="cellannotation_timestamp", curie=CELL_ANNOTATION_SCHEMA.curie('cellannotation_timestamp'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cellannotation_timestamp, domain=None, range=Optional[str])
+
+slots.cellannotation_version = Slot(uri=CELL_ANNOTATION_SCHEMA.cellannotation_version, name="cellannotation_version", curie=CELL_ANNOTATION_SCHEMA.curie('cellannotation_version'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cellannotation_version, domain=None, range=Optional[str])
+
+slots.cellannotation_url = Slot(uri=CELL_ANNOTATION_SCHEMA.cellannotation_url, name="cellannotation_url", curie=CELL_ANNOTATION_SCHEMA.curie('cellannotation_url'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.cellannotation_url, domain=None, range=Optional[str])
+
+slots.author_list = Slot(uri=CELL_ANNOTATION_SCHEMA.author_list, name="author_list", curie=CELL_ANNOTATION_SCHEMA.curie('author_list'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.author_list, domain=None, range=Optional[str])
+
+slots.author_name = Slot(uri=CELL_ANNOTATION_SCHEMA.author_name, name="author_name", curie=CELL_ANNOTATION_SCHEMA.curie('author_name'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.author_name, domain=None, range=str)
+
+slots.author_contact = Slot(uri=CELL_ANNOTATION_SCHEMA.author_contact, name="author_contact", curie=CELL_ANNOTATION_SCHEMA.curie('author_contact'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.author_contact, domain=None, range=Optional[str])
+
+slots.orcid = Slot(uri=CELL_ANNOTATION_SCHEMA.orcid, name="orcid", curie=CELL_ANNOTATION_SCHEMA.curie('orcid'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.orcid, domain=None, range=Optional[str])
+
+slots.labelsets = Slot(uri=CELL_ANNOTATION_SCHEMA.labelsets, name="labelsets", curie=CELL_ANNOTATION_SCHEMA.curie('labelsets'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.labelsets, domain=None, range=Union[Union[dict, Labelset], List[Union[dict, Labelset]]])
+
+slots.annotations = Slot(uri=CELL_ANNOTATION_SCHEMA.annotations, name="annotations", curie=CELL_ANNOTATION_SCHEMA.curie('annotations'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.annotations, domain=None, range=Union[Union[dict, Annotation], List[Union[dict, Annotation]]])
+
+slots.id = Slot(uri=CELL_ANNOTATION_SCHEMA.id, name="id", curie=CELL_ANNOTATION_SCHEMA.curie('id'),
+                   model_uri=CELL_ANNOTATION_SCHEMA.id, domain=None, range=URIRef)
diff --git a/src/cell_annotation_schema/file_utils.py b/src/cell_annotation_schema/file_utils.py
@@ -2,6 +2,7 @@
 import json
 import warnings
 
+from typing import Union
 from urllib.request import urlopen
 from pathlib import Path
 from linkml_runtime.linkml_model import SchemaDefinition
@@ -36,7 +37,7 @@ def get_json_from_file(filename):
         warnings.warn("Failed to parse JSON in " + filename + ": " + str(exc))
 
 
-def read_schema(schema) -> SchemaDefinition:
+def read_schema(schema: Union[str, dict]) -> SchemaDefinition:
     """
     Reads the given LinkML schema.
     Parameters:

diff --git a/src/cell_annotation_schema/generator/__init__.py b/src/cell_annotation_schema/generator/__init__.py
diff --git a/src/cell_annotation_schema/generator/dataclassgen.py b/src/cell_annotation_schema/generator/dataclassgen.py
@@ -0,0 +1,63 @@
+"""
+The LinkML data class generator (gen-python) generates problematic code because of the subclass and id mechanism we use.
+
+Details of the issue:
+Taxonomy-annotations doesn't have an id field, so it uses the first slot (which is labelset) as the id slot.
+BicanTaxonomy-annotations (with range Bican Annotations) has an id field, so it uses the cell_set_accession field as the id slot and works fine.
+
+When we use the gen-python command on the schema, BicanTaxonomy dataclass post_init normalisation code is generated correctly as follows:
+```
+_normalize_inlined_as_dict(slot_name="annotations", slot_type=BicanAnnotation, key_name="cell_set_accession", keyed=True)
+```
+
+But the Taxonomy dataclass post_init normalisation code is generated as follows (as expected, but not correctly):
+```
+_normalize_inlined_as_dict(slot_name="annotations", slot_type=Annotation, key_name="labelset", keyed=False)
+```
+
+Since the BicanTaxonomy post_init calls the super post_init, it also tries to normalise the Annotation dataclass
+with the labelset field as the id field, which is not correct.
+
+Solution applied by this code:
+
+We read the schema into memory, remove the `annotations` slot from the Taxonomy class and add it to BicanTaxonomy
+(as if it were not inherited but its own slot), so that the problematic normalisation code is not generated.
+"""
+import os
+
+from pathlib import Path
+from typing import Union
+from linkml import generators
+
+from linkml_runtime.linkml_model import SchemaDefinition
+from linkml_runtime.loaders import yaml_loader
+
+from cell_annotation_schema.file_utils import read_schema
+from cell_annotation_schema.ontology.schema import decorate_linkml_schema
+
+SOURCE_DIR = Path(__file__).parent.parent
+
+
+def generate_data_class(cas_schema: Union[str, dict], class_path: str) -> str:
+    """
+    Generate the Python data classes of a CAS schema and write them to a file.
+
+    The schema is read with `read_schema`, passed through `decorate_linkml_schema`,
+    reloaded as a SchemaDefinition and then serialised by the LinkML PythonGenerator.
+
+    Args:
+        cas_schema: CAS schema reference accepted by `read_schema` (the script entry point
+            passes "base", "bican" and "cap"), or a dict representing the schema.
+        class_path: Output class path; the file is created or overwritten.
+    Returns:
+        str: Data class string (the same text that is written to class_path).
+    """
+    schema_def = read_schema(cas_schema)
+    schema_dict = decorate_linkml_schema(schema_def)
+    schema_def = yaml_loader.load(schema_dict, target_class=SchemaDefinition)
+    gen = generators.PythonGenerator(schema_def)
+    output = gen.serialize()
+    # Explicit encoding so the generated module does not depend on the platform's default encoding.
+    with open(class_path, "w", encoding="utf-8") as class_file:
+        class_file.write(output)
+    # The docstring has always promised the generated source; return it instead of None.
+    return output
+
+
+if __name__ == "__main__":
+    # Regenerate the data class module of every core schema flavour, in the same order as before.
+    for core_schema, module_path in (
+        ("base", "datamodel/cell_annotation_schema.py"),
+        ("bican", "datamodel/bican/cell_annotation_schema.py"),
+        ("cap", "datamodel/cap/cell_annotation_schema.py"),
+    ):
+        generate_data_class(core_schema, os.path.join(SOURCE_DIR, module_path))
diff --git a/src/cell_annotation_schema/ontology/data.py b/src/cell_annotation_schema/ontology/data.py
@@ -62,7 +62,8 @@ def dump_to_rdf(
         validate_data(schema_def, instance)
     instance = serialise_author_annotation(instance)
 
-    py_inst = get_py_instance(instance, schema_name, schema_def)
+    root_class = get_root_class(schema_name)
+    py_inst = get_py_instance(instance, None, schema_def, root_class)
 
     prefixes = DEFAULT_PREFIXES.copy()
     prefixes["_base"] = ontology_iri
@@ -85,45 +86,52 @@ def dump_to_rdf(
     return g
 
 
-def get_py_instance(instance_dict, schema_name, schema_def):
+def get_py_instance(instance_dict, schema_name, schema_def, root_class=None):
     """
     Returns a Python instance of the schema class from the given data instance.
     Args:
         instance_dict: The data instance dictionary.
         schema_name: The name of the schema to be used for RDF generation.
         schema_def: The schema definition object.
+        root_class: Name of the root class to instantiate when this is not a core (base, cap or bican)
+            schema. Falls back to CAS_ROOT_CLASS when None.
     Returns:
         The Python instance of the schema class.
     """
-    root_class = CAS_ROOT_CLASS
     if isinstance(schema_name, str):
         if schema_name.lower() == "base":
-            root_class = CAS_ROOT_CLASS
-            # return from_dict(data_class=Taxonomy, data=instance_dict)
+            return Taxonomy(**instance_dict)
         elif schema_name.lower() == "bican":
-            root_class = "BicanTaxonomy"
-            # return from_dict(data_class=BicanTaxonomy, data=instance_dict)
+            return BicanTaxonomy(**instance_dict)
         elif schema_name.lower() == "cap":
-            root_class = "CapTaxonomy"
-            # return from_dict(data_class=CapTaxonomy, data=instance_dict)
+            return CapTaxonomy(**instance_dict)
 
     # unknown schema, dynamically generate the python module and instantiate
+    if root_class is None:
+        # keep the previous default root class so that getattr below always receives a class name
+        root_class = CAS_ROOT_CLASS
     gen = generators.PythonGenerator(schema_def)
     output = gen.serialize()
     python_module = compile_python(output)
     py_target_class = getattr(python_module, root_class)
-
-    # try:
     py_inst = py_target_class(**instance_dict)
-    # except Exception as e:
-    #     print(f"Could not instantiate {py_target_class} from the data; exception: {e}")
-    #     import traceback
-    #     print(traceback.format_exc())
-    #     return None
 
     return py_inst
 
 
+def get_root_class(schema_name):
+    """
+    Returns the root class name of a core schema based on the schema name.
+    Args:
+        schema_name: The name of the schema ("base", "bican" or "cap", case-insensitive).
+    Returns:
+        The root class name of the schema, or None when schema_name is not a string
+        or does not name a core schema.
+    """
+    # Non-string values (e.g. None) cannot name a core schema; get_py_instance guards the same way.
+    if not isinstance(schema_name, str):
+        return None
+    core_root_classes = {
+        "base": CAS_ROOT_CLASS,
+        "bican": "BicanTaxonomy",
+        "cap": "CapTaxonomy",
+    }
+    return core_root_classes.get(schema_name.lower())
+
+
 def add_cl_existential_restrictions(g: rdflib.Graph):
     """
     Adds existential restrictions to the CL class in the given RDF graph.