Skip to content

Commit

Permalink
dataclass instantiation tests fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
hkir-dev committed Jul 31, 2024
1 parent 918a2a4 commit e73b2d9
Show file tree
Hide file tree
Showing 13 changed files with 2,201 additions and 116 deletions.
7 changes: 4 additions & 3 deletions project.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ merged_schemas:
gen-linkml $(SCHEMA_FOLDER)/CAP/CAP_schema.yaml --output $(BUILD_FOLDER)/CAP_schema.yaml --mergeimports --format yaml --no-materialize-attributes

classes: merged_schemas
gen-python $(BUILD_FOLDER)/general_schema.yaml --no-slots > $(PYMODEL)/cell_annotation_schema.py
gen-python $(BUILD_FOLDER)/BICAN_schema.yaml --no-slots > $(PYMODEL)/bican/cell_annotation_schema.py
gen-python $(BUILD_FOLDER)/CAP_schema.yaml --no-slots > $(PYMODEL)/cap/cell_annotation_schema.py
# gen-python $(BUILD_FOLDER)/general_schema.yaml --no-slots > $(PYMODEL)/cell_annotation_schema.py
# gen-python $(BUILD_FOLDER)/BICAN_schema.yaml --no-slots > $(PYMODEL)/bican/cell_annotation_schema.py
# gen-python $(BUILD_FOLDER)/CAP_schema.yaml --no-slots > $(PYMODEL)/cap/cell_annotation_schema.py
poetry run python src/cell_annotation_schema/generator/dataclassgen.py

build: merged_schemas classes
echo "Release products generated."
Expand Down
176 changes: 164 additions & 12 deletions src/cell_annotation_schema/datamodel/bican/cell_annotation_schema.py

Large diffs are not rendered by default.

175 changes: 168 additions & 7 deletions src/cell_annotation_schema/datamodel/cap/cell_annotation_schema.py

Large diffs are not rendered by default.

116 changes: 114 additions & 2 deletions src/cell_annotation_schema/datamodel/cell_annotation_schema.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Auto generated from general_schema.yaml by pythongen.py version: 0.0.1
# Generation date: 2024-07-23T16:17:25
# Auto generated from None by pythongen.py version: 0.0.1
# Generation date: 2024-07-31T16:21:54
# Schema: General_Cell_Annotation_Open_Standard
#
# id: https://cellular-semantics.sanger.ac.uk/ontology/CAS
Expand Down Expand Up @@ -327,4 +327,116 @@ class CellTypeEnum(EnumDefinitionImpl):
)

# Slots
class slots:
    # Intentionally empty namespace class: every slot of the schema is attached
    # below as a class attribute (``slots.<slot_name>``).
    pass

# Each attribute is a ``Slot`` built from the slot's URI, name, CURIE, model URI,
# domain (``None`` for all slots here) and the Python type expression of its range.
slots.datestamp = Slot(uri=CELL_ANNOTATION_SCHEMA.datestamp, name="datestamp", curie=CELL_ANNOTATION_SCHEMA.curie('datestamp'),
                   model_uri=CELL_ANNOTATION_SCHEMA.datestamp, domain=None, range=str)

slots.reviewer = Slot(uri=CELL_ANNOTATION_SCHEMA.reviewer, name="reviewer", curie=CELL_ANNOTATION_SCHEMA.curie('reviewer'),
                   model_uri=CELL_ANNOTATION_SCHEMA.reviewer, domain=None, range=Optional[str])

slots.review = Slot(uri=CELL_ANNOTATION_SCHEMA.review, name="review", curie=CELL_ANNOTATION_SCHEMA.curie('review'),
                   model_uri=CELL_ANNOTATION_SCHEMA.review, domain=None, range=Optional[Union[str, "ReviewOptions"]])

slots.explanation = Slot(uri=IAO['0000115'], name="explanation", curie=IAO.curie('0000115'),
                   model_uri=CELL_ANNOTATION_SCHEMA.explanation, domain=None, range=Optional[str])

slots.name = Slot(uri=RDFS.label, name="name", curie=RDFS.curie('label'),
                   model_uri=CELL_ANNOTATION_SCHEMA.name, domain=None, range=str)

slots.description = Slot(uri=IAO['0000115'], name="description", curie=IAO.curie('0000115'),
                   model_uri=CELL_ANNOTATION_SCHEMA.description, domain=None, range=Optional[str])

slots.annotation_method = Slot(uri=CELL_ANNOTATION_SCHEMA.annotation_method, name="annotation_method", curie=CELL_ANNOTATION_SCHEMA.curie('annotation_method'),
                   model_uri=CELL_ANNOTATION_SCHEMA.annotation_method, domain=None, range=Optional[Union[str, "AnnotationMethodOptions"]])

slots.automated_annotation = Slot(uri=CELL_ANNOTATION_SCHEMA.automated_annotation, name="automated_annotation", curie=CELL_ANNOTATION_SCHEMA.curie('automated_annotation'),
                   model_uri=CELL_ANNOTATION_SCHEMA.automated_annotation, domain=None, range=Optional[Union[dict, AutomatedAnnotation]])

slots.algorithm_name = Slot(uri=CELL_ANNOTATION_SCHEMA.algorithm_name, name="algorithm_name", curie=CELL_ANNOTATION_SCHEMA.curie('algorithm_name'),
                   model_uri=CELL_ANNOTATION_SCHEMA.algorithm_name, domain=None, range=str)

slots.algorithm_version = Slot(uri=CELL_ANNOTATION_SCHEMA.algorithm_version, name="algorithm_version", curie=CELL_ANNOTATION_SCHEMA.curie('algorithm_version'),
                   model_uri=CELL_ANNOTATION_SCHEMA.algorithm_version, domain=None, range=str)

slots.algorithm_repo_url = Slot(uri=CELL_ANNOTATION_SCHEMA.algorithm_repo_url, name="algorithm_repo_url", curie=CELL_ANNOTATION_SCHEMA.curie('algorithm_repo_url'),
                   model_uri=CELL_ANNOTATION_SCHEMA.algorithm_repo_url, domain=None, range=str)

slots.reference_location = Slot(uri=CELL_ANNOTATION_SCHEMA.reference_location, name="reference_location", curie=CELL_ANNOTATION_SCHEMA.curie('reference_location'),
                   model_uri=CELL_ANNOTATION_SCHEMA.reference_location, domain=None, range=Optional[str])

slots.labelset = Slot(uri=CAS.has_labelset, name="labelset", curie=CAS.curie('has_labelset'),
                   model_uri=CELL_ANNOTATION_SCHEMA.labelset, domain=None, range=str)

slots.cell_label = Slot(uri=RDFS.label, name="cell_label", curie=RDFS.curie('label'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cell_label, domain=None, range=str)

slots.cell_fullname = Slot(uri=SKOS.preflabel, name="cell_fullname", curie=SKOS.curie('preflabel'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cell_fullname, domain=None, range=Optional[str])

slots.cell_ontology_term_id = Slot(uri=RO['0002473'], name="cell_ontology_term_id", curie=RO.curie('0002473'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cell_ontology_term_id, domain=None, range=Optional[Union[str, "CellTypeEnum"]])

slots.cell_ontology_term = Slot(uri=CELL_ANNOTATION_SCHEMA.cell_ontology_term, name="cell_ontology_term", curie=CELL_ANNOTATION_SCHEMA.curie('cell_ontology_term'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cell_ontology_term, domain=None, range=Optional[str])

slots.cell_ids = Slot(uri=CAS.has_cellid, name="cell_ids", curie=CAS.curie('has_cellid'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cell_ids, domain=None, range=Optional[Union[str, List[str]]])

slots.rationale = Slot(uri=CELL_ANNOTATION_SCHEMA.rationale, name="rationale", curie=CELL_ANNOTATION_SCHEMA.curie('rationale'),
                   model_uri=CELL_ANNOTATION_SCHEMA.rationale, domain=None, range=Optional[str])

slots.rationale_dois = Slot(uri=CELL_ANNOTATION_SCHEMA.rationale_dois, name="rationale_dois", curie=CELL_ANNOTATION_SCHEMA.curie('rationale_dois'),
                   model_uri=CELL_ANNOTATION_SCHEMA.rationale_dois, domain=None, range=Optional[Union[str, List[str]]])

slots.marker_gene_evidence = Slot(uri=CELL_ANNOTATION_SCHEMA.marker_gene_evidence, name="marker_gene_evidence", curie=CELL_ANNOTATION_SCHEMA.curie('marker_gene_evidence'),
                   model_uri=CELL_ANNOTATION_SCHEMA.marker_gene_evidence, domain=None, range=Optional[Union[str, List[str]]])

slots.synonyms = Slot(uri=CELL_ANNOTATION_SCHEMA.synonyms, name="synonyms", curie=CELL_ANNOTATION_SCHEMA.curie('synonyms'),
                   model_uri=CELL_ANNOTATION_SCHEMA.synonyms, domain=None, range=Optional[Union[str, List[str]]])

slots.reviews = Slot(uri=CELL_ANNOTATION_SCHEMA.reviews, name="reviews", curie=CELL_ANNOTATION_SCHEMA.curie('reviews'),
                   model_uri=CELL_ANNOTATION_SCHEMA.reviews, domain=None, range=Optional[Union[Union[dict, Review], List[Union[dict, Review]]]])

slots.author_annotation_fields = Slot(uri=CELL_ANNOTATION_SCHEMA.author_annotation_fields, name="author_annotation_fields", curie=CELL_ANNOTATION_SCHEMA.curie('author_annotation_fields'),
                   model_uri=CELL_ANNOTATION_SCHEMA.author_annotation_fields, domain=None, range=Optional[str])

slots.matrix_file_id = Slot(uri=CELL_ANNOTATION_SCHEMA.matrix_file_id, name="matrix_file_id", curie=CELL_ANNOTATION_SCHEMA.curie('matrix_file_id'),
                   model_uri=CELL_ANNOTATION_SCHEMA.matrix_file_id, domain=None, range=Optional[str])

slots.title = Slot(uri=CELL_ANNOTATION_SCHEMA.title, name="title", curie=CELL_ANNOTATION_SCHEMA.curie('title'),
                   model_uri=CELL_ANNOTATION_SCHEMA.title, domain=None, range=str)

slots.cellannotation_schema_version = Slot(uri=CELL_ANNOTATION_SCHEMA.cellannotation_schema_version, name="cellannotation_schema_version", curie=CELL_ANNOTATION_SCHEMA.curie('cellannotation_schema_version'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cellannotation_schema_version, domain=None, range=Optional[str])

slots.cellannotation_timestamp = Slot(uri=CELL_ANNOTATION_SCHEMA.cellannotation_timestamp, name="cellannotation_timestamp", curie=CELL_ANNOTATION_SCHEMA.curie('cellannotation_timestamp'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cellannotation_timestamp, domain=None, range=Optional[str])

slots.cellannotation_version = Slot(uri=CELL_ANNOTATION_SCHEMA.cellannotation_version, name="cellannotation_version", curie=CELL_ANNOTATION_SCHEMA.curie('cellannotation_version'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cellannotation_version, domain=None, range=Optional[str])

slots.cellannotation_url = Slot(uri=CELL_ANNOTATION_SCHEMA.cellannotation_url, name="cellannotation_url", curie=CELL_ANNOTATION_SCHEMA.curie('cellannotation_url'),
                   model_uri=CELL_ANNOTATION_SCHEMA.cellannotation_url, domain=None, range=Optional[str])

slots.author_list = Slot(uri=CELL_ANNOTATION_SCHEMA.author_list, name="author_list", curie=CELL_ANNOTATION_SCHEMA.curie('author_list'),
                   model_uri=CELL_ANNOTATION_SCHEMA.author_list, domain=None, range=Optional[str])

slots.author_name = Slot(uri=CELL_ANNOTATION_SCHEMA.author_name, name="author_name", curie=CELL_ANNOTATION_SCHEMA.curie('author_name'),
                   model_uri=CELL_ANNOTATION_SCHEMA.author_name, domain=None, range=str)

slots.author_contact = Slot(uri=CELL_ANNOTATION_SCHEMA.author_contact, name="author_contact", curie=CELL_ANNOTATION_SCHEMA.curie('author_contact'),
                   model_uri=CELL_ANNOTATION_SCHEMA.author_contact, domain=None, range=Optional[str])

slots.orcid = Slot(uri=CELL_ANNOTATION_SCHEMA.orcid, name="orcid", curie=CELL_ANNOTATION_SCHEMA.curie('orcid'),
                   model_uri=CELL_ANNOTATION_SCHEMA.orcid, domain=None, range=Optional[str])

slots.labelsets = Slot(uri=CELL_ANNOTATION_SCHEMA.labelsets, name="labelsets", curie=CELL_ANNOTATION_SCHEMA.curie('labelsets'),
                   model_uri=CELL_ANNOTATION_SCHEMA.labelsets, domain=None, range=Union[Union[dict, Labelset], List[Union[dict, Labelset]]])

slots.annotations = Slot(uri=CELL_ANNOTATION_SCHEMA.annotations, name="annotations", curie=CELL_ANNOTATION_SCHEMA.curie('annotations'),
                   model_uri=CELL_ANNOTATION_SCHEMA.annotations, domain=None, range=Union[Union[dict, Annotation], List[Union[dict, Annotation]]])

slots.id = Slot(uri=CELL_ANNOTATION_SCHEMA.id, name="id", curie=CELL_ANNOTATION_SCHEMA.curie('id'),
                   model_uri=CELL_ANNOTATION_SCHEMA.id, domain=None, range=URIRef)
3 changes: 2 additions & 1 deletion src/cell_annotation_schema/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import warnings

from typing import Union
from urllib.request import urlopen
from pathlib import Path
from linkml_runtime.linkml_model import SchemaDefinition
Expand Down Expand Up @@ -36,7 +37,7 @@ def get_json_from_file(filename):
warnings.warn("Failed to parse JSON in " + filename + ": " + str(exc))


def read_schema(schema) -> SchemaDefinition:
def read_schema(schema: Union[str, dict]) -> SchemaDefinition:
"""
Reads the given LinkML schema.
Parameters:
Expand Down
Empty file.
63 changes: 63 additions & 0 deletions src/cell_annotation_schema/generator/dataclassgen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""
The LinkML data class generator (gen-python) generates problematic code because of the subclass and id mechanism we use.
Details of the issue:
Taxonomy-annotations doesn't have an id field, so it uses the first slot as the id slot (which is labelset).
BicanTaxonomy-annotations (with range Bican Annotations) has an id field, so it uses the cell_set_accession field as the id slot and works fine.
When we use the gen-python command on the schema, BicanTaxonomy dataclass post_init normalisation code is generated correctly as follows:
```
_normalize_inlined_as_dict(slot_name="annotations", slot_type=BicanAnnotation, key_name="cell_set_accession", keyed=True)
```
But the Taxonomy dataclass post_init normalisation code is generated as follows (expectedly but not correctly):
```
_normalize_inlined_as_dict(slot_name="annotations", slot_type=Annotation, key_name="labelset", keyed=False)
```
And since the BicanTaxonomy post_init calls the super post_init, it tries to normalise the Annotation dataclass
with the labelset field as the id field, which is not correct.
The solution this code applies:
We read the schema into memory, remove the `annotations` slot from the Taxonomy class and add it to BicanTaxonomy
(as if it were not inherited but its own slot), so that the problematic normalisation code is not generated.
"""
import os

from pathlib import Path
from typing import Union
from linkml import generators

from linkml_runtime.linkml_model import SchemaDefinition
from linkml_runtime.loaders import yaml_loader

from cell_annotation_schema.file_utils import read_schema
from cell_annotation_schema.ontology.schema import decorate_linkml_schema

SOURCE_DIR = Path(__file__).parent.parent


def generate_data_class(cas_schema: Union[str, dict], class_path: str) -> str:
    """
    Generate the Python data classes for a CAS schema and write them to a file.

    The schema is read with ``read_schema``, passed through
    ``decorate_linkml_schema``, reloaded as a ``SchemaDefinition`` and serialised
    with the LinkML ``PythonGenerator``.

    Args:
        cas_schema: CAS schema path or dict representing it.
        class_path: Output class path. An existing file is overwritten.
    Returns:
        str: The generated data class source that was written to ``class_path``.
    """
    schema_def = read_schema(cas_schema)
    # The decorated schema comes back in a serialisable form, so it has to be
    # loaded again as a SchemaDefinition before it can be handed to the generator.
    schema_dict = decorate_linkml_schema(schema_def)
    schema_def = yaml_loader.load(schema_dict, target_class=SchemaDefinition)
    output = generators.PythonGenerator(schema_def).serialize()
    # Explicit encoding keeps the generated module identical across platforms
    # (the default would otherwise follow the local locale).
    with open(class_path, "w", encoding="utf-8") as class_file:
        class_file.write(output)
    return output


if __name__ == "__main__":
    # Regenerate the data class module of every core schema, in a fixed order:
    # (schema name, output path relative to the package source directory).
    core_targets = (
        ("base", "datamodel/cell_annotation_schema.py"),
        ("bican", "datamodel/bican/cell_annotation_schema.py"),
        ("cap", "datamodel/cap/cell_annotation_schema.py"),
    )
    for core_schema, relative_target in core_targets:
        generate_data_class(core_schema, os.path.join(SOURCE_DIR, relative_target))
40 changes: 24 additions & 16 deletions src/cell_annotation_schema/ontology/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def dump_to_rdf(
validate_data(schema_def, instance)
instance = serialise_author_annotation(instance)

py_inst = get_py_instance(instance, schema_name, schema_def)
root_class = get_root_class(schema_name)
py_inst = get_py_instance(instance, None, schema_def, root_class)

prefixes = DEFAULT_PREFIXES.copy()
prefixes["_base"] = ontology_iri
Expand All @@ -85,45 +86,52 @@ def dump_to_rdf(
return g


def get_py_instance(instance_dict, schema_name, schema_def, root_class=None):
    """
    Returns a Python instance of the schema class from the given data instance.

    Args:
        instance_dict: The data instance dictionary; its items are passed as
            keyword arguments to the selected class.
        schema_name: The name of the schema to be used for RDF generation.
            "base", "bican" and "cap" (case-insensitive) select the pre-generated
            Taxonomy, BicanTaxonomy and CapTaxonomy classes. Any other value
            (including None) makes the classes be generated from ``schema_def``.
        schema_def: The schema definition object. Only used when the classes are
            generated dynamically.
        root_class: Name of the class to instantiate from the dynamically
            generated module. Falls back to CAS_ROOT_CLASS when not given.
    Returns:
        The Python instance of the schema class.
    """
    if isinstance(schema_name, str):
        core_schema = schema_name.lower()
        if core_schema == "base":
            return Taxonomy(**instance_dict)
        if core_schema == "bican":
            return BicanTaxonomy(**instance_dict)
        if core_schema == "cap":
            return CapTaxonomy(**instance_dict)

    # unknown schema, dynamically generate the python module and instantiate
    if root_class is None:
        # getattr() needs a string attribute name; without this fallback the
        # default root_class=None would fail with a TypeError.
        root_class = CAS_ROOT_CLASS
    gen = generators.PythonGenerator(schema_def)
    python_module = compile_python(gen.serialize())
    py_target_class = getattr(python_module, root_class)
    return py_target_class(**instance_dict)


def get_root_class(schema_name):
    """
    Returns the root class of the schema based on the schema name.

    Args:
        schema_name: The name of the schema. Matching is case-insensitive.
    Returns:
        The root class name of the schema: CAS_ROOT_CLASS for "base",
        "BicanTaxonomy" for "bican" and "CapTaxonomy" for "cap". None is returned
        for any other name and for values that are not strings.
    """
    # A non-string value cannot name a core schema; report it as "unknown"
    # instead of failing on the missing .lower() method.
    if not isinstance(schema_name, str):
        return None

    core_schema = schema_name.lower()
    if core_schema == "base":
        return CAS_ROOT_CLASS
    if core_schema == "bican":
        return "BicanTaxonomy"
    if core_schema == "cap":
        return "CapTaxonomy"
    return None


def add_cl_existential_restrictions(g: rdflib.Graph):
"""
Adds existential restrictions to the CL class in the given RDF graph.
Expand Down
Loading

0 comments on commit e73b2d9

Please sign in to comment.