Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validation of User Configuration #28

Merged
8 commits merged into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
"pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None),
"xarray": ("http://xarray.pydata.org/en/stable/", None),
"chemicals": ("https://chemicals.readthedocs.io/", None),
"cerberus": ("https://docs.python-cerberus.org/", None),
}


Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ def read(filename):
package_dir={"": "src"},
packages=find_packages(where="src", exclude=("tests",)),
install_requires=[
"cerberus",
"cf_xarray",
"chemicals",
"click-loguru",
"dask",
"distributed",
"dill",
"dpath",
"pendulum",
"pint-xarray",
Expand All @@ -50,6 +50,7 @@ def read(filename):
extras_require={
"dev": [
"black",
"dill",
"flake8",
"isort",
"pooch",
Expand Down
14 changes: 11 additions & 3 deletions src/pymorize/cmorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .logging import logger
from .pipeline import Pipeline
from .rule import Rule
from .validate import PIPELINES_VALIDATOR, RULES_VALIDATOR


class CMORizer:
Expand Down Expand Up @@ -155,15 +156,22 @@ def from_dict(cls, data):
pymorize_cfg=data.get("pymorize", {}),
general_cfg=data.get("general", {}),
)
if "rules" in data:
if not RULES_VALIDATOR.validate({"rules": data["rules"]}):
raise ValueError(RULES_VALIDATOR.errors)
for rule in data.get("rules", []):
rule_obj = Rule.from_dict(rule)
instance.add_rule(rule_obj)
instance._post_init_populate_rules_with_tables()
instance._post_init_create_data_request()
instance._post_init_data_request_variables()
if "pipelines" in data:
if not PIPELINES_VALIDATOR.validate({"pipelines": data["pipelines"]}):
raise ValueError(PIPELINES_VALIDATOR.errors)
for pipeline in data.get("pipelines", []):
pipeline_obj = Pipeline.from_dict(pipeline)
instance.add_pipeline(pipeline_obj)

instance._post_init_populate_rules_with_tables()
instance._post_init_create_data_request()
instance._post_init_data_request_variables()
return instance

def add_rule(self, rule):
Expand Down
3 changes: 0 additions & 3 deletions src/pymorize/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,10 @@ def dummy_load_data(data, rule_spec, cmorizer, *args, **kwargs):
"""
A dummy function for testing. Loads the xarray tutorial data
"""
allowed_input_sources = ["xr_tutorial"]
logger.info("Loading data")
input_source = rule_spec.get("input_source", "xr_tutorial")
if input_source == "xr_tutorial":
data = xr.tutorial.open_dataset("air_temperature")
else:
raise NotImplementedError(f"Only {allowed_input_sources} are supported for now")
if rule_spec.get("input_type") == "xr.DataArray":
data = getattr(data, rule_spec.get("da_name", "air"))
return data
Expand Down
111 changes: 111 additions & 0 deletions src/pymorize/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
Provides validation of user configuration files by checking against a schema.
"""

import importlib

from cerberus import Validator


class PipelineValidator(Validator):
    """
    Validator for pipeline configuration.

    Adds the custom ``is_qualname`` rule, which checks that a string names
    an attribute of an importable module.

    See Also
    --------
    * https://cerberus-sanhe.readthedocs.io/customize.html#class-based-custom-validators
    """

    def _validate_is_qualname(self, is_qualname, field, value):
        """Test if a string is a Python qualname.

        The value must look like ``"package.module.attribute"``: everything
        before the last dot is imported as a module, and the part after it
        must exist as an attribute of that module. Use a boolean value for
        the schema argument "is_qualname" in your rule definition to switch
        the check on or off.

        Keep the schema literal below as the last thing in this docstring:
        Cerberus evaluates the text following the marker sentence as a
        Python literal.

        The rule's arguments are validated against this schema:
        {'type': 'boolean'}
        """
        if not is_qualname:
            return
        if not isinstance(value, str):
            self._error(field, "Must be a string")
            # Nothing further can be checked on a non-string value.
            return
        module_name, _, attr_name = value.rpartition(".")
        if not module_name:
            # No dot (or a leading dot only): there is no module to import.
            self._error(field, "Must be a valid Python qualname")
            return
        try:
            module = importlib.import_module(module_name)
        except (ImportError, TypeError, ValueError):
            # ImportError covers ModuleNotFoundError; TypeError/ValueError
            # are raised for relative or otherwise malformed module names.
            self._error(field, "Must be a valid Python qualname")
            return
        if not hasattr(module, attr_name):
            self._error(field, "Must be a valid Python qualname")

    def _validate(self, document):
        """Report an error unless ``document`` has a "steps" or "uses" key.

        NOTE(review): Cerberus' documented entry point is ``validate()``;
        confirm that this hook is actually invoked during validation.
        """
        super()._validate(document)
        if "steps" not in document and "uses" not in document:
            self._error(
                "document", 'At least one of "steps" or "uses" must be specified'
            )


# Each pipeline must define exactly one of "uses" and "steps": the two
# fields exclude each other, and marking both as required makes Cerberus
# insist that one of them is present.
PIPELINES_SCHEMA = {
    "pipelines": {
        "type": "list",
        "schema": {
            "type": "dict",
            "schema": {
                "name": {"type": "string", "required": False},
                "uses": {
                    "type": "string",
                    "excludes": "steps",
                    "required": True,
                },
                "steps": {
                    "type": "list",
                    "excludes": "uses",
                    "required": True,
                    # Every step has to name an importable callable.
                    "schema": {"type": "string", "is_qualname": True},
                },
            },
        },
    },
}
"""dict : Schema for validating pipelines configuration."""

PIPELINES_VALIDATOR = PipelineValidator(PIPELINES_SCHEMA)
"""Validator : Validator for pipelines configuration."""

RULES_SCHEMA = {
"rules": {
"type": "list",
"schema": {
"type": "dict",
"schema": {
"name": {"type": "string", "required": False},
"cmor_variable": {"type": "string", "required": True},
"input_type": {
"type": "string",
"required": False,
"allowed": [
"xr.DataArray",
"xr.Dataset",
],
},
"input_source": {
"type": "string",
"required": False,
"allowed": [
"xr_tutorial",
],
},
"input_patterns": {
"type": "list",
"schema": {"type": "string"},
"required": True,
},
"enabled": {"type": "boolean", "required": False},
"description": {"type": "string", "required": False},
"pipelines": {
"type": "list",
# FIXME(PG): Should cross-check with pipelines.
"schema": {"type": "string"},
},
"cmor_units": {"type": "string", "required": False},
# FIXME(PS): How is it currently defined?
"model_units": {"type": "string", "required": False},
Comment on lines +104 to +105
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@siligam: This still needs to be added correctly for units.py. I wasn't sure how you had designed it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pgierz "model_units" is referred to as "source_units" in units.py. It is an optional parameter. If it is not provided, the model units are read from the xarray dataset. At times, a user may prefer to use some other unit instead of the units from the xarray dataset. In this case, "source_units" takes precedence if provided.
The setting "model_units" : {"required": False} is apt.

},
},
},
}
"""dict : Schema for validating rules configuration."""
RULES_VALIDATOR = Validator(RULES_SCHEMA)
8 changes: 8 additions & 0 deletions tests/configs/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,11 @@ rules:
input_source: "xr_tutorial"
input_patterns:
- "test_input"
- name: test_rule3
enabled: false
input_patterns: ["/a/b/c"]
cmor_variable: "so"
- name: test_rule4
cmor_variable: "thetao"
pipelines: ["sleeper_pipeline"]
input_patterns: ["/a/b/c"]
49 changes: 49 additions & 0 deletions tests/unit/test_validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest

from pymorize.validate import PIPELINES_SCHEMA, PipelineValidator


@pytest.fixture
def validator():
    """Provide a fresh ``PipelineValidator`` built from ``PIPELINES_SCHEMA``."""
    pipeline_validator = PipelineValidator(PIPELINES_SCHEMA)
    return pipeline_validator


def test_initialize(validator):
    """The validator exposes the schema it was constructed with."""
    expected_schema = PIPELINES_SCHEMA
    assert validator.schema == expected_schema


def test_is_qualname(validator):
    """Calling the rule directly with an importable qualname must not raise."""
    rule_enabled, field_name, qualname = True, "field", "os.path.join"
    validator._validate_is_qualname(rule_enabled, field_name, qualname)


def test_is_qualname_error(validator):
    """Calling the rule directly with an unimportable qualname raises.

    NOTE(review): only the broad ``Exception`` is pinned here; which
    concrete error is raised is not visible from this test -- confirm.
    """
    bad_qualname = "non.existent.module"
    with pytest.raises(Exception):
        validator._validate_is_qualname(True, "field", bad_qualname)


def test_validate(validator):
    """A pipeline whose only step is an importable qualname validates."""
    pipeline = {"steps": ["os.path.join"]}
    assert validator.validate({"pipelines": [pipeline]})


def test_validate_neither_steps_nor_uses(validator):
    """A document with neither "steps" nor "uses" is reported as invalid.

    NOTE(review): the document is passed without the top-level "pipelines"
    key that PIPELINES_SCHEMA declares -- confirm the rejection really comes
    from the steps/uses rule rather than from the unexpected "name" key.
    """
    # Test with invalid document (neither 'steps' nor 'uses' specified)
    document = {"name": "test"}
    valid_document = validator.validate(document)
    assert valid_document is False
    # Disabled stricter check on the reported message, kept for reference:
    # with pytest.raises(
    #     Exception, match='At least one of "steps" or "uses" must be specified'
    # ):
    #     validator.validate(document)


def test_validate_error_non_qualname(validator):
    """A step that cannot be imported makes the whole document invalid."""
    bad_pipeline = {"name": "test", "steps": ["non.existent.module"]}
    is_valid = validator.validate({"pipelines": [bad_pipeline]})
    assert is_valid is False
    # TODO: also check that the reported error mentions the invalid qualname.
Loading