esm-tools · Sep 20, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 8, 2024
diff --git a/doc/conf.py b/doc/conf.py
@@ -79,6 +79,7 @@
     "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None),
     "xarray": ("http://xarray.pydata.org/en/stable/", None),
     "chemicals": ("https://chemicals.readthedocs.io/", None),
+    "cerberus": ("https://docs.python-cerberus.org/", None),
 }
 
 

diff --git a/setup.py b/setup.py
@@ -31,12 +31,12 @@ def read(filename):
     package_dir={"": "src"},
     packages=find_packages(where="src", exclude=("tests",)),
     install_requires=[
+        "cerberus",
         "cf_xarray",
         "chemicals",
         "click-loguru",
         "dask",
         "distributed",
-        "dill",
         "dpath",
         "pendulum",
         "pint-xarray",
@@ -50,6 +50,7 @@ def read(filename):
     extras_require={
         "dev": [
             "black",
+            "dill",
             "flake8",
             "isort",
             "pooch",

diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py
@@ -9,6 +9,7 @@
 from .logging import logger
 from .pipeline import Pipeline
 from .rule import Rule
+from .validate import PIPELINES_VALIDATOR, RULES_VALIDATOR
 
 
 class CMORizer:
@@ -155,15 +156,22 @@ def from_dict(cls, data):
             pymorize_cfg=data.get("pymorize", {}),
             general_cfg=data.get("general", {}),
         )
+        if "rules" in data:
+            if not RULES_VALIDATOR.validate({"rules": data["rules"]}):
+                raise ValueError(RULES_VALIDATOR.errors)
         for rule in data.get("rules", []):
             rule_obj = Rule.from_dict(rule)
             instance.add_rule(rule_obj)
-        instance._post_init_populate_rules_with_tables()
-        instance._post_init_create_data_request()
-        instance._post_init_data_request_variables()
+        if "pipelines" in data:
+            if not PIPELINES_VALIDATOR.validate({"pipelines": data["pipelines"]}):
+                raise ValueError(PIPELINES_VALIDATOR.errors)
         for pipeline in data.get("pipelines", []):
             pipeline_obj = Pipeline.from_dict(pipeline)
             instance.add_pipeline(pipeline_obj)
+
+        instance._post_init_populate_rules_with_tables()
+        instance._post_init_create_data_request()
+        instance._post_init_data_request_variables()
         return instance
 
     def add_rule(self, rule):

diff --git a/src/pymorize/generic.py b/src/pymorize/generic.py
@@ -156,13 +156,10 @@ def dummy_load_data(data, rule_spec, cmorizer, *args, **kwargs):
     """
     A dummy function for testing. Loads the xarray tutorial data
     """
-    allowed_input_sources = ["xr_tutorial"]
     logger.info("Loading data")
     input_source = rule_spec.get("input_source", "xr_tutorial")
     if input_source == "xr_tutorial":
         data = xr.tutorial.open_dataset("air_temperature")
-    else:
-        raise NotImplementedError(f"Only {allowed_input_sources} are supported for now")
     if rule_spec.get("input_type") == "xr.DataArray":
         data = getattr(data, rule_spec.get("da_name", "air"))
     return data

diff --git a/src/pymorize/validate.py b/src/pymorize/validate.py
@@ -0,0 +1,111 @@
+"""
+Provides validation of user configuration files by checking against a schema.
+"""
+
+import importlib
+
+from cerberus import Validator
+
+
+class PipelineValidator(Validator):
+    """
+    Validator for pipeline configuration.
+
+    Extends the Cerberus ``Validator`` with the custom ``is_qualname`` rule.
+
+    See Also
+    --------
+    * https://cerberus-sanhe.readthedocs.io/customize.html#class-based-custom-validators
+    """
+
+    def _validate_is_qualname(self, is_qualname, field, value):
+        """Test if a string is the dotted path of an importable Python object.
+
+        The value is split at its last dot into a module path and an
+        attribute name; the module must be importable and must expose the
+        attribute. Failures are recorded with ``self._error``; nothing is
+        raised or returned.
+
+        Parameters
+        ----------
+        is_qualname : bool
+            Constraint given in the schema; the check only runs when truthy.
+        field : hashable
+            Key of the field being validated, passed through to ``self._error``.
+        value : object
+            Value of the field, e.g. ``"os.path.join"``.
+
+        The rule's arguments are validated against this schema:
+        {'type': 'boolean'}
+        """
+        # Keep the schema literal as the very last thing in the docstring:
+        # Cerberus evaluates the text after the marker sentence as a literal,
+        # so trailing prose would stop the argument schema from being picked up.
+        if not is_qualname:
+            return
+        if not isinstance(value, str):
+            self._error(field, "Must be a string")
+            # Nothing to split or import; stop before touching ``value``.
+            return
+        module_name, _, attr_name = value.rpartition(".")
+        try:
+            module = importlib.import_module(module_name)
+        except (ImportError, ValueError, TypeError):
+            # ImportError: module not found. ValueError: empty module name
+            # (value has no dot). TypeError: relative name such as ".foo".
+            self._error(field, "Must be a valid Python qualname")
+            return
+        if not hasattr(module, attr_name):
+            self._error(field, "Must be a valid Python qualname")
+
+    def _validate(self, document):
+        # Records an error when the document has neither "steps" nor "uses".
+        # NOTE(review): Cerberus' documented entry point is ``validate()``;
+        # confirm that something actually calls this ``_validate`` hook and
+        # that the base class provides one for the ``super()`` call below.
+        # The membership test also looks at the document it is handed, not
+        # at nested entries -- verify that matches how pipelines are nested.
+        super()._validate(document)
+        if "steps" not in document and "uses" not in document:
+            self._error(
+                "document", 'At least one of "steps" or "uses" must be specified'
+            )
+
+
+# Fields accepted on one entry of the "pipelines" list. "uses" and "steps"
+# exclude each other, and each step must satisfy the custom "is_qualname" rule.
+_PIPELINE_FIELDS = {
+    "name": {"type": "string", "required": False},
+    "uses": {"type": "string", "excludes": "steps"},
+    "steps": {
+        "type": "list",
+        "excludes": "uses",
+        "schema": {"type": "string", "is_qualname": True},
+    },
+}
+
+PIPELINES_SCHEMA = {
+    "pipelines": {
+        "type": "list",
+        "schema": {"type": "dict", "schema": _PIPELINE_FIELDS},
+    },
+}
+"""dict : Schema for validating pipelines configuration."""
+
+PIPELINES_VALIDATOR = PipelineValidator(PIPELINES_SCHEMA)
+"""Validator : Validator for pipelines configuration."""
+
+# Fields accepted on one entry of the "rules" list.
+_RULE_FIELDS = {
+    "name": {"type": "string", "required": False},
+    "cmor_variable": {"type": "string", "required": True},
+    "input_type": {
+        "type": "string",
+        "required": False,
+        "allowed": ["xr.DataArray", "xr.Dataset"],
+    },
+    "input_source": {
+        "type": "string",
+        "required": False,
+        "allowed": ["xr_tutorial"],
+    },
+    "input_patterns": {
+        "type": "list",
+        "schema": {"type": "string"},
+        "required": True,
+    },
+    "enabled": {"type": "boolean", "required": False},
+    "description": {"type": "string", "required": False},
+    "pipelines": {
+        "type": "list",
+        # FIXME(PG): Should cross-check with pipelines.
+        "schema": {"type": "string"},
+    },
+    "cmor_units": {"type": "string", "required": False},
+    # FIXME(PS): How is it currently defined?
+    "model_units": {"type": "string", "required": False},
+}
+
+RULES_SCHEMA = {
+    "rules": {
+        "type": "list",
+        "schema": {"type": "dict", "schema": _RULE_FIELDS},
+    },
+}
+"""dict : Schema for validating rules configuration."""
+
+# Module-level validator instance built from RULES_SCHEMA.
+RULES_VALIDATOR = Validator(RULES_SCHEMA)
diff --git a/tests/configs/test_config.yaml b/tests/configs/test_config.yaml
@@ -31,3 +31,11 @@ rules:
     input_source: "xr_tutorial"
     input_patterns:
       - "test_input"
+  - name: test_rule3
+    enabled: false
+    input_patterns: ["/a/b/c"]
+    cmor_variable: "so"
+  - name: test_rule4
+    cmor_variable: "thetao"
+    pipelines: ["sleeper_pipeline"]
+    input_patterns: ["/a/b/c"]
diff --git a/tests/unit/test_validate.py b/tests/unit/test_validate.py
@@ -0,0 +1,49 @@
+import pytest
+
+from pymorize.validate import PIPELINES_SCHEMA, PipelineValidator
+
+
+@pytest.fixture
+def validator():
+    """Build a ``PipelineValidator`` configured with ``PIPELINES_SCHEMA``."""
+    pipeline_validator = PipelineValidator(PIPELINES_SCHEMA)
+    return pipeline_validator
+
+
+def test_initialize(validator):
+    """The validator exposes the schema it was constructed with."""
+    configured_schema = validator.schema
+    assert configured_schema == PIPELINES_SCHEMA
+
+
+def test_is_qualname(validator):
+    """Calling the rule with an importable dotted path must not raise."""
+    # No assertion on purpose: the test passes as long as the call completes.
+    importable_path = "os.path.join"
+    validator._validate_is_qualname(True, "field", importable_path)
+
+
+def test_is_qualname_error(validator):
+    """A step that cannot be imported is reported as an invalid qualname."""
+    # The rule records failures through the validator's error list instead of
+    # raising, so drive it through ``validate()`` and inspect ``errors``
+    # rather than expecting an arbitrary exception from a direct call.
+    document = {"pipelines": [{"steps": ["non.existent.module"]}]}
+    assert validator.validate(document) is False
+    assert "Must be a valid Python qualname" in str(validator.errors)
+
+
+def test_validate(validator):
+    """A pipeline whose only step is an importable qualname validates."""
+    valid_config = {"pipelines": [{"steps": ["os.path.join"]}]}
+    is_valid = validator.validate(valid_config)
+    assert is_valid
+
+
+def test_validate_neither_steps_nor_uses(validator):
+    """A document with neither 'steps' nor 'uses' must be rejected."""
+    # ``validate`` reports failure through its return value, so the result is
+    # compared against ``False`` instead of matching an exception message.
+    # NOTE(review): this document has no top-level "pipelines" key, so the
+    # rejection may come from the unexpected "name" key rather than from the
+    # steps/uses check -- confirm which error is actually recorded.
+    incomplete_config = {"name": "test"}
+    outcome = validator.validate(incomplete_config)
+    assert outcome is False
+
+
+def test_validate_error_non_qualname(validator):
+    """A pipeline step that is not an importable qualname fails validation."""
+    # ``validate`` returns ``False`` on failure rather than raising, hence the
+    # identity check on the result.
+    bad_step_config = {
+        "pipelines": [{"name": "test", "steps": ["non.existent.module"]}]
+    }
+    outcome = validator.validate(bad_step_config)
+    assert outcome is False