WCRP-CORDEX · huard · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 4, 2024
diff --git a/.github/workflows/update-cv.yaml b/.github/workflows/update-cv.yaml
@@ -8,6 +8,7 @@ on:
   push:
     paths:
       - 'CORDEX-CMIP6_*.json'
+      - 'scripts/*.py'
     branches:
       - main
 
@@ -43,6 +44,10 @@ jobs:
         create-cv
         mv CORDEX-CMIP6_CV.json tables/Tables
 
+    # Regenerate the global-attributes JSON schema from the CVs (script added in this change).
+    - name: Create JSON schema
+      run: |
+        python scripts/cv2schema.py
+
     - name: Check if there are any changes
       id: verify_diff
       run: |
@@ -52,10 +57,10 @@ jobs:
     - name: Commit and push
       if: steps.verify_diff.outputs.changed == 'true'
       run: |
-        cd tables
         git config --local user.email "github-actions[bot]@users.noreply.github.com"
         git config --local user.name "github-actions[bot]"
         (pre-commit run --all-files) || true
-        git commit Tables/CORDEX-CMIP6_CV.json -m "CV update"
+        git commit tables/Tables/CORDEX-CMIP6_CV.json -m "CV update"
+        git commit cmip6-cordex-global-attrs-schema.json -m "JSON schema update"
         git status
         git push
diff --git a/scripts/cv2schema.py b/scripts/cv2schema.py
@@ -0,0 +1,130 @@
+import json
+from pathlib import Path
+
+"""
+# Export the CVs to JSON-schema.
+
+The function `make_global_attrs_schema` reads the CVs in the root directory and returns a JSON schema. This schema can
+then be used to validate global attributes from CORDEX simulations.
+
+For example
+
+```
+import jsonschema
+import xarray as xr
+ds = xr.open_dataset("<path to netCDF file>")
+schema = make_global_attrs_schema()
+jsonschema.validate(ds.attrs, schema)
+```
+
+Any missing or incorrect global attribute will raise a `ValidationError`.
+"""
+
+DIR = Path(".")
+
+
+def make_global_attrs_schema(prefix: str = None, enum: bool = False) -> dict:
+    """Create a JSON schema for netCDF global attributes from the JSON CVs.
+
+    The list of required attributes is read from the ``required_global_attributes`` CV.
+    Each required attribute that has its own CV file is constrained by that CV; an
+    attribute without a CV file is only required to be a string.
+
+    Parameters
+    ----------
+    prefix : str, optional
+        Prefix to add to all properties, joined with a colon (``prefix:name``).
+        If None or empty, property names are left unprefixed.
+    enum : bool
+        If True, return an enum schema instead of oneOf, leading to smaller, easier to read schemas.
+
+    Returns
+    -------
+    dict
+        JSON schema for global attributes.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the ``required_global_attributes`` CV file itself cannot be found.
+    """
+    prefix = prefix + ":" if prefix else ""
+
+    # Read required global attributes
+    reqs = read_cv("required_global_attributes")["required_global_attributes"]
+
+    # Attributes whose value must be an integer (currently none).
+    integer_fields = []
+
+    # JSON schema `format` keyword to attach, keyed by attribute name.
+    formats = {"creation_date": "date-time"}
+
+    props = {}
+    for fid in reqs:
+        if fid in integer_fields:
+            # Could be replaced by patternProperties, but at the expense of readability
+            props[prefix + fid] = {"type": "integer"}
+        else:
+            # Keep the try body minimal: only a missing CV file should trigger the fallback.
+            try:
+                cv = read_cv(fid)
+            except FileNotFoundError:
+                # No CV for this attribute: accept any string.
+                props[prefix + fid] = {"type": "string"}
+            else:
+                for key, val in cv_to_property(cv, enum=enum).items():
+                    props[prefix + key] = val
+
+        if fid in formats:
+            # The CV may not have produced a property named after `fid`; fall back to a
+            # plain string property instead of raising KeyError.
+            props.setdefault(prefix + fid, {"type": "string"})["format"] = formats[fid]
+
+    return {
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "$id": "cmip6-cordex-global-attrs-schema.json#",
+        "title": "CORDEX-CMIP6 metadata schema for global attributes",
+        "description": "JSON schema for global attributes of CORDEX-CMIP6 datasets. This schema is automatically "
+        "generated from the CVs. Manual edits will be overwritten.",
+        "type": "object",
+        "properties": props,
+        "required": [prefix + fid for fid in reqs],
+    }
+
+
+def cv_to_property(cv: dict, enum: bool = False) -> dict:
+    """Convert a CV to a JSON schema property.
+
+    Parameters
+    ----------
+    cv : dict
+        CV dictionary, holding at most one key (the attribute name). Its value is either a
+        list of allowed values, or a dictionary keyed by allowed value.
+    enum: bool
+        If True, return an enum schema instead of oneOf.
+
+    Returns
+    -------
+    dict
+        Mapping from the attribute name to its JSON schema property. A list CV always yields
+        an ``enum``. A dictionary CV yields an ``enum`` of its keys if `enum` is True, otherwise
+        a ``oneOf`` list of ``{"const": key, "title": ...}`` items. Values of any other type
+        produce no property.
+
+    Raises
+    ------
+    ValueError
+        If the CV has more than one key.
+    """
+    if len(cv) > 1:
+        raise ValueError("CV has more than one key.")
+
+    # For CVs whose entries are dictionaries, the entry field used as the item title.
+    field = {
+        "source_id": "label",
+        "experiment_id": "description",
+        "domain_id": "domain",
+        "driving_source_id": "driving_source",
+    }
+
+    out = {}
+    for fid, keys in cv.items():
+        if isinstance(keys, dict):
+            if enum:
+                # Every key of the CV is an allowed value, whatever its entry looks like.
+                out[fid] = {"enum": list(keys)}
+                continue
+
+            # CVs without a registered title field get an empty title instead of a KeyError.
+            title_key = field.get(fid)
+            items = []
+            for key, value in keys.items():
+                if isinstance(value, str):
+                    title = value
+                elif isinstance(value, dict):
+                    title = value.get(title_key, "")
+                else:
+                    # Keep the key as an allowed value even when no title can be derived.
+                    title = ""
+                items.append({"const": key, "title": title})
+            out[fid] = {"oneOf": items}
+        elif isinstance(keys, list):
+            out[fid] = {"enum": keys}
+    return out
+
+
+def read_cv(key: str) -> dict:
+    """Read a CV file and return it as a dictionary.
+
+    Parameters
+    ----------
+    key : str
+        Name of the CV; the file read is ``CORDEX-CMIP6_<key>.json`` under `DIR`.
+
+    Returns
+    -------
+    dict
+        Parsed JSON content of the CV file.
+
+    Raises
+    ------
+    FileNotFoundError
+        If no CV file exists for `key`.
+    """
+    path = DIR / f"CORDEX-CMIP6_{key}.json"
+    # JSON is exchanged as UTF-8; do not depend on the platform's default encoding.
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+if __name__ == "__main__":
+    # Script entry point: build the enum-style schema with the "cordex6" prefix and
+    # write it, indented, to the current working directory.
+    schema = make_global_attrs_schema(prefix="cordex6", enum=True)
+    Path("cmip6-cordex-global-attrs-schema.json").write_text(json.dumps(schema, indent=2))