Skip to content

Commit

Permalink
Merge pull request #48 from esm-tools/feat/pre-validate
Browse files Browse the repository at this point in the history
pre validation
  • Loading branch information
pgierz authored Nov 6, 2024
2 parents 352c54b + 8909581 commit e937423
Showing 1 changed file with 44 additions and 2 deletions.
46 changes: 44 additions & 2 deletions src/pymorize/cmorizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path

import dask
import pandas as pd
import questionary
import yaml
from dask.distributed import Client
Expand All @@ -13,9 +14,11 @@

from .data_request import (DataRequest, DataRequestTable, DataRequestVariable,
IgnoreTableFiles)
from .filecache import fc
from .logging import logger
from .pipeline import Pipeline
from .rule import Rule
from .timeaverage import _frequency_from_approx_interval
from .utils import wait_for_workers
from .validate import PIPELINES_VALIDATOR, RULES_VALIDATOR

Expand Down Expand Up @@ -214,8 +217,46 @@ def _post_init_inherit_rules(self):

def _post_init_checks(self):
# Sanity Checks:
self._check_rules_for_table()
self._check_rules_for_output_dir()
# :PS: @PG the following functions are not defined yet
# self._check_rules_for_table()
# self._check_rules_for_output_dir()
self._check_is_subperiod()

def _check_is_subperiod(self):
logger.info("checking frequency in netcdf file and in table...")
try:
rule = self.rules[0]
except IndexError:
logger.info("No rules found to for checking frequency. ..skipping..")
return
table_freq = _frequency_from_approx_interval(
rule.data_request_variable.table.approx_interval
)
# is_subperiod from pandas does not support YE or ME notation
table_freq = table_freq.rstrip("E")
first_filenames = []
for input_collection in rule.inputs:
try:
first_filenames.append(input_collection.files[0])
except IndexError:
logger.info("No input files found. ..skipping..")
return
if len(first_filenames) == 1:
filename = first_filenames[0]
data_freq = fc.get(filename).freq
else: # Multi-variable Rule, handle differently
data_freqs = set([fc.get(filename).freq for filename in first_filenames])
if len(data_freqs) != 1:
raise ValueError(
f"You have a compound variable and have multiple internal frequencies! This is not allowed: {data_freqs}"
)
data_freq = data_freqs[0]
is_subperiod = pd.tseries.frequencies.is_subperiod(data_freq, table_freq)
if not is_subperiod:
raise ValueError(
f"Frequency in source file {data_freq} is not a subperiod of frequency in table {table_freq}."
)
logger.info(f"Frequency of data {data_freq}. Frequency in tables {table_freq}")

@classmethod
def from_dict(cls, data):
Expand Down Expand Up @@ -244,6 +285,7 @@ def from_dict(cls, data):
instance._post_init_populate_rules_with_tables()
instance._post_init_create_data_request()
instance._post_init_data_request_variables()
instance._post_init_checks()
return instance

def add_rule(self, rule):
Expand Down

0 comments on commit e937423

Please sign in to comment.