diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py index 71a4607..67b5ff0 100644 --- a/src/pymorize/cmorizer.py +++ b/src/pymorize/cmorizer.py @@ -1,6 +1,7 @@ from pathlib import Path import dask +import pandas as pd import questionary import yaml from dask.distributed import Client @@ -13,9 +14,11 @@ from .data_request import (DataRequest, DataRequestTable, DataRequestVariable, IgnoreTableFiles) +from .filecache import fc from .logging import logger from .pipeline import Pipeline from .rule import Rule +from .timeaverage import _frequency_from_approx_interval from .utils import wait_for_workers from .validate import PIPELINES_VALIDATOR, RULES_VALIDATOR @@ -214,8 +217,46 @@ def _post_init_inherit_rules(self): def _post_init_checks(self): # Sanity Checks: - self._check_rules_for_table() - self._check_rules_for_output_dir() + # :PS: @PG the following functions are not defined yet + # self._check_rules_for_table() + # self._check_rules_for_output_dir() + self._check_is_subperiod() + + def _check_is_subperiod(self): + logger.info("checking frequency in netcdf file and in table...") + try: + rule = self.rules[0] + except IndexError: + logger.info("No rules found to for checking frequency. ..skipping..") + return + table_freq = _frequency_from_approx_interval( + rule.data_request_variable.table.approx_interval + ) + # is_subperiod from pandas does not support YE or ME notation + table_freq = table_freq.rstrip("E") + first_filenames = [] + for input_collection in rule.inputs: + try: + first_filenames.append(input_collection.files[0]) + except IndexError: + logger.info("No input files found. ..skipping..") + return + if len(first_filenames) == 1: + filename = first_filenames[0] + data_freq = fc.get(filename).freq + else: # Multi-variable Rule, handle differently + data_freqs = set([fc.get(filename).freq for filename in first_filenames]) + if len(data_freqs) != 1: + raise ValueError( + f"You have a compound variable and have multiple internal frequencies! This is not allowed: {data_freqs}" + ) + data_freq = data_freqs[0] + is_subperiod = pd.tseries.frequencies.is_subperiod(data_freq, table_freq) + if not is_subperiod: + raise ValueError( + f"Frequency in source file {data_freq} is not a subperiod of frequency in table {table_freq}." + ) + logger.info(f"Frequency of data {data_freq}. Frequency in tables {table_freq}") @classmethod def from_dict(cls, data): @@ -244,6 +285,7 @@ def from_dict(cls, data): instance._post_init_populate_rules_with_tables() instance._post_init_create_data_request() instance._post_init_data_request_variables() + instance._post_init_checks() return instance def add_rule(self, rule):