From d91f7a513fe4b908411061b1fba1bd0d97c143a2 Mon Sep 17 00:00:00 2001 From: Harrison Cook Date: Wed, 3 Apr 2024 15:47:02 +1100 Subject: [PATCH] Add PIT, fix conflicts on rebase, fix broken modules --- src/scores/probability/__init__.py | 1 + src/scores/probability/pit.py | 414 +++++++++++++++++++++ src/scores/processing/cdf/cdf_functions.py | 216 +++++++++++ tests/probabilty/pit_test_data.py | 263 +++++++++++++ tests/probabilty/test_pit.py | 162 ++++++++ tests/processing/cdf/test_functions.py | 101 +++++ 6 files changed, 1157 insertions(+) create mode 100644 src/scores/probability/pit.py create mode 100644 tests/probabilty/pit_test_data.py create mode 100644 tests/probabilty/test_pit.py diff --git a/src/scores/probability/__init__.py b/src/scores/probability/__init__.py index d4293b69..ff3b5545 100644 --- a/src/scores/probability/__init__.py +++ b/src/scores/probability/__init__.py @@ -10,6 +10,7 @@ crps_cdf_brier_decomposition, crps_for_ensemble, ) +from scores.probability.pit import pit from scores.probability.roc_impl import roc_curve_data from scores.processing.isoreg_impl import isotonic_fit diff --git a/src/scores/probability/pit.py b/src/scores/probability/pit.py new file mode 100644 index 00000000..34b08abb --- /dev/null +++ b/src/scores/probability/pit.py @@ -0,0 +1,414 @@ +""" +Functions for calculating the probability integral transform (PIT) and +assessing probabilistic calibration of predictive distributions. +""" + +from typing import Optional, Sequence + +import numpy as np +import pandas as pd +import xarray as xr + +from scores.probability.crps_impl import crps_cdf_reformat_inputs +from scores.processing.cdf.cdf_functions import ( + add_thresholds, + check_cdf, + check_cdf_support, + expectedvalue_from_cdf, + fill_cdf, + integrate_square_piecewise_linear, + observed_cdf, + round_values, + variance_from_cdf, +) +from scores.utils import dims_complement + + +def pit( + fcst_cdf: xr.DataArray, + obs: xr.DataArray, + fcst_threshold_dim: str = "threshold", + possible_pointmass: Optional[float] = None, + included_pit_thresholds: Optional[Sequence[float]] = None, + pit_precision: float = 0, +): # pylint: disable=too-many-locals + """ + Calculates the probability integral transforms (PITs) of an array of predictive distributions + `fcst_cdf` and corresponding observations `obs`. Here, the PIT of a forecast-observation pair + is interpreted as an empirical CDF (Taggart 2022, c.f., Gneiting et al 2013). + This generalisation allows the forecast to be a CDF which may have points of discontinuity. + Given 0 < alpha < 1, value pit_cdf(alpha) is the probability that the observation fell at or + below the alpha-quantile of the predictive distribution, for a randomly selected + forecast-observation pair. + + Predictive CDFs can be continuous or discontinuous. If the CDF is discontinuous at a point x, + the CDF is said to have 'point mass at x'. This implementation of the PIT assumes that + `fcst_cdf` is continuous everywhere, except possibly at a single threshold `possible_pointmass` of + possible point mass, where it is assumed that F(t) = 0 whenever t < possible_pointmass. + + For wind (m/s) and rainfall (mm) forecasts, set `possible_pointmass=0`. + For temperature (degrees Celsius) forecasts, typically set `possible_pointmass=None`. 
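+
+    For illustration, suppose that `possible_pointmass=0` for a rainfall forecast, that the
+    forecast CDF has fcst_cdf(0) = 0.4, and that the observed rainfall is 0. The returned PIT
+    CDF then rises linearly from 0 at pit_threshold = 0 to 1 at pit_threshold = 0.4, rather
+    than being reported as the single value 0.4.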
+
+    Whenever there are at least two non-NaN values in `fcst_cdf` along `fcst_threshold_dim`,
+    any NaNs in `fcst_cdf` will be replaced by linearly interpolating and extrapolating
+    values along `fcst_threshold_dim`, then clipping to 0 and 1, prior to calculating the PIT.
+    Use `scores.processing.cdf.cdf_functions.propagate_nan` on `fcst_cdf` or
+    `scores.processing.cdf.cdf_functions.fill_cdf` to deal with NaNs prior to using `pit`
+    if this behaviour is undesirable.
+
+    The returned array gives PIT CDFs for every observation and every predictive CDF, even where
+    the observation is not paired to a predictive CDF or vice versa. In the case that there is
+    no pairing, the resulting PIT CDF consists purely of NaN values.
+
+    Args:
+        fcst_cdf: the forecast CDFs, with threshold dimension `fcst_threshold_dim`.
+        obs: the corresponding observations, not in CDF format. Should have the same shape
+            as `fcst_cdf` with `fcst_threshold_dim` collapsed.
+        fcst_threshold_dim: name of the dimension indexing the threshold in `fcst_cdf`.
+        possible_pointmass: forecast threshold at which point mass is possible.
+            `None` if no point mass is possible.
+        included_pit_thresholds: thresholds at which to report the value of the PIT CDF. If
+            `None`, then the thresholds `numpy.linspace(0, 1, 201)` will be included.
+        pit_precision: round PIT values to specified precision (e.g. to nearest 0.05).
+            Set to 0 for no rounding.
+
+    Returns:
+        xr.DataArray of PIT CDFs, with thresholds along the 'pit_threshold' dimension.
+        'pit_threshold' values are the union of `included_pit_thresholds` and (rounded)
+        PIT values. Here 'PIT values' are the values of the forecast CDFs at the observations.
+
+    Raises:
+        ValueError: if `fcst_threshold_dim` is not a dimension of `fcst_cdf`.
+        ValueError: if `fcst_cdf` has non-NaN values outside the closed interval [0,1].
+        ValueError: if `fcst_cdf[fcst_threshold_dim]` coordinates are not increasing.
+        ValueError: if `fcst_threshold_dim` is a dimension of `obs`.
+        ValueError: if dimensions of `obs` are not also dimensions of `fcst_cdf`.
+        ValueError: if `pit_precision` is negative.
+
+    References:
+        - Robert Taggart, "Assessing calibration when predictive distributions have
+          discontinuities", Bureau Research Report, 64, 2022.
+          http://www.bom.gov.au/research/publications/researchreports/BRR-064.pdf
+        - Tilmann Gneiting and Roopesh Ranjan, "Combining predictive distributions."
+          Electron. J. Statist. 7 1747 - 1782, 2013.
https://doi.org/10.1214/13-EJS823 + """ + + _check_pit_args(fcst_cdf, obs, fcst_threshold_dim, pit_precision) + + # reformat fcst_cdf, obs so they have same shape and are both in CDF format + cdf_fcst, cdf_obs, _ = crps_cdf_reformat_inputs( + fcst_cdf, + obs, + fcst_threshold_dim, + threshold_weight=None, + additional_thresholds=None, + fcst_fill_method="linear", + threshold_weight_fill_method="step", # this value doesn't matter + ) + + # get PIT, interpreted as single values rather than as CDFs + pit_singlevalues = cdf_fcst.where(np.isclose(cdf_obs, 1)).min(fcst_threshold_dim) + + included_pit_thresholds = included_pit_thresholds if included_pit_thresholds is not None else np.linspace(0, 1, 201) + + # get all the PIT thresholds for the returned array + final_pit_thresholds = np.concatenate( + ( + included_pit_thresholds, + round_values(pit_singlevalues, pit_precision).values.flatten(), + ) + ) + final_pit_thresholds = np.sort(pd.unique(final_pit_thresholds)) + # remove NaNs + final_pit_thresholds = final_pit_thresholds[~np.isnan(final_pit_thresholds)] + + # identify pit values where observation fell in fcst point mass + pit_singlevalues2, obs2 = xr.broadcast(pit_singlevalues, obs) + is_pit_pointmass = (np.isclose(obs2, possible_pointmass)) & (pit_singlevalues2 > 0) + + # calculate the PIT CDF associated with non-point mass + # first, calculate all PIT CDFs, assuming non-pointmass + pit_cdf_no_ptmass = _pit_no_ptmass( + pit_singlevalues, + final_pit_thresholds, + pit_precision=pit_precision, + ) + + # then, nan PIT CDFs associated with point mass, via broadcasting + dab_pit_cdf_no_ptmass, dab_is_pit_pointmass = xr.broadcast(pit_cdf_no_ptmass, is_pit_pointmass) + + dab_pit_cdf_no_ptmass = dab_pit_cdf_no_ptmass.where(~dab_is_pit_pointmass) + + # now deal with PIT CDFs associated with point mass + pit_cdf_ptmass = _pit_ptmass(pit_singlevalues, final_pit_thresholds) + pit_cdf_ptmass = pit_cdf_ptmass.where(dab_is_pit_pointmass) + + # now combine the two different PIT CDF cases + pit_cdf = pit_cdf_ptmass.combine_first(pit_cdf_no_ptmass) + + return pit_cdf + + +def _check_pit_args( + fcst_cdf: xr.DataArray, + obs: xr.DataArray, + fcst_threshold_dim: str, + pit_precision: float, +): + """Checks that args of `pit` are valid.""" + + check_cdf(fcst_cdf, fcst_threshold_dim, None, "fcst_cdf", "fcst_threshold_dim", "dims") + + if fcst_threshold_dim in obs.dims: + raise ValueError(f"'{fcst_threshold_dim}' is a dimension of `obs`") + + if not set(obs.dims).issubset(fcst_cdf.dims): + raise ValueError("Dimensions of `obs` must be a subset of dimensions of `fcst_cdf`") + + if pit_precision < 0: + raise ValueError("`pit_precision` must be nonnegative") + + +def _pit_ptmass(pit_singlevalues: xr.DataArray, pit_thresholds: np.array) -> xr.DataArray: + """ + Calculates the PIT CDF for a forecast-observation pair assuming that: + + - the observation fell where there is (possible) point mass x in the forecast CDF + - the forecast CDF satisfies fcst_cdf(t) = 0 whenever t < x. + + In this case, the PIT CDF is of the form + + `t -> min(t/fcst_cdf(x), 1)` + + where x is the point at which the forecast CDF has point mass. + If fcst_cdf(x) = 0 then the PIT CDF is 1. + + Args: + pit_singlevalues: the value of the forecast CDF at the observation. + Here the observation is at the location of forecast point mass. + Values of `pit_singlevalues` are assumed to be in the closed interval [0,1], apart from any NaN. + pit_thresholds: 1-dimensional array containing thresholds for the returned array of PIT CDF values. 
+            Elements are assumed to have been sorted and to contain no duplicates.
+
+    Returns:
+        An xr.DataArray of PIT CDF values with threshold dimension 'pit_threshold'.
+    """
+
+    da_pit_thresholds = xr.DataArray(
+        data=pit_thresholds,
+        dims=["pit_threshold"],
+        coords={"pit_threshold": pit_thresholds},
+    )
+
+    _, da_pit_thresholds = xr.broadcast(pit_singlevalues, da_pit_thresholds)
+
+    pit_cdf = (da_pit_thresholds / pit_singlevalues).clip(0, 1)
+
+    pit_cdf = pit_cdf.where(pit_singlevalues != 0, 1)
+
+    return pit_cdf
+
+
+def _pit_no_ptmass(pit_singlevalues: xr.DataArray, pit_thresholds: np.array, pit_precision=0) -> xr.DataArray:
+    """
+    Calculates values of the PIT CDF assuming that the observation fell where there is
+    no point mass in the forecast CDF. In this case,
+
+    - pit_cdf(t) = 0 when t < pit_value
+    - pit_cdf(t) = 1 when t >= pit_value,
+
+    where pit_value = fcst_cdf(obs) is the single PIT value for the forecast case.
+
+    Args:
+        pit_singlevalues: the value of the forecast CDF at the observation.
+        pit_thresholds: 1-dimensional array of thresholds in the returned PIT CDF array
+            at which PIT CDF values are reported.
+        pit_precision: precision at which to round pit_singlevalues
+            prior to constructing PIT CDF. If `pit_precision=0` then no rounding is performed.
+
+    Returns:
+        PIT CDFs as an xr.DataArray with thresholds in dimension 'pit_threshold'.
+    """
+
+    pit_cdf = observed_cdf(
+        pit_singlevalues,
+        "pit_threshold",
+        threshold_values=pit_thresholds,
+        include_obs_in_thresholds=True,
+        precision=pit_precision,
+    )
+
+    return pit_cdf
+
+
+def pit_histogram_values(
+    pit_cdf: xr.DataArray,
+    pit_threshold_dim: str = "pit_threshold",
+    n_bins: int = 10,
+    dims: Optional[Sequence[str]] = None,
+) -> xr.DataArray:
+    """
+    The PIT histogram is a histogram showing how many times PIT values fall in a bin.
+    This function gives the relative frequency of binned PIT 'values', based on PIT CDFs.
+    If MPC denotes the mean PIT CDF, then the relative frequency for a bin with interval (a, b]
+    is given by MPC(b) - MPC(a).
+
+    The unit interval [0,1] is partitioned into bins of equal length.
+    Each bin is an interval of the form (a,b], except for the first
+    bin which is of the form [0,b]. If a bin endpoint is not among the `pit_threshold_dim` coordinates,
+    then the value of the PIT CDF at that endpoint is estimated using linear interpolation and/or
+    linear extrapolation, clipped to the interval [0,1]. Any NaNs in the input `pit_cdf`
+    will likewise be filled, unless there are fewer than two non-NaN values along the `pit_threshold_dim`
+    dimension.
+
+    Args:
+        pit_cdf: values of the PIT CDFs, with threshold dimension `pit_threshold_dim`.
+        pit_threshold_dim: the name of the dimension in `pit_cdf` that has PIT threshold values.
+        n_bins: the number of bins.
+        dims: the dimensions of `pit_cdf` to be preserved in the output, apart from `pit_threshold_dim`.
+            All other dimensions in `pit_cdf`, including `pit_threshold_dim`, are collapsed.
+
+    Returns:
+        xr.DataArray with relative frequency of PIT values for each bin.
+
+    Raises:
+        ValueError: if `pit_threshold_dim` is not a dimension of `pit_cdf`.
+        ValueError: if coordinates of `pit_cdf[pit_threshold_dim]` are not increasing.
+        ValueError: if `pit_cdf` has non-NaN values outside the closed interval [0,1].
+        ValueError: if `pit_threshold_dim` is in `dims`.
+        ValueError: if `dims` is not a subset of `pit_cdf.dims`.
+        ValueError: if `pit_cdf` is not 1 when `pit_cdf[pit_threshold_dim] > 1`.
+        ValueError: if `pit_cdf` is not 0 when `pit_cdf[pit_threshold_dim] < 0`.
+        ValueError: if `n_bins` is not positive.
+ """ + _check_args_with_pits(pit_cdf, pit_threshold_dim, dims) + if n_bins < 1: + raise ValueError("`n_bins` must be at least 1") + + bin_endpoints = np.linspace(0, 1, n_bins + 1) + + dims_to_collapse = dims_complement(pit_cdf, dims=dims) + dims_to_collapse.remove(pit_threshold_dim) + + mean_pitcdf = pit_cdf.mean(dims_to_collapse) + + # calculate the value of the mean PIT CDF at the bin endpoints + # linear interpolation/extrapolation is used if endpoints are not in the threshold coordinates + mean_pitcdf = add_thresholds(mean_pitcdf, pit_threshold_dim, bin_endpoints, "linear") + + # just get mean PIT CDF values at the endpoints + mean_pitcdf = mean_pitcdf.where(mean_pitcdf[pit_threshold_dim].isin(bin_endpoints)).dropna( + pit_threshold_dim, how="all" + ) + + diffs = mean_pitcdf - mean_pitcdf.shift(**{pit_threshold_dim: 1}) + + # frequency in each bin, indexed by the right endpoint of the bin + result = diffs.sel(**{pit_threshold_dim: bin_endpoints[1:]}) + + # need to count frequency for PIT=0 as well. Will put this in the first bin + result = result.where(result[pit_threshold_dim] != bin_endpoints[1]) + result = result.combine_first(mean_pitcdf.sel(**{pit_threshold_dim: bin_endpoints[1:]})) + + result = result.rename({pit_threshold_dim: "right_endpoint"}) + result = result.assign_coords( + left_endpoint=result.right_endpoint - 1 / n_bins, + bin_centre=result.right_endpoint - 1 / (2 * n_bins), + ) + + return result + + +def _check_args_with_pits(pit_cdf, pit_threshold_dim, dims): + """Checks that arguments involving PIT CDFs are valid.""" + check_cdf(pit_cdf, pit_threshold_dim, dims, "pit_cdf", "pit_threshold_dim", "dims") + check_cdf_support(pit_cdf, pit_threshold_dim, 0, 1, "pit_cdf", "pit_threshold_dim") + + +def pit_scores( + pit_cdf: xr.DataArray, + pit_threshold_dim: str = "pit_threshold", + dims: Optional[Sequence[str]] = None, +) -> xr.Dataset: + """ + Given an array of PIT CDFs (probability integral transform in CDF format), + calculates the combined PIT CDF over the specified dimensions. Returns a + "calibration score" for the mean PIT CDF, as well as the expected value and the + variance of a random variable distributed by each mean PIT CDF. + + The calibration score of a combined PIT CDF is a measure of the degree of probabilistic + miscalibration of the forecast cases used to construct the mean_pit_cdf. Specifically, + + `score = integral((combined_pit_cdf(x) - x)^2, 0 <= x <= 1)` + + so that the score measures the deviation of the mean_pit_cdf from the standard uniform CDF. + A lower score is better. + + If Y is a random variable distributed by combined_pit_cdf, then the expected value E(Y) + measures the extent to which the forecasts had an over-forecast tendency (E(Y) < 0.5) + or an under-forecast tendency (E(Y) > 0.5). + + If Y ~ combined_pit_cdf, then the variance Var(Y) is a measure of dispersion of the forecast + system. If Var(Y) < 1/12 then the system is over-dispersed (i.e. its predictive distributions + are too wide) while if Var(Y) > 1/12 then the system is under-dispersed (i.e. its predictive + distributions are too narrow). + + See Taggart (2022) for further information about this score (labelled there as PS_2) and its + decomposition. + + + Whenever there are at least two non-NaN values in `pit_cdf` along `pit_threshold_dim`, + any NaNs in `pit_cdf` will be replaced by linearly interpolating and extrapolating + values along `pit_threshold_dim`, then clipping to 0 and 1, prior to calculating output. 
+ Use `jive.metrics.cdfs.propagate_nan` on `pit_cdf` or `jive.metrics.cdfs.fill_cdf` to + deal with NaNs prior to using `pit_scores` if this behaviour is undesirable. + + Args: + pit_cdf: array of PITs in CDF form, with threshold dimension `pit_threshold_dim`. + pit_threshold_dim: the name of the threshold dimension of `pit_cdf`. + dims: The dimensions of `pit_cdf` to be preserved in the output, apart from `pit_threshold_dim`. + All other dimensions in `pit_cdf`, including `pit_threshold_dim`, are collapsed. + + Returns: + An xr.Dataset with data variables "score", "expectation" and "variance". + + Raises: + ValueError: if `pit_threshold_dim` is not a dimension of `pit_cdf`. + ValueError: if coordinates of `pit_cdf[pit_threshold_dim]` are not increasing. + ValueError: if `pit_cdf` has non-NaN values outside the closed interval [0,1]. + ValueError: if `pit_threshold_dim` is in `dims`. + ValueError: if `dims` is not contained in `pit_cdf.dims`. + ValueError: if `pit_cdf` is not 1 when `pit_cdf[pit_threshold_dim] > 1`. + ValueError: if `pit_cdf` is not 0 when `pit_cdf[pit_threshold_dim] < 0`. + + References: + Robert Taggart, "Assessing calibration when predictive distributions have discontinuities", + Bureau Research Report, 64, 2022. + http://www.bom.gov.au/research/publications/researchreports/BRR-064.pdf + """ + _check_args_with_pits(pit_cdf, pit_threshold_dim, dims) + + # only interested in PIT thresholds between 0 and 1 inclusive. + pit_cdf = pit_cdf.sel({pit_threshold_dim: slice(0, 1)}) + + pit_cdf = fill_cdf(pit_cdf, pit_threshold_dim, "linear", 2) + + dims_to_collapse = dims_complement(pit_cdf, dims=dims) + dims_to_collapse.remove(pit_threshold_dim) # this dim will be collapsed later + pit_cdf = pit_cdf.mean(dims_to_collapse) + + # will compute integral(func(x)^2) where func(x) = mean_pit_cdf(x) - x + std_uniform_cdf = xr.DataArray( + data=pit_cdf[pit_threshold_dim], + dims=[pit_threshold_dim], + coords={pit_threshold_dim: pit_cdf[pit_threshold_dim]}, + ) + + func = pit_cdf - std_uniform_cdf + func = func.where(std_uniform_cdf >= 0).where(std_uniform_cdf <= 1) + + score = integrate_square_piecewise_linear(func, pit_threshold_dim).rename("score") + + expectation = expectedvalue_from_cdf(pit_cdf, pit_threshold_dim).rename("expectation") + variance = variance_from_cdf(pit_cdf, pit_threshold_dim).rename("variance") + + result = xr.merge([score, expectation, variance]) + + return result diff --git a/src/scores/processing/cdf/cdf_functions.py b/src/scores/processing/cdf/cdf_functions.py index 54945eb4..22460f0c 100644 --- a/src/scores/processing/cdf/cdf_functions.py +++ b/src/scores/processing/cdf/cdf_functions.py @@ -13,6 +13,7 @@ from scores.probability.checks import ( cdf_values_within_bounds, check_nan_decreasing_inputs, + coords_increasing, ) from scores.typing import XarrayLike @@ -403,3 +404,218 @@ def cdf_envelope( result.loc["lower"] = np.where(~np.isnan(cdf), cdf_lower, np.nan) return result + + +def check_cdf( + cdf: xr.DataArray, + threshold_dim: str, + dims: Optional[Iterable[str]], + varname_cdf: str = "cdf", + varname_threshold_dim: str = "threshold_dim", + varname_dims: str = "dims", +): + """ + Performs various checks on a CDF values of a real-valued random variable + in an array `cdf` and its dimensions. Does not check that `cdf` is nondecreasing + along the `threshold_dim` dimension; + use the function `nan_decreasing_cdfs` instead. + + Args: + cdf: array of CDF values to check. + threshold_dim: name of the threshold dimension in `cdf`. 
+        dims: if not `None`, a subset of dimensions of `cdf`, not including
+            `threshold_dim`.
+        varname_cdf: the variable name for `cdf` used in the error message.
+        varname_threshold_dim: the variable name for `threshold_dim` used in the error message.
+        varname_dims: the variable name for `dims` used in the error message.
+
+    Raises:
+        ValueError: if `threshold_dim` is not a dimension of `cdf`.
+        ValueError: if `threshold_dim` is in `dims`.
+        ValueError: if `dims` is not a subset of `cdf.dims`.
+        ValueError: if `cdf` has non-NaN values outside the closed interval [0,1].
+        ValueError: if `cdf[threshold_dim]` coordinates are not increasing.
+    """
+    if threshold_dim not in cdf.dims:
+        raise ValueError(f"`{varname_threshold_dim}` is not a dimension of `{varname_cdf}`")
+
+    if dims is not None:
+        if threshold_dim in dims:
+            raise ValueError(f"`{varname_threshold_dim}` is in `{varname_dims}`")
+        if not set(dims).issubset(cdf.dims):
+            raise ValueError(f"`{varname_dims}` is not a subset of dimensions of `{varname_cdf}`")
+
+    if not cdf_values_within_bounds(cdf):
+        raise ValueError(f"values of `{varname_cdf}` must be in the closed interval [0,1]")
+
+    if not coords_increasing(cdf, threshold_dim):
+        raise ValueError(f"coordinates along `{varname_threshold_dim}` of `{varname_cdf}` must be increasing")
+
+
+def check_cdf_support(
+    cdf: xr.DataArray,
+    threshold_dim: str = "threshold",
+    lower_supp: float = 0,
+    upper_supp: float = np.inf,
+    varname_cdf: str = "cdf",
+    varname_threshold_dim: str = "threshold_dim",
+):
+    """
+    Checks that the CDF of a real-valued random variable is supported on
+    the closed interval [lower_supp, upper_supp], i.e., that
+
+    - cdf(x) = 0 when x < `lower_supp`
+    - cdf(x) = 1 when x > `upper_supp`
+
+    for x in `cdf[threshold_dim].values`.
+
+    NaNs are ignored. Tests for "equality" use `np.allclose`.
+
+    Args:
+        cdf: array of CDF values, with threshold dimension `threshold_dim`.
+        threshold_dim: name of the threshold dimension in `cdf`.
+        lower_supp: left endpoint of the interval of support. Can be `-np.inf`.
+        upper_supp: right endpoint of the interval of support. Can be `np.inf`.
+        varname_cdf: the variable name for `cdf` used in the error message.
+        varname_threshold_dim: the variable name for `threshold_dim` used in the error message.
+
+    Returns:
+        None. A ValueError is raised if any check fails.
+
+    Raises:
+        ValueError: if `lower_supp` > `upper_supp`.
+        ValueError: if cdf(x) != 0 when x < `lower_supp`.
+        ValueError: if cdf(x) != 1 when x > `upper_supp`.
+    """
+    if upper_supp < lower_supp:
+        raise ValueError("`upper_supp < lower_supp`")
+
+    upper = cdf.where(cdf[threshold_dim] > upper_supp).values
+    lower = cdf.where(cdf[threshold_dim] < lower_supp).values
+
+    if not np.allclose(upper[~np.isnan(upper)], 1):
+        raise ValueError(f"`{varname_cdf}` is not 1 when `{varname_cdf}[{varname_threshold_dim}] > {upper_supp}`")
+
+    if not np.allclose(lower[~np.isnan(lower)], 0):
+        raise ValueError(f"`{varname_cdf}` is not 0 when `{varname_cdf}[{varname_threshold_dim}] < {lower_supp}`")
+
+
+def expectedvalue_from_cdf(
+    cdf: xr.DataArray,
+    threshold_dim: str = "threshold",
+    nonnegative_support: bool = True,
+) -> xr.DataArray:
+    """
+    Returns the expected value E(Y) of a real-valued random variable Y with
+    distribution given by `cdf`.
+
+    The current implementation assumes that Y >= 0, i.e., the CDF has nonnegative support.
+    This assumption may be relaxed in a future implementation.
+
+    The calculation is based on the formula
+
+    E(Y) = integral(1 - cdf(x), x >= 0),
+
+    where it is assumed that the CDF is piecewise linear between CDF values given by the
+    array `cdf` when x >= 0.
+
+    Args:
+        cdf: array of CDF values, with threshold dimension `threshold_dim`.
+        threshold_dim: name of the threshold dimension in `cdf`.
+        nonnegative_support: `True` if the support of the probability measure
+            described by `cdf` is contained in the half line [0, inf).
+
+    Returns:
+        xr.DataArray containing the expected value of each CDF in `cdf`, with
+        `threshold_dim` collapsed.
+
+    Raises:
+        ValueError: if `nonnegative_support = False`.
+        ValueError: if `threshold_dim` is not a dimension of `cdf`.
+        ValueError: if `cdf` has non-NaN values outside the closed interval [0,1].
+        ValueError: if `cdf[threshold_dim]` coordinates are not increasing.
+        ValueError: if `cdf` is not 0 when `cdf[threshold_dim] < 0` and
+            `nonnegative_support = True`.
+    """
+    if nonnegative_support:
+        check_cdf_support(cdf, threshold_dim, 0, np.inf)
+    else:
+        raise ValueError("This function currently only handles the case when the cdf has nonnegative support")
+
+    check_cdf(cdf, threshold_dim, None, "cdf", "threshold_dim", "dims")
+
+    # only select CDF values where 0 <= threshold_dim.
+    integrand = 1 - cdf.sel({threshold_dim: slice(0, np.inf)})
+
+    return integrand.integrate(threshold_dim)
+
+
+def variance_from_cdf(cdf: xr.DataArray, threshold_dim: str = "threshold") -> xr.DataArray:
+    """
+    Returns the variance Var(Y) of a random variable Y with distribution given by `cdf`,
+    under the assumption that Y >= 0. The calculation is based on the formula
+
+    Var(Y) = 2 * integral(x * (1 - cdf(x)), x >= 0) - E(Y)^2,
+
+    where it is assumed that the CDF is piecewise linear between CDF values given by the
+    array `cdf` when x >= 0.
+
+    Args:
+        cdf: array of CDF values, with threshold dimension `threshold_dim`.
+        threshold_dim: name of the threshold dimension in `cdf`.
+
+    Returns:
+        xr.DataArray containing the variance Var(Y) associated with each CDF in `cdf`,
+        with `threshold_dim` collapsed.
+
+    Raises:
+        ValueError: if `threshold_dim` is not a dimension of `cdf`.
+        ValueError: if `cdf` has non-NaN values outside the closed interval [0,1].
+        ValueError: if `cdf[threshold_dim]` coordinates are not increasing.
+        ValueError: if `cdf` is not 0 when `cdf[threshold_dim] < 0`.
+    """
+    check_cdf(cdf, threshold_dim, None, "cdf", "threshold_dim", "dims")
+    check_cdf_support(cdf, threshold_dim, 0, np.inf)
+
+    expected_value = expectedvalue_from_cdf(cdf, threshold_dim)
+
+    # only use values of CDF where threshold >= 0.
+    cdf = cdf.sel({threshold_dim: slice(0, np.inf)})
+    integral = _var_from_cdf(cdf, threshold_dim)
+    result = 2 * integral - expected_value**2
+
+    return result
+
+
+def _var_from_cdf(function_values, threshold_dim):
+    """
+    Calculates the integral
+
+    integral(t * (1 - F(t)))
+
+    where it is assumed that the function F is piecewise linear between the values
+    given by the array `function_values`.
+
+    If there is a NaN along `threshold_dim`, then NaN is returned.
+    """
+    # notation: F_i(t) = m_i * t + b_i whenever x[i-1] <= t <= x[i]
+    # and x[i] is a value in `function_values[threshold_dim]`.
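+    # On each interval [x[i-1], x[i]], substituting F_i(t) = m_i * t + b_i gives
+    #     integral(t * (1 - F_i(t))) = (1 - b_i) * (x[i]^2 - x[i-1]^2) / 2 - m_i * (x[i]^3 - x[i-1]^3) / 3,
+    # which is the per-interval expression assembled below and then summed over i.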
+ + x_values = function_values[threshold_dim] + x_shifted = function_values[threshold_dim].shift(**{threshold_dim: 1}) + # difference in x values + diff_xs = x_values - x_shifted + # difference in function values y_i = F(x[i]) + diff_ys = function_values - function_values.shift(**{threshold_dim: 1}) + # gradients m + m_values = diff_ys / diff_xs + # intercepts b_i + b_values = function_values - m_values * x_values + # integral(t * (1 - F(t))) on the interval (x[i-1], x[i]), for each i, using calculus: + integral_i = (1 - b_values) * (x_values**2 - x_shifted**2) / 2 - m_values * (x_values**3 - x_shifted**3) / 3 + + integral = integral_i.sum(threshold_dim) + # return NaN if NaN in function_values + integral = integral.where(~np.isnan(function_values).any(threshold_dim)) + + return integral diff --git a/tests/probabilty/pit_test_data.py b/tests/probabilty/pit_test_data.py new file mode 100644 index 00000000..783e6a0b --- /dev/null +++ b/tests/probabilty/pit_test_data.py @@ -0,0 +1,263 @@ +""" +Test data for `scores.probability.pit`. +""" + +import numpy as np +import xarray as xr +from numpy import nan + +DA_PIT_VALUES = xr.DataArray( + data=[0, 0.4, 0.5, nan], + dims=["station"], + coords={"station": [1001, 1002, 1003, 1004]}, +) + +ARRAY_PIT_THRESH = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]) + +EXP_PIT_PTMASSS = xr.DataArray( + data=[ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # fcst cdf had no point mass at .2 + [0, 0.25, 0.5, 0.75, 1, 1, 1, 1, 1, 1, 1], # fcst cdf with point mass + [0, 0.2, 0.4, 0.6, 0.8, 1, 1, 1, 1, 1, 1], # fcst cdf with point mass + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + ], + dims=["station", "pit_threshold"], + coords={"station": [1001, 1002, 1003, 1004], "pit_threshold": ARRAY_PIT_THRESH}, +) + +EXP_PIT_NOPTMASS = xr.DataArray( + data=[ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + ], + dims=["station", "pit_threshold"], + coords={"station": [1001, 1002, 1003, 1004], "pit_threshold": ARRAY_PIT_THRESH}, +) + + +DA_FCST_CDF = xr.DataArray( + data=[ + [ + [0, 0, 0.1, 0.1, 0.4, 0.5, 0.9, 1], + [0, 0, 0.4, 0.5, 0.7, 0.8, 1, 1], + [0, 0, 0.4, 0.5, 0.5, 0.7, 0.8, 0.9], + [0, 0, 0.4, 0.5, nan, 0.7, 0.8, 0.9], + [0, 0, 0, 0.5, 0.7, 0.8, 1, 1], + [nan, nan, nan, nan, nan, nan, nan, nan], + [0, 0, 0.4, 0.5, 0.7, 0.8, 1, 1], + ], + [ + [0, 0, 0.1, 0.1, 0.4, 0.5, 0.9, 1], + [0, 0, 0.4, 0.5, 0.7, 0.8, 1, 1], + [0, 0, 0.4, 0.5, 0.5, 0.5, 0.5, 0.5], + [0, 0, 0.4, 0.5, nan, 0.7, 0.8, 0.9], + [0, 0, 0, 0.5, 0.7, 0.8, 1, 1], + [nan, nan, nan, nan, nan, nan, nan, nan], + [0, 0, 0.4, 0.5, 0.7, 0.8, 1, 1], + ], + ], + dims=["lead_day", "station", "x"], + coords={ + "station": [1000, 1001, 1002, 1003, 1004, 1005, 1006], + "lead_day": [1, 2], + "x": [0, 1, 2, 3, 4, 5, 6, 7], + }, +) + +DA_OBS = xr.DataArray( + data=[2, 7.5, 4.3, 2, 3.2, nan, 3.0], + dims=["station"], + coords={"station": [1001, 1002, 1003, 1004, 1005, 1006, 2000]}, +) + +EXP_PIT = xr.DataArray( + data=[ + [ + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + [0, 0.25, 0.5, 0.75, 1, 1, 1, 1, 1, 1, 1, 1, 1], # obs at point mass + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], # obs at 95th p'tile + [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], # obs nearest 65th p'tile + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # obs at poss pt mass, cdf cts + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 
nan, nan, nan], + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + ], + [ + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + [0, 0.25, 0.5, 0.75, 1, 1, 1, 1, 1, 1, 1, 1, 1], # obs at point mass + [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], # obs at 50th p'tile + [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], # obs nearest 65th p'tile + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # obs at poss pt mass, cdf cts + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], + ], + ], + dims=["lead_day", "station", "pit_threshold"], + coords={ + "lead_day": [1, 2], + "station": [1000, 1001, 1002, 1003, 1004, 1005, 1006, 2000], + "pit_threshold": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.8, 0.9, 0.95, 1], + }, +) + +DA_OBS2 = xr.DataArray( + data=[2, 7.5, 4.3, 2, 3.2, nan], + dims=["x"], + coords={"x": [1001, 1002, 1003, 1004, 1005, 1006]}, +) + +DA_OBS3 = xr.DataArray( + data=[2, 7.5, 4.3, 2, 3.2, nan], + dims=["y"], + coords={"y": [1001, 1002, 1003, 1004, 1005, 1006]}, +) + +DA_FCST_CDF2 = xr.DataArray( + data=[[0, 0, 0.4, 0.5, 0.7, 0.8, 1, 1.2]], + dims=["station", "x"], + coords={"station": [1001], "x": [0, 1, 2, 3, 4, 5, 6, 7]}, +) + +DA_FCST_CDF3 = xr.DataArray( + data=[[0, 0, 0.4, 0.5, 0.7, 0.8, 1, 1]], + dims=["station", "x"], + coords={"station": [1001], "x": [0, 1, 20, 3, 4, 5, 6, 7]}, +) + +DA_PIT_CDF1 = xr.DataArray( # pit_cdf(0) = 0 + data=[[[0, 0.5, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1], [nan, nan, nan, nan, nan, nan]]], + dims=["date", "station", "pit_thresh"], + coords={ + "station": [1001, 1002, 1003], + "date": ["2020-01-01"], + "pit_thresh": [0, 0.2, 0.4, 0.6, 0.8, 1], + }, +) + +EXP_PIT_HIST1A = xr.DataArray( # 4 bins + data=[0.625 / 2, 1.5 / 2 - 0.625 / 2, 1 - 1.5 / 2, 1 - 1], + dims=["right_endpoint"], + coords={ + "right_endpoint": [0.25, 0.5, 0.75, 1], + "left_endpoint": ("right_endpoint", [0, 0.25, 0.5, 0.75]), + "bin_centre": ("right_endpoint", [0.125, 0.375, 0.625, 0.875]), + }, +) + +EXP_PIT_HIST1B = xr.DataArray( # 2 bins + data=[0.625 / 2 + 1.5 / 2 - 0.625 / 2, 1 - 1.5 / 2 + 1 - 1], + dims=["right_endpoint"], + coords={ + "right_endpoint": [0.5, 1], + "left_endpoint": ("right_endpoint", [0, 0.5]), + "bin_centre": ("right_endpoint", [0.25, 0.75]), + }, +) + +DA_PIT_CDF2 = xr.DataArray( # pit_cdf(0) > 0 + data=[[[0.2, 0.5, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1], [nan, nan, nan, nan, nan, nan]]], + dims=["date", "station", "pit_thresh"], + coords={ + "station": [1001, 1002, 1003], + "date": ["2020-01-01"], + "pit_thresh": [0, 0.2, 0.4, 0.6, 0.8, 1], + }, +) + +DA_PIT_CDF3 = xr.DataArray( # support outside [0,1] + data=[[[0.2, 0.2, 0.5, 1]]], + dims=["date", "station", "pit_thresh"], + coords={ + "station": [1001], + "date": ["2020-01-01"], + "pit_thresh": [-1, 0, 0.5, 1], + }, +) + +DA_PIT_CDF4 = xr.DataArray( # support outside [0,1] + data=[[[0, 0.2, 0.5, 0.9]]], + dims=["date", "station", "pit_thresh"], + coords={ + "station": [1001], + "date": ["2020-01-01"], + "pit_thresh": [-1, 0, 1, 2], + }, +) + +EXP_PIT_HIST2A = xr.DataArray( # preserve some dims, 5 bins + data=[[0.5, 0.5, 0, 0, 0], [0, 0, 1, 0, 0], [nan, nan, nan, nan, nan]], + dims=["station", "right_endpoint"], + coords={ + "station": [1001, 1002, 1003], + "right_endpoint": [0.2, 0.4, 0.6, 0.8, 1.0], + "left_endpoint": ("right_endpoint", [0, 0.2, 0.4, 0.6, 0.8]), + "bin_centre": ("right_endpoint", [0.1, 0.3, 5.0, 0.7, 0.9]), + }, +) + 
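+# Note on EXP_PIT_HIST2A above: station 1001 in DA_PIT_CDF2 has pit_cdf(0) = 0.2, i.e. positive
+# probability that PIT = 0. `pit_histogram_values` counts that probability in the first bin, so
+# the first bin value is the full mean PIT CDF value at the bin's right endpoint (here 0.5),
+# not just the increment across the bin.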
+EXP_PIT_HIST2B = xr.DataArray( # collapse all dims + data=[0.625 / 2, 1.5 / 2 - 0.625 / 2, 1 - 1.5 / 2, 1 - 1], + dims=["right_endpoint"], + coords={ + "right_endpoint": [0.25, 0.5, 0.75, 1], + "left_endpoint": ("right_endpoint", [0, 0.25, 0.5, 0.75]), + "bin_centre": ("right_endpoint", [0.125, 0.375, 0.625, 0.875]), + }, +) + +DA_PIT_SCORE1 = xr.DataArray( # support outside [0,1] + data=[ + [[0.0, 0.0, 0.5, 1.0, 1.0, 1.0, 1.0], [0, 0, 0.25, 0.5, 0.75, 1, 1]], + [[0.0, 0.0, 0.25, 0.5, 0.75, 1, 1], [0, 0, 0.25, nan, 0.75, 1, 1]], + [[0.0, 0.0, 0.0, 0.0, 0.5, 1.0, 1.0], [nan, nan, nan, nan, nan, nan, nan]], + ], + dims=["station", "date", "pit_thresh"], + coords={ + "station": [1001, 1002, 1003], + "date": ["2020-01-01", "2020-01-02"], + "pit_thresh": [-1, 0, 0.25, 0.5, 0.75, 1, 2], + }, +) + +EXP_PIT_SCORE1A = xr.Dataset(data_vars={"score": 0.0, "expectation": 1 / 2, "variance": 1 / 12}) + +EXP_PIT_SCORE1B = xr.Dataset( + data_vars={ + "score": ( + ["station", "date"], + [[1 / 24 + 1 / 24, 0], [0, 0], [1 / 24 + 1 / 24, nan]], + ), + "expectation": (["station", "date"], [[0.25, 0.5], [0.5, 0.5], [0.75, nan]]), + "variance": ( + ["station", "date"], + [[1 / (12 * 4), 1 / 12], [1 / 12, 1 / 12], [1 / (12 * 4), nan]], + ), + }, + coords={"station": [1001, 1002, 1003], "date": ["2020-01-01", "2020-01-02"]}, +) + +DA_PIT_SCORE2 = xr.DataArray( # support outside [0,1] + data=[ + [[0.0, 0.0, 0.25, 0.5, 0.75, 1, 1], [0, 0, 0.25, nan, 0.75, 1, 1]], + [[0.0, 0.0, 0.0, 0.0, 0.5, 1.0, 1.0], [nan, nan, nan, nan, nan, nan, nan]], + ], + dims=["station", "date", "pit_thresh"], + coords={ + "station": [1001, 1002], + "date": ["2020-01-01", "2020-01-02"], + "pit_thresh": [-1, 0, 0.25, 0.5, 0.75, 1, 2], + }, +) + + +EXP_PIT_SCORE2 = xr.Dataset( + data_vars={ + "score": (["station"], [0.0, 1 / 24 + 1 / 24]), + "expectation": (["station"], [1 / 2, 0.75]), + "variance": (["station"], [1 / 12, 1 / (12 * 4)]), + }, + coords={"station": [1001, 1002]}, +) diff --git a/tests/probabilty/test_pit.py b/tests/probabilty/test_pit.py new file mode 100644 index 00000000..7ee89f72 --- /dev/null +++ b/tests/probabilty/test_pit.py @@ -0,0 +1,162 @@ +""" +Units tests for `scores.probability.pit`. 
+""" + +import numpy as np +import pytest +import xarray as xr + +from scores.probability.pit import ( + _check_args_with_pits, + _pit_no_ptmass, + _pit_ptmass, + pit, + pit_histogram_values, + pit_scores, +) +from tests.probabilty import pit_test_data as ptd + + +def test__pit_ptmass(): + """Tests `_pit_ptmass`.""" + result = _pit_ptmass(ptd.DA_PIT_VALUES, ptd.ARRAY_PIT_THRESH) + xr.testing.assert_allclose(result, ptd.EXP_PIT_PTMASSS) + + +def test__pit_no_ptmass(): + """Tests `_pit_no_ptmass`.""" + result = _pit_no_ptmass(ptd.DA_PIT_VALUES, ptd.ARRAY_PIT_THRESH) + xr.testing.assert_allclose(result, ptd.EXP_PIT_NOPTMASS) + + +def test_pit(): + """Tests `pit`.""" + result = pit( + ptd.DA_FCST_CDF, + ptd.DA_OBS, + "x", + possible_pointmass=2, + included_pit_thresholds=ptd.ARRAY_PIT_THRESH, + pit_precision=0.05, + ) + + xr.testing.assert_allclose(result, ptd.EXP_PIT) + + +@pytest.mark.parametrize( + ("fcst_cdf", "obs", "fcst_threshold_dim", "pit_precision", "error_msg_snippet"), + [ + ( + ptd.DA_FCST_CDF, + ptd.DA_OBS, + "y", + 0, + "`fcst_threshold_dim` is not a dimension of `fcst_cdf`", + ), + ( + ptd.DA_FCST_CDF2, + ptd.DA_OBS, + "x", + 0, + "values of `fcst_cdf` must be in the closed", + ), + ( + ptd.DA_FCST_CDF3, + ptd.DA_OBS, + "x", + 0, + "coordinates along `fcst_threshold_dim` of `fcst_cdf` must be increasing", + ), + (ptd.DA_FCST_CDF, ptd.DA_OBS2, "x", 0, "'x' is a dimension of `obs`"), + ( + ptd.DA_FCST_CDF, + ptd.DA_OBS3, + "x", + 0, + "Dimensions of `obs` must be a subset of dimensions of `fcst_cdf`", + ), + (ptd.DA_FCST_CDF, ptd.DA_OBS, "x", -1.0, "`pit_precision` must be nonnegative"), + ], +) +def test_pit_raises(fcst_cdf, obs, fcst_threshold_dim, pit_precision, error_msg_snippet): + """Tests that `pit` raises as expected.""" + with pytest.raises(ValueError, match=error_msg_snippet): + pit(fcst_cdf, obs, fcst_threshold_dim, pit_precision=pit_precision) + + +@pytest.mark.parametrize( + ("pit_cdf", "n_bins", "expected"), + [ + (ptd.DA_PIT_CDF1, 4, ptd.EXP_PIT_HIST1A), + (ptd.DA_PIT_CDF1, 2, ptd.EXP_PIT_HIST1B), + (ptd.DA_PIT_CDF2, 4, ptd.EXP_PIT_HIST2B), + ], +) +def test_pit_histogram_values(pit_cdf, n_bins, expected): + """Tests `pit_histogram_values` with a variety of imputs.""" + result = pit_histogram_values(pit_cdf, pit_threshold_dim="pit_thresh", n_bins=n_bins) + xr.testing.assert_allclose(result, expected) + + +def test_pit_histogram_values2(): + """ + Tests `pit_histogram_values`. Uses `xr.testing.assert_allclose` + fails due to tiny floating point arithmetic differences in calculated coordinates. 
+ """ + result = pit_histogram_values(ptd.DA_PIT_CDF2, pit_threshold_dim="pit_thresh", n_bins=5, dims=["station"]) + np.testing.assert_allclose(result, ptd.EXP_PIT_HIST2A) + + +@pytest.mark.parametrize( + ("pit_cdf", "pit_threshold_dim", "dims", "error_msg_snippet"), + [ + (ptd.DA_PIT_CDF2, "x", None, "`pit_threshold_dim` is not a dimension"), + ( + ptd.DA_PIT_CDF2, + "pit_thresh", + ["pit_thresh"], + "`pit_threshold_dim` is in `dims`", + ), + (ptd.DA_PIT_CDF2, "pit_thresh", ["zzz"], "`dims` is not a subset of"), + (ptd.DA_FCST_CDF2, "x", None, "values of `pit_cdf` must be in the closed"), + ( + ptd.DA_PIT_CDF3, + "pit_thresh", + None, + "`pit_cdf` is not 0 when `pit_cdf\\[pit_threshold_dim\\] < 0`", + ), + ( + ptd.DA_PIT_CDF4, + "pit_thresh", + None, + "`pit_cdf` is not 1 when `pit_cdf\\[pit_threshold_dim\\] > 1`", + ), + ], +) +def test__check_args_with_pits(pit_cdf, pit_threshold_dim, dims, error_msg_snippet): + """Tests that `_check_args_with_pits` raises as expected.""" + with pytest.raises(ValueError, match=error_msg_snippet): + _check_args_with_pits(pit_cdf, pit_threshold_dim, dims) + + +def test_pit_histogram_raises(): + """ + Tests that `pit_histogram_values` raises as expected. Note that all but + one raises tests for `pit_histogram_values` are performed by `test__check_args_with_pits`. + """ + with pytest.raises(ValueError, match="`n_bins` must be at least 1"): + pit_histogram_values(ptd.DA_PIT_CDF2, pit_threshold_dim="pit_thresh", n_bins=-10, dims=None) + + +@pytest.mark.parametrize( + ("pit_cdf", "dims", "expected"), + [ + (ptd.DA_PIT_SCORE1, None, ptd.EXP_PIT_SCORE1A), + (ptd.DA_PIT_SCORE1, ["station", "date"], ptd.EXP_PIT_SCORE1B), + (ptd.DA_PIT_SCORE2, ["station"], ptd.EXP_PIT_SCORE2), + ], +) +def test_pit_scores(pit_cdf, dims, expected): + """Tests `pit_scores`""" + result = pit_scores(pit_cdf, "pit_thresh", dims) + xr.testing.assert_allclose(result, expected) diff --git a/tests/processing/cdf/test_functions.py b/tests/processing/cdf/test_functions.py index b3ef1f24..9086e818 100644 --- a/tests/processing/cdf/test_functions.py +++ b/tests/processing/cdf/test_functions.py @@ -16,6 +16,13 @@ propagate_nan, round_values, ) +from scores.processing.cdf.cdf_functions import ( + _var_from_cdf, + check_cdf, + check_cdf_support, + expectedvalue_from_cdf, + variance_from_cdf, +) from tests.probabilty import cdf_test_data, crps_test_data from tests.processing.cdf import functions_test_data as ftd @@ -213,3 +220,97 @@ def test_decreasing_cdfs(cdf, tolerance, expected): """Tests `decreasing_cdfs` with a variety of inputs.""" result = decreasing_cdfs(cdf, "x", tolerance) xr.testing.assert_equal(result, expected) + + +@pytest.mark.parametrize( + ("cdf", "threshold_dim", "dims", "error_msg_snippet"), + [ + ( + cdf_test_data.DA_OBSERVED_CDF, + "crazy", + None, + "`thresh_dim` is not a dimension of `da_cdf`", + ), + ( + cdf_test_data.DA_OBSERVED_CDF, + "date", + ["date", "station"], + "`thresh_dim` is in `my_dims`", + ), + ( + cdf_test_data.DA_OBSERVED_CDF, + "station", + ["date", "bird"], + "`my_dims` is not a subset of dimensions of `da_cdf`", + ), + ( + cdf_test_data.DA_WITHIN_BOUNDS3, + "x", + None, + "values of `da_cdf` must be in the closed interval", + ), + ( + cdf_test_data.DA_NAN_DECREASING_CDFS2, + "x", + None, + "coordinates along `thresh_dim` of `da_cdf` must be increasing", + ), + ], +) +def test_check_cdf(cdf, threshold_dim, dims, error_msg_snippet): + """Tests that `check_cdf` raises an exceptions as expected.""" + with pytest.raises(ValueError, match=error_msg_snippet): + 
check_cdf(cdf, threshold_dim, dims, "da_cdf", "thresh_dim", "my_dims") + + +@pytest.mark.parametrize( + ("cdf", "lower_supp", "upper_supp"), + [ + (cdf_test_data.DA_SUPPORT1, -np.inf, np.inf), + (cdf_test_data.DA_SUPPORT1, 0, 1), + (cdf_test_data.DA_SUPPORT2, 1, 2), + (cdf_test_data.DA_SUPPORT3, 1, 2), + ], +) +def test_check_cdf_support(cdf, lower_supp, upper_supp): + """Tests that `check_cdf_support` does not raise as expected.""" + check_cdf_support(cdf, "x", lower_supp, upper_supp, "my_cdf", "my_dim") + + +@pytest.mark.parametrize( + ("cdf", "lower_supp", "upper_supp", "error_msg_snippet"), + [ + (cdf_test_data.DA_SUPPORT1, 3, 1, "`upper_supp < lower_supp`"), + (cdf_test_data.DA_SUPPORT1, np.inf, 9, "`upper_supp < lower_supp`"), + (cdf_test_data.DA_SUPPORT1, 2, 3, "`my_cdf` is not 0 when `my_cdf\\[my_dim\\] < 2`"), + (cdf_test_data.DA_SUPPORT1, -2, 0.9, "`my_cdf` is not 1 when `my_cdf\\[my_dim\\] > 0.9`"), + ], +) +def test_check_cdf_support_raises(cdf, lower_supp, upper_supp, error_msg_snippet): + """Tests that `check_cdf_support` raises as expected.""" + with pytest.raises(ValueError, match=error_msg_snippet): + check_cdf_support(cdf, "x", lower_supp, upper_supp, "my_cdf", "my_dim") + + +def test_expectedvalue_from_cdf(): + """Tests that `expectedvalue_from_cdf` gives correct output.""" + result = expectedvalue_from_cdf(cdf_test_data.DA_CDF_EXPECTEDVALUE, "x") + xr.testing.assert_allclose(result, cdf_test_data.EXP_EXPECTEDVALUE) + + +def test_expectedvalue_from_cdf_raises(): + """Tests that `expectedvalue_from_cdf` raises if `nonnegative_support = False`.""" + with pytest.raises(ValueError, match="This function currently only handles"): + expectedvalue_from_cdf(cdf_test_data.DA_CDF_EXPECTEDVALUE, "x", nonnegative_support=False) + + +def test___var_from_cdf(): + """Tests that `_var_from_cdf` gives correct output.""" + result = _var_from_cdf(cdf_test_data.DA_FUNCVALS, "x") + xr.testing.assert_allclose(result, cdf_test_data.EXP_VARCDF) + + +def test_variance_from_cdf(): + """Tests that 'variance_from_cdf' gives correct output.""" + result = variance_from_cdf(cdf_test_data.DA_CDF_VARIANCE, "x") + xr.testing.assert_allclose(result, cdf_test_data.EXP_VARIANCE)
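
Usage sketch (illustrative only, not part of the patch): the snippet below shows how the three
new public functions might be chained together. The station data, rainfall thresholds and
coordinate values are invented for this example.

import xarray as xr

from scores.probability.pit import pit, pit_histogram_values, pit_scores

# Toy forecast CDFs for three stations at rainfall thresholds 0..4 mm (invented data);
# point mass at 0 mm is possible (e.g. station 2 has fcst_cdf(0) = 0.3).
fcst_cdf = xr.DataArray(
    [[0.0, 0.2, 0.6, 0.9, 1.0],
     [0.3, 0.5, 0.8, 1.0, 1.0],
     [0.0, 0.1, 0.3, 0.7, 1.0]],
    dims=["station", "threshold"],
    coords={"station": [1, 2, 3], "threshold": [0, 1, 2, 3, 4]},
)
obs = xr.DataArray([1.5, 0.0, 3.0], dims=["station"], coords={"station": [1, 2, 3]})

# PIT CDFs for each station, then a 5-bin PIT histogram and the Taggart (2022) scores.
pit_cdf = pit(fcst_cdf, obs, fcst_threshold_dim="threshold", possible_pointmass=0)
hist = pit_histogram_values(pit_cdf, n_bins=5)
calibration = pit_scores(pit_cdf)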