diff --git a/.pylintrc b/.pylintrc index b840463..ace1a5e 100644 --- a/.pylintrc +++ b/.pylintrc @@ -10,7 +10,10 @@ disable= C0302, # allow too many lines in module C0411, # allow custom import order + E0606, # allow false positive used-before-assignment + R0801, # allow similar lines in 2 files + R0915, # allow too many statements W0105, # allow no effect string statement W0102, # allow dangerous default value [] diff --git a/deel/puncc/api/nonconformity_scores.py b/deel/puncc/api/nonconformity_scores.py index 6c7555c..81971c8 100644 --- a/deel/puncc/api/nonconformity_scores.py +++ b/deel/puncc/api/nonconformity_scores.py @@ -24,7 +24,7 @@ This module provides nonconformity scores for conformal prediction. To be used when building a :ref:`calibrator `. """ -import pkgutil +import importlib from typing import Callable from typing import Iterable @@ -33,13 +33,13 @@ from deel.puncc.api.utils import logit_normalization_check from deel.puncc.api.utils import supported_types_check -if pkgutil.find_loader("pandas") is not None: +if importlib.util.find_spec("pandas") is not None: import pandas as pd -if pkgutil.find_loader("tensorflow") is not None: +if importlib.util.find_spec("tensorflow") is not None: import tensorflow as tf -if pkgutil.find_loader("torch") is not None: +if importlib.util.find_spec("torch") is not None: import torch @@ -176,7 +176,7 @@ def difference(y_pred: Iterable, y_true: Iterable) -> Iterable: """ supported_types_check(y_pred, y_true) - if pkgutil.find_loader("torch") is not None and isinstance( + if importlib.util.find_spec("torch") is not None and isinstance( y_pred, torch.Tensor ): y_pred = y_pred.cpu().detach().numpy() @@ -205,7 +205,9 @@ def absolute_difference(y_pred: Iterable, y_true: Iterable) -> Iterable: return abs(difference(y_pred, y_true)) -def scaled_ad(Y_pred: Iterable, y_true: Iterable, eps: float = 1e-12) -> Iterable: +def scaled_ad( + Y_pred: Iterable, y_true: Iterable, eps: float = 1e-12 +) -> Iterable: """Scaled Absolute Deviation, normalized by an estimation of the conditional mean absolute deviation (conditional MAD). Considering :math:`Y_{\\text{pred}} = (\mu_{\\text{pred}}, \sigma_{\\text{pred}})`: @@ -235,7 +237,7 @@ def scaled_ad(Y_pred: Iterable, y_true: Iterable, eps: float = 1e-12) -> Iterabl if len(y_true.shape) != 1: raise RuntimeError("Each y_true must contain a point observation.") - if pkgutil.find_loader("pandas") is not None and isinstance( + if importlib.util.find_spec("pandas") is not None and isinstance( Y_pred, pd.DataFrame ): y_pred, sigma_pred = Y_pred.iloc[:, 0], Y_pred.iloc[:, 1] @@ -245,8 +247,10 @@ def scaled_ad(Y_pred: Iterable, y_true: Iterable, eps: float = 1e-12) -> Iterabl # MAD then Scaled MAD and computed mean_absolute_deviation = absolute_difference(y_pred, y_true) if np.any(sigma_pred + eps <= 0): - print("Warning: calibration points with MAD predictions" - " below -eps won't be used for calibration.") + print( + "Warning: calibration points with MAD predictions" + " below -eps won't be used for calibration." 
+ ) nonneg = sigma_pred + eps > 0 return mean_absolute_deviation[nonneg] / (sigma_pred[nonneg] + eps) @@ -282,7 +286,7 @@ def cqr_score(Y_pred: Iterable, y_true: Iterable) -> Iterable: if len(y_true.shape) != 1: raise RuntimeError("Each y_pred must contain a point observation.") - if pkgutil.find_loader("pandas") is not None and isinstance( + if importlib.util.find_spec("pandas") is not None and isinstance( Y_pred, pd.DataFrame ): q_lo, q_hi = Y_pred.iloc[:, 0], Y_pred.iloc[:, 1] @@ -295,7 +299,7 @@ def cqr_score(Y_pred: Iterable, y_true: Iterable) -> Iterable: if isinstance(diff_lo, np.ndarray): return np.maximum(diff_lo, diff_hi) - if pkgutil.find_loader("pandas") is not None and isinstance( + if importlib.util.find_spec("pandas") is not None and isinstance( diff_lo, (pd.DataFrame, pd.Series) ): return (pd.concat([diff_lo, diff_hi]).groupby(level=0)).max() @@ -303,12 +307,12 @@ def cqr_score(Y_pred: Iterable, y_true: Iterable) -> Iterable: # "CQR score not implemented for DataFrames. Please provide ndarray or tensors." # ) - if pkgutil.find_loader("tensorflow") is not None and isinstance( + if importlib.util.find_spec("tensorflow") is not None and isinstance( diff_lo, tf.Tensor ): return tf.math.maximum(diff_lo, diff_hi) - # if pkgutil.find_loader("torch") is not None and isinstance( + # if importlib.util.find_spec("torch") is not None and isinstance( # diff_lo, torch.Tensor # ): # return torch.maximum(diff_lo, diff_hi) diff --git a/deel/puncc/api/splitting.py b/deel/puncc/api/splitting.py index bcf65f7..25ff7c7 100644 --- a/deel/puncc/api/splitting.py +++ b/deel/puncc/api/splitting.py @@ -23,7 +23,7 @@ """ This module provides data splitting schemes. """ -import pkgutil +import importlib from abc import ABC from typing import Iterable from typing import List @@ -36,7 +36,7 @@ from deel.puncc.api.utils import sample_len_check from deel.puncc.api.utils import supported_types_check -if pkgutil.find_loader("pandas") is not None: +if importlib.util.find_spec("pandas") is not None: import pandas as pd @@ -171,7 +171,7 @@ def __call__( folds = [] for fit, calib in kfold.split(X): - if pkgutil.find_loader("pandas") is not None and isinstance( + if importlib.util.find_spec("pandas") is not None and isinstance( X, pd.DataFrame ): if isinstance(y, pd.DataFrame): diff --git a/deel/puncc/api/utils.py b/deel/puncc/api/utils.py index e06c01c..5b57695 100644 --- a/deel/puncc/api/utils.py +++ b/deel/puncc/api/utils.py @@ -24,7 +24,7 @@ This module implements utility functions. 
""" import logging -import pkgutil +import importlib import sys from typing import Any from typing import Iterable @@ -34,13 +34,13 @@ import numpy as np -if pkgutil.find_loader("pandas") is not None: +if importlib.util.find_spec("pandas") is not None: import pandas as pd -if pkgutil.find_loader("tensorflow") is not None: +if importlib.util.find_spec("tensorflow") is not None: import tensorflow as tf -if pkgutil.find_loader("torch") is not None: +if importlib.util.find_spec("torch") is not None: import torch logger = logging.getLogger(__name__) @@ -126,15 +126,15 @@ def supported_types_check(*data: Iterable): if isinstance(a, np.ndarray): pass - elif pkgutil.find_loader("pandas") is not None and isinstance( + elif importlib.util.find_spec("pandas") is not None and isinstance( a, (pd.DataFrame, pd.Series) ): pass - elif pkgutil.find_loader("tensorflow") is not None and isinstance( + elif importlib.util.find_spec("tensorflow") is not None and isinstance( a, tf.Tensor ): pass - elif pkgutil.find_loader("torch") is not None and isinstance( + elif importlib.util.find_spec("torch") is not None and isinstance( a, torch.Tensor ): pass @@ -277,15 +277,15 @@ def quantile( if isinstance(a, np.ndarray): pass - elif pkgutil.find_loader("pandas") is not None and isinstance( + elif importlib.util.find_spec("pandas") is not None and isinstance( a, pd.DataFrame ): a = a.to_numpy() - elif pkgutil.find_loader("tensorflow") is not None and isinstance( + elif importlib.util.find_spec("tensorflow") is not None and isinstance( a, tf.Tensor ): a = a.numpy() - # elif pkgutil.find_loader("torch") is not None: + # elif importlib.util.find_spec("torch") is not None: # if isinstance(a, torch.Tensor): # a = a.cpu().detach().numpy() else: diff --git a/deel/puncc/plotting.py b/deel/puncc/plotting.py index b750d9c..09659d8 100644 --- a/deel/puncc/plotting.py +++ b/deel/puncc/plotting.py @@ -155,6 +155,9 @@ def plot_prediction_intervals( """ + # Initialisation + current_rcparams = None + # Figure size configuration if "figsize" in fig_kw.keys(): figsize = fig_kw["figsize"] @@ -256,7 +259,7 @@ def plot_prediction_intervals( ax.set_xlim(X[0] - int_size * 0.01, X[-1] + int_size * 0.01) # restablish rcparams - if restablish_rcparams: + if current_rcparams is not None and restablish_rcparams: matplotlib.rcParams.update(current_rcparams) return ax diff --git a/docs/source/theory_overview.rst b/docs/source/theory_overview.rst index 0af62a8..8fbfd8d 100644 --- a/docs/source/theory_overview.rst +++ b/docs/source/theory_overview.rst @@ -20,26 +20,46 @@ Depending on the application fields of machine learning models, uncertainty can Conformal Prediction -------------------- -Conformal Prediction (CP) is a set of *distribution-free*, *model-agnostic* and -*non-asymptotic* methods to estimate uncertainty by constructing **valid** *prediction sets*, i.e. with guaranteed probability of marginal coverage. +Conformal Prediction (CP) is a set of methods to estimate uncertainty +by constructing by constructing **valid** *prediction sets*, +i.e. prediction sets with a probabilistic guarantee +of marginal coverage. +The following three features make CP methods particularly attractive: + - *Distribution-free*. CP methods can be applied regardless of the underlying data-generating distribution. + - *Model-agnostic*. CP works with any ML model, even with black-box models where we only have access to the outputs of the model. + - *Non-asymptotic*. 
CP methods provide finite-sample probabilistic guarantees, that is, the guarantees hold without the need to assume that the number of available samples grows to infinity. Given an error rate (or significance level) :math:`\alpha \in (0,1)`, set by the user, a set of exchangeable (or more simply i.i.d.) -train data :math:`\{ (X_i, Y_i) \}_{i=1}^{n}` and test point -:math:`(X_{new}, Y_{new})` generated for a joint distribution :math:`\mathbb{P}_{XY}`, -a conformal prediction procedure builds prediction sets :math:`{C}_{\alpha}(\cdot)` so that: +train data :math:`\{ (X_i, Y_i) \}_{i=1}^{n}` and a test point +:math:`(X_{new}, Y_{new})`, +all of which are generated from the same joint distribution :math:`\mathbb{P}_{XY}`, +a conformal prediction procedure uses the training data +to build prediction sets :math:`\widehat{C}_{\alpha}(\cdot)` so that: .. math:: - \mathbb{P} \Big\{ Y_{new} \in {C}_{\alpha}\left(X_{new}\right) \Big\} \geq 1 - \alpha. + \mathbb{P} \Big\{ Y_{new} \in \widehat{C}_{\alpha}\left(X_{new}\right) \Big\} \geq 1 - \alpha. -Over many calibration and test sets, :math:`{C}_{\alpha}(X_{new})` will contain +Over many calibration and test sets, :math:`\widehat{C}_{\alpha}(X_{new})` will contain the observed values of :math:`Y_{new}` with frequency of *at least* :math:`(1-\alpha)`. -Within the conformal prediction framework, the inequality above holds for any model, -any data distribution :math:`\mathbb{P}_{XY}` and any finite sample sizes. +.. + Within the conformal prediction framework, the inequality above holds for any model, + any data distribution :math:`\mathbb{P}_{XY}` and any finite sample sizes. + +Usually, the conformal prediction method uses a point-predictor model :math:`\widehat{f}` +and turns it into the set predictor :math:`\widehat{C}_\alpha` +via a calibration procedure. +Within the conformal prediction framework, +the inequality above holds for any model, +any data distribution :math:`\mathbb{P}_{XY}` and any training set sample size, under the following minimal assumptions: + - *Exchangeability*. The data :math:`(X_1,Y_1),\dots, (X_n, Y_n), (X_{new}, Y_{new})` form an exchangeable sequence (this is a milder assumption than the data being i.i.d.). + - *Independence of train and calibration data.* The data used for model training is independent of the data used for model calibration. + It is noteworthy that the coverage probability is marginalized over :math:`X`. -Therefore, it is likely to undercover conditionally to some specific regions in the space of :math:`X`. +Therefore, the CP algorithm is likely to achieve the coverage rate of :math:`1-\alpha` +by under-covering in some specific regions of the space of :math:`X` and over-covering in other regions. Conformal prediction can act as a *post-processing procedure* to attain rigorous probability coverages, as it can "conformalize" any existing predictor during or after training (black box predictors), @@ -50,7 +70,7 @@ literature used on regression and classification models. We also refer to Angelopoulos and Bates [Angelopoulos2022]_ for a hands-on introduction to conformal prediction and awesome conformal prediction `github `_ for additional ressources. -In the following, let :math:`D_{train} = {(X_i, Y_i)}_{i=1..n_{train}} \sim P_{XY}` +In the following, let :math:`D = {(X_i, Y_i)}_{i=1}^n \sim P_{XY}` be the training data and :math:`\alpha \in (0, 1)` the significance level (target maximum error rate). Conformal Regression -------------------- Split (inductive) Conformal *************************** ..
_theory splitcp: The split (also called inductive) conformal prediction [Papadopoulos2002]_ [Lei2018]_ requires a hold-out calibration -dataset :math:`D_{calibration}` to estimate prediction errors and use them to build the prediction interval for a new sample :math:`X_{new}`. +dataset: the dataset :math:`D` is split into a proper training set +:math:`D_{train}=\big\lbrace(X_i,Y_i), i=1,\dots,n_{train}\big\rbrace` +and an independent calibration dataset :math:`D_{calib}=\big\lbrace(X_i,Y_i),i=1,\dots,n_{calib}\big\rbrace`. +The purpose of the calibration dataset is +to estimate prediction errors and use them to build the prediction interval for a new sample :math:`X_{new}`. Given a prediction model :math:`\widehat{f}` trained on :math:`D_{train}`, the algorithm is summarized in the following: -#. Choose a nonconformity score :math:`s`: :math:`R = s(\widehat{f}(X),Y)`. For example, one can pick the mean absolute deviation :math:`R = |\widehat{f}(X)-Y|`. -#. Compute the nonconformity scores on the calibration dataset: :math:`\bar{R} = \{R_i\}_{}`, for :math:`i=1,\dots,|D_{calibration}|`, where :math:`|D_{calibration}|` is the cardinality of :math:`D_{calibration}`. -#. Compute the error margin :math:`\delta_{\alpha}` as the :math:`(1-\alpha)(1 + \frac{1}{| D_{calibration} |})`-th empirical quantile of :math:`\bar{R}`. -#. Build the prediction interval :math:`\widehat{C}_{\alpha}(X_{new}) = \Big[ \widehat{f}(X_{new}) - \delta_{\alpha}^{f} \,,\, \widehat{f}(X_{new}) + \delta_{\alpha}^{f} \Big]`. +#. Choose a nonconformity score :math:`s`, and define the error :math:`R` over a sample :math:`(X,Y)` as :math:`R = s(\widehat{f}(X),Y)`. For example, one can pick the absolute deviation :math:`R = |\widehat{f}(X)-Y|`. +#. Compute the nonconformity scores on the calibration dataset: :math:`\mathcal{R} = \{R_i\}`, where :math:`R_i=s(\widehat{f}(X_i), Y_i)` for :math:`i=1,\dots,n_{calib}`. +#. Compute the error margin :math:`\delta_{\alpha}` as the :math:`(1-\alpha)(1 + 1/n_{calib})`-th empirical quantile of :math:`\mathcal{R}`. +#. Build the prediction interval as + +.. math:: + + \widehat{C}_{\alpha}(X_{new}) = \Big[ \widehat{f}(X_{new}) - \delta_{\alpha} \,,\, \widehat{f}(X_{new}) + \delta_{\alpha} \Big]. Note that this procedure yields a constant-width prediction interval centered on the point estimate :math:`\widehat{f}(X_{new})`.
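+
+As a purely illustrative sketch (not the puncc API), the margin computation and
+interval construction above could be written with plain NumPy as follows; the
+fitted point predictor ``f`` and the calibration/test arrays are assumptions of
+the example:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def split_conformal_interval(f, X_calib, y_calib, X_new, alpha=0.1):
+        """Sketch of split conformal prediction with absolute-deviation scores."""
+        scores = np.abs(f.predict(X_calib) - y_calib)      # nonconformity scores
+        n_calib = len(scores)
+        # (1 - alpha)(1 + 1/n_calib)-th empirical quantile of the scores
+        level = min(1.0, (1 - alpha) * (1 + 1.0 / n_calib))
+        delta = np.quantile(scores, level, method="higher")
+        y_pred = f.predict(X_new)
+        return y_pred - delta, y_pred + delta
+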
@@ -105,15 +133,22 @@ Conformalized Quantile Regression (CQR) ####################################### .. _theory cqr: -Split conformal prediction can be extended to `quantile predictors `_ :math:`q(\cdot)` -by using the nonconformity score: +Split conformal prediction can be extended to `quantile predictors `_ :math:`q(\cdot)`. +Given a nominal error rate :math:`\alpha,` +and positive error rates :math:`\alpha_{lo}` +and :math:`\alpha_{hi}` +such that :math:`\alpha_{lo}+\alpha_{hi}=\alpha,` +we denote by :math:`\widehat{q}_{\alpha_{lo}}` and +:math:`\widehat{q}_{1-\alpha_{hi}}` +the predictors of the :math:`\alpha_{lo}` *-th* and :math:`(1-\alpha_{hi})` *-th* quantiles of :math:`Y | X.` +The quantile predictors are trained on :math:`D_{train}` +and calibrated on :math:`D_{calib}` +by using the following nonconformity score: .. math:: R_i^{} = \text{max}\{ \widehat{q}_{\alpha_{lo}}(X_i) - Y_i, Y_i - \widehat{q}_{1 - \alpha_{hi}}(X_i)\}, -for :math:`i=1,\dots,|D_{calibration}|`. :math:`\widehat{q}_{\alpha_{lo}}` and :math:`\widehat{q}_{1-\alpha_{hi}}` are -the predictors of the :math:`\alpha_{lo}` *-th* and :math:`(1-\alpha_{hi})` *-th* quantiles of :math:`Y | X`, respectively. For example, if we set :math:`\alpha = 0.1`, we would fit two predictors :math:`\widehat{q}_{0.05}(\cdot)` and :math:`\widehat{q}_{0.95}(\cdot)` on training data :math:`D_{train}` and compute the scores on :math:`D_{calibration}`. @@ -129,7 +164,7 @@ The procedure, named *Conformalized Quantile Regression* [Romano2019]_, yields t When data are exchangeable, the correction margin :math:`\delta_{\alpha}` guarantees finite-sample marginal coverage for the quantile predictions, and this holds also for misspecified (i.e. "bad") predictors. -If the fitted :math:`\widehat{q}_{\alpha_{lo}}` and :math:`\widehat{q}_{1-\alpha_{hi}}` approximate (empirically) well the conditional distribution :math:`Y | X` of the data, we will get a small margin :math:`\delta_{\alpha}`: this means that on average, the prediction errors on the :math:`D_{calibration}` were small. +If the fitted :math:`\widehat{q}_{\alpha_{lo}}` and :math:`\widehat{q}_{1-\alpha_{hi}}` approximate (empirically) well the conditional distribution :math:`Y | X` of the data, we will get a small margin :math:`\delta_{\alpha}`: this means that on average, the prediction errors on the :math:`D_{calib}` were small. Also, if the base predictors have strong theoretical properties, our CP procedure inherits these properties of :math:`\widehat{q}_{}(\cdot)`. We could have an asymptotically, conditionally accurate predictor and also have a theoretically valid, distribution-free guarantee on the marginal coverage!
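+
+Purely as an illustration (and not the puncc API), the CQR calibration and
+interval construction could be sketched with NumPy as follows; the fitted
+quantile regressors ``q_lo`` and ``q_hi`` are assumptions of the example:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def cqr_interval(q_lo, q_hi, X_calib, y_calib, X_new, alpha=0.1):
+        """Sketch of Conformalized Quantile Regression (CQR)."""
+        lo, hi = q_lo.predict(X_calib), q_hi.predict(X_calib)
+        # CQR nonconformity scores: max(q_lo(X) - Y, Y - q_hi(X))
+        scores = np.maximum(lo - y_calib, y_calib - hi)
+        n_calib = len(scores)
+        level = min(1.0, (1 - alpha) * (1 + 1.0 / n_calib))
+        delta = np.quantile(scores, level, method="higher")
+        return q_lo.predict(X_new) - delta, q_hi.predict(X_new) + delta
+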
@@ -175,10 +210,10 @@ If :math:`K = n`, we obtain the *Jackknife+*, **leave-one-out** version of the a The lower and upper bounds of the prediction interval are given by: - 1. Compute :math:`\bar{R}_{L} = \{ \widehat{f}_{-S_{k(i)}}(X_{new}) - R_i^{CV} \}_{i=1}^{n}` - 2. :math:`\widehat{L}_{\alpha}(X_{new}) = \lfloor \alpha (n+1) \rfloor`-th smallest value in :math:`\bar{R}_{L}` (lower bound) - 3. Compute :math:`\bar{R}_{U} = \{ \widehat{f}_{-S_{k(i)}}(X_{new}) + R_i^{CV} \}_{i=1}^{n}` - 4. :math:`\widehat{U}_{\alpha}(X_{new}) = \lceil (1-\alpha) (n+1) \rceil`-th smallest value in :math:`\bar{R}_{U}` (upper bound) + #. Compute :math:`\bar{R}_{L} = \{ \widehat{f}_{-S_{k(i)}}(X_{new}) - R_i^{CV} \}_{i=1}^{n}` + #. :math:`\widehat{L}_{\alpha}(X_{new}) = \lfloor \alpha (n+1) \rfloor`-th smallest value in :math:`\bar{R}_{L}` (lower bound) + #. Compute :math:`\bar{R}_{U} = \{ \widehat{f}_{-S_{k(i)}}(X_{new}) + R_i^{CV} \}_{i=1}^{n}` + #. :math:`\widehat{U}_{\alpha}(X_{new}) = \lceil (1-\alpha) (n+1) \rceil`-th smallest value in :math:`\bar{R}_{U}` (upper bound) .. math:: @@ -190,9 +225,63 @@ Ensemble Batch Prediction Intervals (EnbPI) ******************************************* .. _theory enbpi: -Source: [Xu2021]_ -TBC +Introduced in [Xu2021]_, +the EnbPI algorithm builds prediction intervals +for time series data of the form +:math:`Y_t = f(X_t) + \epsilon_t`, +where :math:`\epsilon_t` are identically distributed, +but not necessarily independent. +Given a training data set :math:`D=\lbrace (X_i, Y_i) \rbrace_{i=1}^n` +and a test set :math:`D_{test} = \lbrace (X_t,Y_t) \rbrace_{t=n+1}^{n+n_{test}}`, +the EnbPI algorithm aims at constructing prediction sets +for each test point :math:`X_t`. +As with the CV+ or Jackknife+ methods, +the EnbPI algorithm does not require a held-out calibration set, +as it uses a bootstrap algorithm instead. +Let :math:`\mathcal{A}` be a training algorithm +(i.e. an algorithm that maps a dataset to a predictor), +and :math:`\phi` an aggregation function +that aggregates different individual models together, +e.g. via a simple average, a bagging or an ensembling method. +The algorithm EnbPI is performed in three stages: + +**Training** + #. Sample :math:`B` bootstrap data sets :math:`S_b`, for :math:`b=1,\dots, B` with replacement from :math:`D`. + #. Train :math:`B` bootstrap models :math:`\widehat{f}^b = \mathcal{A}(S_b)`. + +**Calibration** + #. Compute the predictions on each training sample :math:`X_i\in D`. Only the models :math:`\widehat{f}^b` where :math:`X_i\not\in S_b` are used in the aggregation: :math:`\widehat{f}_{-i}(X_i):=\phi\big( \lbrace \widehat{f}^b(X_i) | X_i\not\in S_b\rbrace\big)`. + #. Compute the errors :math:`R_i=|Y_i-\widehat{f}_{-i}(X_i)|`, and store them as :math:`\mathcal{R}_1:=\lbrace R_i,i=1,\dots, n\rbrace`. + +**Inference** + #. Compute the predictions on each test sample :math:`X_t\in D_{test}` by setting :math:`\widehat{f}_{-t}(X_t):= \frac{1}{n}\sum_{i=1}^n \widehat{f}_{-i}(X_t)`. + #. Update the error set :math:`\mathcal{R}_t` (see below). + #. Compute the width of the prediction intervals :math:`\delta_{\alpha, t}` as the :math:`(1-\alpha)`-th empirical quantile of :math:`\mathcal{R}_t`. + + +The prediction interval for :math:`X_t` is then given by + +.. math:: + + \widehat{C}_{\alpha}(X_t) = \big[ \widehat{f}_{-t}(X_t)-\delta_{\alpha, t}, \widehat{f}_{-t}(X_t)+\delta_{\alpha, t}\big]. + +In order to update the error set :math:`\mathcal{R}_t`, +a *memory* parameter :math:`s` is employed. +Every :math:`s` test examples, the first :math:`s` errors in the set +:math:`\mathcal{R}` are dropped and the errors over the last :math:`s` +test examples are added to the error set :math:`\mathcal{R}`. +That is, if :math:`t-n \equiv 0 \pmod{s}` then :math:`\mathcal{R}_t = \lbrace R_i, i=t-n,\dots,t-1\rbrace` +and if :math:`t-n \not\equiv 0 \pmod{s}` then :math:`\mathcal{R}_t=\mathcal{R}_{t-1}`. + + +.. note:: + + The EnbPI algorithm does not provide an exact probabilistic guarantee, unlike the previous CP methods. + The guarantee provided by the EnbPI algorithm is only approximate, + and holds under additional assumptions on the error process + :math:`\epsilon_t`. However, it does not require the data to be exchangeable. + .. Introduced in [Xu2021]_, the EnbPI algorithms builds prediction intervals for time series data of the form :math:`Y_t = f(X_t) + \epsilon_t`, where :math:`\epsilon_t` are identically distributed. .. Unlike the proper conformal algorithms seen above, EnbPI requires some additional hypothesis to attain the coverage guarantee.
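+
+The training and calibration stages described above can be sketched as follows;
+this is an illustrative example rather than the puncc implementation, and the
+model factory ``make_model`` (returning a fresh, fittable regressor) is an
+assumption of the sketch:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def enbpi_fit(make_model, X, y, B=30, seed=0):
+        """Sketch of the EnbPI training and calibration stages."""
+        rng = np.random.default_rng(seed)
+        n = len(y)
+        models, in_bag = [], []
+        for _ in range(B):
+            idx = rng.integers(0, n, size=n)             # bootstrap with replacement
+            models.append(make_model().fit(X[idx], y[idx]))
+            in_bag.append(np.isin(np.arange(n), idx))    # samples seen by this model
+        residuals = np.empty(n)
+        for i in range(n):
+            # aggregate only the models that did not see (X_i, Y_i);
+            # with B large enough, every i is out-of-bag for at least one model
+            preds = [m.predict(X[i:i + 1])[0]
+                     for m, bag in zip(models, in_bag) if not bag[i]]
+            residuals[i] = abs(y[i] - np.mean(preds))
+        return models, residuals
+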
@@ -226,9 +315,41 @@ Adaptive Prediction Sets (APS) ******************************************* .. _theory aps: -Source: [Romano2020]_ +As with the Split Conformal Regression algorithm, +the APS algorithm introduced in [Romano2020]_ +requires us to split the data set :math:`D` into a proper training set :math:`D_{train}` +and an independent calibration set :math:`D_{calib}`. +A classifier :math:`\widehat{\pi}` is trained +using the proper training set :math:`D_{train}` only. +We assume that the output of the classifier is given by the softmax scores for the different classes. +That is, for each input :math:`x`, +the output :math:`\widehat{\pi}(x)=(\widehat{\pi}_1(x),\dots,\widehat{\pi}_K(x))` +is a probability vector and :math:`k=1,\dots, K` +indexes the possible classes in the classification task. + We represent by :math:`\widehat{\pi}_{(1)}(x)\geq \cdots\geq \widehat{\pi}_{(K)}(x)` + the softmax vector :math:`\widehat{\pi}` arranged in decreasing order, + i.e. :math:`(k)` is the index of the class having the :math:`k`-th largest probability mass. + +In order to construct the prediction sets :math:`\widehat{C}_\alpha`, +the APS algorithm works in two stages: + +**Calibration** + #. For each example :math:`X_i` in the calibration data set, we compute the error :math:`R_i` as the probability mass needed for reaching the true label :math:`Y_i`, i.e. :math:`R_i=\widehat{\pi}_{(1)}(X_i)+\cdots+\widehat{\pi}_{(k)}(X_i)`, where :math:`(k)` is the rank of the true label :math:`Y_i`. + #. Store all errors in a vector :math:`\mathcal{R}`. + +**Inference** + #. Compute the error margin :math:`\delta_{\alpha}` as the :math:`(1-\alpha)(1 + 1/n_{calib})`-th empirical quantile of :math:`\mathcal{R}`. + #. The prediction set for a test point :math:`X_{new}` is defined as + + .. math:: + \widehat{C}_{\alpha}(X_{new})=\big\lbrace + (1),\dots,(k) + \big\rbrace\quad \text{where}\quad + k = \min\big\lbrace i : \widehat{\pi}_{(1)}(X_{new})+\cdots+\widehat{\pi}_{(i)}(X_{new})\geq \delta_\alpha\big\rbrace. + + + -TBC Regularized Adaptive Prediction Sets (RAPS) *******************************************