diff --git a/.pylintrc b/.pylintrc index b840463..ace1a5e 100644 --- a/.pylintrc +++ b/.pylintrc @@ -10,7 +10,10 @@ disable= C0302, # allow too many lines in module C0411, # allow custom import order + E0606, # allow false positive used-before-assignment + R0801, # allow similar lines in 2 files + R0915, # allow too many statements W0105, # allow no effect string statement W0102, # allow dangerous default value [] diff --git a/deel/puncc/api/nonconformity_scores.py b/deel/puncc/api/nonconformity_scores.py index 6c7555c..81971c8 100644 --- a/deel/puncc/api/nonconformity_scores.py +++ b/deel/puncc/api/nonconformity_scores.py @@ -24,7 +24,7 @@ This module provides nonconformity scores for conformal prediction. To be used when building a :ref:`calibrator `. """ -import pkgutil +import importlib from typing import Callable from typing import Iterable @@ -33,13 +33,13 @@ from deel.puncc.api.utils import logit_normalization_check from deel.puncc.api.utils import supported_types_check -if pkgutil.find_loader("pandas") is not None: +if importlib.util.find_spec("pandas") is not None: import pandas as pd -if pkgutil.find_loader("tensorflow") is not None: +if importlib.util.find_spec("tensorflow") is not None: import tensorflow as tf -if pkgutil.find_loader("torch") is not None: +if importlib.util.find_spec("torch") is not None: import torch @@ -176,7 +176,7 @@ def difference(y_pred: Iterable, y_true: Iterable) -> Iterable: """ supported_types_check(y_pred, y_true) - if pkgutil.find_loader("torch") is not None and isinstance( + if importlib.util.find_spec("torch") is not None and isinstance( y_pred, torch.Tensor ): y_pred = y_pred.cpu().detach().numpy() @@ -205,7 +205,9 @@ def absolute_difference(y_pred: Iterable, y_true: Iterable) -> Iterable: return abs(difference(y_pred, y_true)) -def scaled_ad(Y_pred: Iterable, y_true: Iterable, eps: float = 1e-12) -> Iterable: +def scaled_ad( + Y_pred: Iterable, y_true: Iterable, eps: float = 1e-12 +) -> Iterable: """Scaled Absolute Deviation, normalized by an estimation of the conditional mean absolute deviation (conditional MAD). Considering :math:`Y_{\\text{pred}} = (\mu_{\\text{pred}}, \sigma_{\\text{pred}})`: @@ -235,7 +237,7 @@ def scaled_ad(Y_pred: Iterable, y_true: Iterable, eps: float = 1e-12) -> Iterabl if len(y_true.shape) != 1: raise RuntimeError("Each y_true must contain a point observation.") - if pkgutil.find_loader("pandas") is not None and isinstance( + if importlib.util.find_spec("pandas") is not None and isinstance( Y_pred, pd.DataFrame ): y_pred, sigma_pred = Y_pred.iloc[:, 0], Y_pred.iloc[:, 1] @@ -245,8 +247,10 @@ def scaled_ad(Y_pred: Iterable, y_true: Iterable, eps: float = 1e-12) -> Iterabl # MAD then Scaled MAD and computed mean_absolute_deviation = absolute_difference(y_pred, y_true) if np.any(sigma_pred + eps <= 0): - print("Warning: calibration points with MAD predictions" - " below -eps won't be used for calibration.") + print( + "Warning: calibration points with MAD predictions" + " below -eps won't be used for calibration." 
+ ) nonneg = sigma_pred + eps > 0 return mean_absolute_deviation[nonneg] / (sigma_pred[nonneg] + eps) @@ -282,7 +286,7 @@ def cqr_score(Y_pred: Iterable, y_true: Iterable) -> Iterable: if len(y_true.shape) != 1: raise RuntimeError("Each y_pred must contain a point observation.") - if pkgutil.find_loader("pandas") is not None and isinstance( + if importlib.util.find_spec("pandas") is not None and isinstance( Y_pred, pd.DataFrame ): q_lo, q_hi = Y_pred.iloc[:, 0], Y_pred.iloc[:, 1] @@ -295,7 +299,7 @@ def cqr_score(Y_pred: Iterable, y_true: Iterable) -> Iterable: if isinstance(diff_lo, np.ndarray): return np.maximum(diff_lo, diff_hi) - if pkgutil.find_loader("pandas") is not None and isinstance( + if importlib.util.find_spec("pandas") is not None and isinstance( diff_lo, (pd.DataFrame, pd.Series) ): return (pd.concat([diff_lo, diff_hi]).groupby(level=0)).max() @@ -303,12 +307,12 @@ def cqr_score(Y_pred: Iterable, y_true: Iterable) -> Iterable: # "CQR score not implemented for DataFrames. Please provide ndarray or tensors." # ) - if pkgutil.find_loader("tensorflow") is not None and isinstance( + if importlib.util.find_spec("tensorflow") is not None and isinstance( diff_lo, tf.Tensor ): return tf.math.maximum(diff_lo, diff_hi) - # if pkgutil.find_loader("torch") is not None and isinstance( + # if importlib.util.find_spec("torch") is not None and isinstance( # diff_lo, torch.Tensor # ): # return torch.maximum(diff_lo, diff_hi) diff --git a/deel/puncc/api/splitting.py b/deel/puncc/api/splitting.py index bcf65f7..25ff7c7 100644 --- a/deel/puncc/api/splitting.py +++ b/deel/puncc/api/splitting.py @@ -23,7 +23,7 @@ """ This module provides data splitting schemes. """ -import pkgutil +import importlib from abc import ABC from typing import Iterable from typing import List @@ -36,7 +36,7 @@ from deel.puncc.api.utils import sample_len_check from deel.puncc.api.utils import supported_types_check -if pkgutil.find_loader("pandas") is not None: +if importlib.util.find_spec("pandas") is not None: import pandas as pd @@ -171,7 +171,7 @@ def __call__( folds = [] for fit, calib in kfold.split(X): - if pkgutil.find_loader("pandas") is not None and isinstance( + if importlib.util.find_spec("pandas") is not None and isinstance( X, pd.DataFrame ): if isinstance(y, pd.DataFrame): diff --git a/deel/puncc/api/utils.py b/deel/puncc/api/utils.py index e06c01c..5b57695 100644 --- a/deel/puncc/api/utils.py +++ b/deel/puncc/api/utils.py @@ -24,7 +24,7 @@ This module implements utility functions. 
""" import logging -import pkgutil +import importlib import sys from typing import Any from typing import Iterable @@ -34,13 +34,13 @@ import numpy as np -if pkgutil.find_loader("pandas") is not None: +if importlib.util.find_spec("pandas") is not None: import pandas as pd -if pkgutil.find_loader("tensorflow") is not None: +if importlib.util.find_spec("tensorflow") is not None: import tensorflow as tf -if pkgutil.find_loader("torch") is not None: +if importlib.util.find_spec("torch") is not None: import torch logger = logging.getLogger(__name__) @@ -126,15 +126,15 @@ def supported_types_check(*data: Iterable): if isinstance(a, np.ndarray): pass - elif pkgutil.find_loader("pandas") is not None and isinstance( + elif importlib.util.find_spec("pandas") is not None and isinstance( a, (pd.DataFrame, pd.Series) ): pass - elif pkgutil.find_loader("tensorflow") is not None and isinstance( + elif importlib.util.find_spec("tensorflow") is not None and isinstance( a, tf.Tensor ): pass - elif pkgutil.find_loader("torch") is not None and isinstance( + elif importlib.util.find_spec("torch") is not None and isinstance( a, torch.Tensor ): pass @@ -277,15 +277,15 @@ def quantile( if isinstance(a, np.ndarray): pass - elif pkgutil.find_loader("pandas") is not None and isinstance( + elif importlib.util.find_spec("pandas") is not None and isinstance( a, pd.DataFrame ): a = a.to_numpy() - elif pkgutil.find_loader("tensorflow") is not None and isinstance( + elif importlib.util.find_spec("tensorflow") is not None and isinstance( a, tf.Tensor ): a = a.numpy() - # elif pkgutil.find_loader("torch") is not None: + # elif importlib.util.find_spec("torch") is not None: # if isinstance(a, torch.Tensor): # a = a.cpu().detach().numpy() else: diff --git a/deel/puncc/plotting.py b/deel/puncc/plotting.py index b750d9c..09659d8 100644 --- a/deel/puncc/plotting.py +++ b/deel/puncc/plotting.py @@ -155,6 +155,9 @@ def plot_prediction_intervals( """ + # Initialisation + current_rcparams = None + # Figure size configuration if "figsize" in fig_kw.keys(): figsize = fig_kw["figsize"] @@ -256,7 +259,7 @@ def plot_prediction_intervals( ax.set_xlim(X[0] - int_size * 0.01, X[-1] + int_size * 0.01) # restablish rcparams - if restablish_rcparams: + if current_rcparams is not None and restablish_rcparams: matplotlib.rcParams.update(current_rcparams) return ax diff --git a/docs/source/theory_overview.rst b/docs/source/theory_overview.rst index 0af62a8..8fbfd8d 100644 --- a/docs/source/theory_overview.rst +++ b/docs/source/theory_overview.rst @@ -20,26 +20,46 @@ Depending on the application fields of machine learning models, uncertainty can Conformal Prediction -------------------- -Conformal Prediction (CP) is a set of *distribution-free*, *model-agnostic* and -*non-asymptotic* methods to estimate uncertainty by constructing **valid** *prediction sets*, i.e. with guaranteed probability of marginal coverage. +Conformal Prediction (CP) is a set of methods to estimate uncertainty +by constructing by constructing **valid** *prediction sets*, +i.e. prediction sets with a probabilistic guarantee +of marginal coverage. +The following three features make CP methods particularly attractive: + - *Distribution-free*. CP methods can be applied regardless of the underlying data-generating distribution. + - *Model-agnostic*. CP works with any ML model, even with black-box models where we only have access to the outputs of the model. + - *Non-asymptotic*. 
CP methods provide finite-sample probabilistic guarantees, that is, the guarantees hold without the need to assume that the number of available samples grows to infinity. Given an error rate (or significance level) :math:`\alpha \in (0,1)`, set by the user, a set of exchangeable (or more simply i.i.d.) -train data :math:`\{ (X_i, Y_i) \}_{i=1}^{n}` and test point -:math:`(X_{new}, Y_{new})` generated for a joint distribution :math:`\mathbb{P}_{XY}`, -a conformal prediction procedure builds prediction sets :math:`{C}_{\alpha}(\cdot)` so that: +train data :math:`\{ (X_i, Y_i) \}_{i=1}^{n}` and a test point +:math:`(X_{new}, Y_{new})`, +all of which are generated from the same joint distribution :math:`\mathbb{P}_{XY}`, +a conformal prediction procedure uses the training data +to build prediction sets :math:`\widehat{C}_{\alpha}(\cdot)` so that: .. math:: - \mathbb{P} \Big\{ Y_{new} \in {C}_{\alpha}\left(X_{new}\right) \Big\} \geq 1 - \alpha. + \mathbb{P} \Big\{ Y_{new} \in \widehat{C}_{\alpha}\left(X_{new}\right) \Big\} \geq 1 - \alpha. -Over many calibration and test sets, :math:`{C}_{\alpha}(X_{new})` will contain +Over many calibration and test sets, :math:`\widehat{C}_{\alpha}(X_{new})` will contain the observed values of :math:`Y_{new}` with frequency of *at least* :math:`(1-\alpha)`. -Within the conformal prediction framework, the inequality above holds for any model, -any data distribution :math:`\mathbb{P}_{XY}` and any finite sample sizes. +.. + Within the conformal prediction framework, the inequality above holds for any model, + any data distribution :math:`\mathbb{P}_{XY}` and any finite sample sizes. + +Usually, the conformal prediction method uses a point-predictor model :math:`\widehat{f}` +and turns it into the set predictor :math:`\widehat{C}_\alpha` +via a calibration procedure. +Within the conformal prediction framework, +the inequality above holds for any model, +any data distribution :math:`\mathbb{P}_{XY}` and any training set sample size, under the following minimal assumptions: + - *Exchangeability*. The data :math:`(X_1,Y_1),\dots, (X_n, Y_n), (X_{new}, Y_{new})` form an exchangeable sequence (this is a milder assumption than the data being i.i.d.). + - *Independence of train and calibration data.* The data used for model training is independent of the data used for model calibration. + It is noteworthy that the coverage probability is marginalized over :math:`X`. -Therefore, it is likely to undercover conditionally to some specific regions in the space of :math:`X`. +Therefore, the CP algorithm is likely to achieve the coverage rate of :math:`1-\alpha` +by under-covering in some specific regions of the space of :math:`X` and over-covering in other regions. Conformal prediction can act as a *post-processing procedure* to attain rigorous probability coverages, as it can "conformalize" any existing predictor during or after training (black box predictors), @@ -50,7 +70,7 @@ literature used on regression and classification models. We also refer to Angelopoulos and Bates [Angelopoulos2022]_ for a hands-on introduction to conformal prediction and awesome conformal prediction `github `_ for additional ressources. -In the following, let :math:`D_{train} = {(X_i, Y_i)}_{i=1..n_{train}} \sim P_{XY}` +In the following, let :math:`D = {(X_i, Y_i)}_{i=1}^n \sim P_{XY}` be the training data and :math:`\alpha \in (0, 1)` the significance level (target maximum error rate). Conformal Regression -------------------- Split (inductive) Conformal *************************** ..
_theory splitcp: The split (also called inductive) conformal prediction [Papadopoulos2002]_ [Lei2018]_ requires a hold-out calibration -dataset :math:`D_{calibration}` to estimate prediction errors and use them to build the prediction interval for a new sample :math:`X_{new}`. +dataset: the dataset :math:`D` is split into a proper training set +:math:`D_{train}=\big\lbrace(X_i,Y_i), i=1,\dots,n_{train}\big\rbrace` +and an independent calibration dataset :math:`D_{calib}=\big\lbrace(X_i,Y_i),i=1,\dots,n_{calib}\big\rbrace`. +The purpose of the calibration dataset is +to estimate prediction errors and use them to build the prediction interval for a new sample :math:`X_{new}`. Given a prediction model :math:`\widehat{f}` trained on :math:`D_{train}`, the algorithm is summarized in the following: -#. Choose a nonconformity score :math:`s`: :math:`R = s(\widehat{f}(X),Y)`. For example, one can pick the mean absolute deviation :math:`R = |\widehat{f}(X)-Y|`. -#. Compute the nonconformity scores on the calibration dataset: :math:`\bar{R} = \{R_i\}_{}`, for :math:`i=1,\dots,|D_{calibration}|`, where :math:`|D_{calibration}|` is the cardinality of :math:`D_{calibration}`. -#. Compute the error margin :math:`\delta_{\alpha}` as the :math:`(1-\alpha)(1 + \frac{1}{| D_{calibration} |})`-th empirical quantile of :math:`\bar{R}`. -#. Build the prediction interval :math:`\widehat{C}_{\alpha}(X_{new}) = \Big[ \widehat{f}(X_{new}) - \delta_{\alpha}^{f} \,,\, \widehat{f}(X_{new}) + \delta_{\alpha}^{f} \Big]`. +#. Choose a nonconformity score :math:`s`, and define the error :math:`R` over a sample :math:`(X,Y)` as :math:`R = s(\widehat{f}(X),Y)`. For example, one can pick the absolute deviation :math:`R = |\widehat{f}(X)-Y|`. +#. Compute the nonconformity scores on the calibration dataset: :math:`\mathcal{R} = \{R_i\}`, where :math:`R_i=s(\widehat{f}(X_i), Y_i)` for :math:`i=1,\dots,n_{calib}`. +#. Compute the error margin :math:`\delta_{\alpha}` as the :math:`(1-\alpha)(1 + 1/n_{calib})`-th empirical quantile of :math:`\mathcal{R}`. +#. Build the prediction interval as + +.. math:: + + \widehat{C}_{\alpha}(X_{new}) = \Big[ \widehat{f}(X_{new}) - \delta_{\alpha} \,,\, \widehat{f}(X_{new}) + \delta_{\alpha} \Big]. Note that this procedure yields a constant-width prediction interval centered on the point estimate :math:`\widehat{f}(X_{new})`.
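+
+As a purely illustrative sketch (not the puncc API), the margin computation and
+interval construction above could be written with plain NumPy as follows; the
+fitted point predictor ``f`` and the calibration/test arrays are assumptions of
+the example:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def split_conformal_interval(f, X_calib, y_calib, X_new, alpha=0.1):
+        """Sketch of split conformal prediction with absolute-deviation scores."""
+        scores = np.abs(f.predict(X_calib) - y_calib)      # nonconformity scores
+        n_calib = len(scores)
+        # (1 - alpha)(1 + 1/n_calib)-th empirical quantile of the scores
+        level = min(1.0, (1 - alpha) * (1 + 1.0 / n_calib))
+        delta = np.quantile(scores, level, method="higher")
+        y_pred = f.predict(X_new)
+        return y_pred - delta, y_pred + delta
+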
@@ -105,15 +133,22 @@ Conformalized Quantile Regression (CQR) ####################################### .. _theory cqr: -Split conformal prediction can be extended to `quantile predictors `_ :math:`q(\cdot)` -by using the nonconformity score: +Split conformal prediction can be extended to `quantile predictors `_ :math:`q(\cdot)`. +Given a nominal error rate :math:`\alpha,` +and positive error rates :math:`\alpha_{lo}` +and :math:`\alpha_{hi}` +such that :math:`\alpha_{lo}+\alpha_{hi}=\alpha,` +we denote by :math:`\widehat{q}_{\alpha_{lo}}` and +:math:`\widehat{q}_{1-\alpha_{hi}}` +the predictors of the :math:`\alpha_{lo}` *-th* and :math:`(1-\alpha_{hi})` *-th* quantiles of :math:`Y | X.` +The quantile predictors are trained on :math:`D_{train}` +and calibrated on :math:`D_{calib}` +by using the following nonconformity score: .. math:: R_i^{} = \text{max}\{ \widehat{q}_{\alpha_{lo}}(X_i) - Y_i, Y_i - \widehat{q}_{1 - \alpha_{hi}}(X_i)\}, -for :math:`i=1,\dots,|D_{calibration}|`. :math:`\widehat{q}_{\alpha_{lo}}` and :math:`\widehat{q}_{1-\alpha_{hi}}` are -the predictors of the :math:`\alpha_{lo}` *-th* and :math:`(1-\alpha_{hi})` *-th* quantiles of :math:`Y | X`, respectively. For example, if we set :math:`\alpha = 0.1`, we would fit two predictors :math:`\widehat{q}_{0.05}(\cdot)` and :math:`\widehat{q}_{0.95}(\cdot)` on training data :math:`D_{train}` and compute the scores on :math:`D_{calibration}`. @@ -129,7 +164,7 @@ The procedure, named *Conformalized Quantile Regression* [Romano2019]_, yields t When data are exchangeable, the correction margin :math:`\delta_{\alpha}` guarantees finite-sample marginal coverage for the quantile predictions, and this holds also for misspecified (i.e. "bad") predictors. -If the fitted :math:`\widehat{q}_{\alpha_{lo}}` and :math:`\widehat{q}_{1-\alpha_{hi}}` approximate (empirically) well the conditional distribution :math:`Y | X` of the data, we will get a small margin :math:`\delta_{\alpha}`: this means that on average, the prediction errors on the :math:`D_{calibration}` were small. +If the fitted :math:`\widehat{q}_{\alpha_{lo}}` and :math:`\widehat{q}_{1-\alpha_{hi}}` approximate (empirically) well the conditional distribution :math:`Y | X` of the data, we will get a small margin :math:`\delta_{\alpha}`: this means that on average, the prediction errors on the :math:`D_{calib}` were small. Also, if the base predictors have strong theoretical properties, our CP procedure inherits these properties of :math:`\widehat{q}_{}(\cdot)`. We could have an asymptotically, conditionally accurate predictor and also have a theoretically valid, distribution-free guarantee on the marginal coverage!
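+
+Purely as an illustration (and not the puncc API), the CQR calibration and
+interval construction could be sketched with NumPy as follows; the fitted
+quantile regressors ``q_lo`` and ``q_hi`` are assumptions of the example:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def cqr_interval(q_lo, q_hi, X_calib, y_calib, X_new, alpha=0.1):
+        """Sketch of Conformalized Quantile Regression (CQR)."""
+        lo, hi = q_lo.predict(X_calib), q_hi.predict(X_calib)
+        # CQR nonconformity scores: max(q_lo(X) - Y, Y - q_hi(X))
+        scores = np.maximum(lo - y_calib, y_calib - hi)
+        n_calib = len(scores)
+        level = min(1.0, (1 - alpha) * (1 + 1.0 / n_calib))
+        delta = np.quantile(scores, level, method="higher")
+        return q_lo.predict(X_new) - delta, q_hi.predict(X_new) + delta
+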
@@ -175,10 +210,10 @@ If :math:`K = n`, we obtain the *Jackknife+*, **leave-one-out** version of the a The lower and upper bounds of the prediction interval are given by: - 1. Compute :math:`\bar{R}_{L} = \{ \widehat{f}_{-S_{k(i)}}(X_{new}) - R_i^{CV} \}_{i=1}^{n}` - 2. :math:`\widehat{L}_{\alpha}(X_{new}) = \lfloor \alpha (n+1) \rfloor`-th smallest value in :math:`\bar{R}_{L}` (lower bound) - 3. Compute :math:`\bar{R}_{U} = \{ \widehat{f}_{-S_{k(i)}}(X_{new}) + R_i^{CV} \}_{i=1}^{n}` - 4. :math:`\widehat{U}_{\alpha}(X_{new}) = \lceil (1-\alpha) (n+1) \rceil`-th smallest value in :math:`\bar{R}_{U}` (upper bound) + #. Compute :math:`\bar{R}_{L} = \{ \widehat{f}_{-S_{k(i)}}(X_{new}) - R_i^{CV} \}_{i=1}^{n}` + #. :math:`\widehat{L}_{\alpha}(X_{new}) = \lfloor \alpha (n+1) \rfloor`-th smallest value in :math:`\bar{R}_{L}` (lower bound) + #. Compute :math:`\bar{R}_{U} = \{ \widehat{f}_{-S_{k(i)}}(X_{new}) + R_i^{CV} \}_{i=1}^{n}` + #. :math:`\widehat{U}_{\alpha}(X_{new}) = \lceil (1-\alpha) (n+1) \rceil`-th smallest value in :math:`\bar{R}_{U}` (upper bound) .. math:: @@ -190,9 +225,63 @@ Ensemble Batch Prediction Intervals (EnbPI) ******************************************* .. _theory enbpi: -Source: [Xu2021]_ -TBC +Introduced in [Xu2021]_, +the EnbPI algorithm builds prediction intervals +for time series data of the form +:math:`Y_t = f(X_t) + \epsilon_t`, +where :math:`\epsilon_t` are identically distributed, +but not necessarily independent. +Given a training data set :math:`D=\lbrace (X_i, Y_i) \rbrace_{i=1}^n` +and a test set :math:`D_{test} = \lbrace (X_t,Y_t) \rbrace_{t=n+1}^{n+n_{test}}`, +the EnbPI algorithm aims at constructing prediction sets +for each test point :math:`X_t`. +As with the CV+ or Jackknife+ methods, +the EnbPI algorithm does not require a held-out calibration set, +as it uses a bootstrap algorithm instead. +Let :math:`\mathcal{A}` be a training algorithm +(i.e. an algorithm that maps a dataset to a predictor), +and :math:`\phi` an aggregation function +that aggregates different individual models together, +e.g. via a simple average, a bagging or an ensembling method. +The algorithm EnbPI is performed in three stages: + +**Training** + #. Sample :math:`B` bootstrap data sets :math:`S_b`, for :math:`b=1,\dots, B` with replacement from :math:`D`. + #. Train :math:`B` bootstrap models :math:`\widehat{f}^b = \mathcal{A}(S_b)`. + +**Calibration** + #. Compute the predictions on each training sample :math:`X_i\in D`. Only the models :math:`\widehat{f}^b` where :math:`X_i\not\in S_b` are used in the aggregation: :math:`\widehat{f}_{-i}(X_i):=\phi\big( \lbrace \widehat{f}^b(X_i) | X_i\not\in S_b\rbrace\big)`. + #. Compute the errors :math:`R_i=|Y_i-\widehat{f}_{-i}(X_i)|`, and store them as :math:`\mathcal{R}_1:=\lbrace R_i,i=1,\dots, n\rbrace`. + +**Inference** + #. Compute the predictions on each test sample :math:`X_t\in D_{test}` by setting :math:`\widehat{f}_{-t}(X_t):= \frac{1}{n}\sum_{i=1}^n \widehat{f}_{-i}(X_t)`. + #. Update the error set :math:`\mathcal{R}_t` (see below). + #. Compute the width of the prediction intervals :math:`\delta_{\alpha, t}` as the :math:`(1-\alpha)`-th empirical quantile of :math:`\mathcal{R}_t`. + + +The prediction interval for :math:`X_t` is then given by + +.. math:: + + \widehat{C}_{\alpha}(X_t) = \big[ \widehat{f}_{-t}(X_t)-\delta_{\alpha, t}, \widehat{f}_{-t}(X_t)+\delta_{\alpha, t}\big]. + +In order to update the error set :math:`\mathcal{R}_t`, +a *memory* parameter :math:`s` is employed. +Every :math:`s` test examples, the first :math:`s` errors in the set +:math:`\mathcal{R}` are dropped and the errors over the last :math:`s` +test examples are added to the error set :math:`\mathcal{R}`. +That is, if :math:`t-n \equiv 0 \pmod{s}` then :math:`\mathcal{R}_t = \lbrace R_i, i=t-n,\dots,t-1\rbrace` +and if :math:`t-n \not\equiv 0 \pmod{s}` then :math:`\mathcal{R}_t=\mathcal{R}_{t-1}`. + + +.. note:: + + The EnbPI algorithm does not provide an exact probabilistic guarantee, unlike the previous CP methods. + The guarantee provided by the EnbPI algorithm is only approximate, + and holds under additional assumptions on the error process + :math:`\epsilon_t`. However, it does not require the data to be exchangeable. + .. Introduced in [Xu2021]_, the EnbPI algorithms builds prediction intervals for time series data of the form :math:`Y_t = f(X_t) + \epsilon_t`, where :math:`\epsilon_t` are identically distributed. .. Unlike the proper conformal algorithms seen above, EnbPI requires some additional hypothesis to attain the coverage guarantee.
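+
+The training and calibration stages described above can be sketched as follows;
+this is an illustrative example rather than the puncc implementation, and the
+model factory ``make_model`` (returning a fresh, fittable regressor) is an
+assumption of the sketch:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def enbpi_fit(make_model, X, y, B=30, seed=0):
+        """Sketch of the EnbPI training and calibration stages."""
+        rng = np.random.default_rng(seed)
+        n = len(y)
+        models, in_bag = [], []
+        for _ in range(B):
+            idx = rng.integers(0, n, size=n)             # bootstrap with replacement
+            models.append(make_model().fit(X[idx], y[idx]))
+            in_bag.append(np.isin(np.arange(n), idx))    # samples seen by this model
+        residuals = np.empty(n)
+        for i in range(n):
+            # aggregate only the models that did not see (X_i, Y_i);
+            # with B large enough, every i is out-of-bag for at least one model
+            preds = [m.predict(X[i:i + 1])[0]
+                     for m, bag in zip(models, in_bag) if not bag[i]]
+            residuals[i] = abs(y[i] - np.mean(preds))
+        return models, residuals
+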
@@ -226,9 +315,41 @@ Adaptive Prediction Sets (APS) ******************************************* .. _theory aps: -Source: [Romano2020]_ +As with the Split Conformal Regression algorithm, +the APS algorithm introduced in [Romano2020]_ +requires us to split the data set :math:`D` into a proper training set :math:`D_{train}` +and an independent calibration set :math:`D_{calib}`. +A classifier :math:`\widehat{\pi}` is trained +using the proper training set :math:`D_{train}` only. +We assume that the output of the classifier is given by the softmax scores for the different classes. +That is, for each input :math:`x`, +the output :math:`\widehat{\pi}(x)=(\widehat{\pi}_1(x),\dots,\widehat{\pi}_K(x))` +is a probability vector and :math:`k=1,\dots, K` +indexes the possible classes in the classification task. + We represent by :math:`\widehat{\pi}_{(1)}(x)\geq \cdots\geq \widehat{\pi}_{(K)}(x)` + the softmax vector :math:`\widehat{\pi}` arranged in decreasing order, + i.e. :math:`(k)` is the index of the class having the :math:`k`-th largest probability mass. + +In order to construct the prediction sets :math:`\widehat{C}_\alpha`, +the APS algorithm works in two stages: + +**Calibration** + #. For each example :math:`X_i` in the calibration data set, we compute the error :math:`R_i` as the probability mass needed for reaching the true label :math:`Y_i`, i.e. :math:`R_i=\widehat{\pi}_{(1)}(X_i)+\cdots+\widehat{\pi}_{(k)}(X_i)`, where :math:`(k)` is the rank of the true label :math:`Y_i`. + #. Store all errors in a vector :math:`\mathcal{R}`. + +**Inference** + #. Compute the error margin :math:`\delta_{\alpha}` as the :math:`(1-\alpha)(1 + 1/n_{calib})`-th empirical quantile of :math:`\mathcal{R}`. + #. The prediction set for a test point :math:`X_{new}` is defined as + + .. math:: + \widehat{C}_{\alpha}(X_{new})=\big\lbrace + (1),\dots,(k) + \big\rbrace\quad \text{where}\quad + k = \min\big\lbrace i : \widehat{\pi}_{(1)}(X_{new})+\cdots+\widehat{\pi}_{(i)}(X_{new})\geq \delta_\alpha\big\rbrace. + + + -TBC Regularized Adaptive Prediction Sets (RAPS) *******************************************