Merge branch 'pymc-labs:main' into pareto_covar

pymc-labs · Feb 9, 2024 · a63b0bd · a63b0bd
2 parents fd3d148 + a224cfe
commit a63b0bd
Show file tree

Hide file tree

Showing 8 changed files with 215 additions and 40 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -12,7 +12,7 @@ ci:
 
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.14
+    rev: v0.2.0
     hooks:
       - id: ruff
         args: ["--fix", "--show-source"]

diff --git a/pymc_marketing/clv/utils.py b/pymc_marketing/clv/utils.py
@@ -97,9 +97,31 @@ def _squeeze_dims(x: xarray.DataArray):
 
     clv = xarray.DataArray(0.0)
 
+    # FIXME: This is a hotfix for ParetoNBDModel, as it has a different API from BetaGeoModel
+    #  We should harmonize them!
+    from pymc_marketing.clv.models import ParetoNBDModel
+
+    if isinstance(transaction_model, ParetoNBDModel):
+        transaction_data = pd.DataFrame(
+            {
+                "customer_id": customer_id,
+                "frequency": frequency,
+                "recency": recency,
+                "T": T,
+            }
+        )
+
+        def expected_purchases(*, t, **kwargs):
+            return transaction_model.expected_purchases(
+                future_t=t,
+                data=transaction_data,
+            )
+    else:
+        expected_purchases = transaction_model.expected_num_purchases
+
     # TODO: Vectorize computation so that we perform a single call to expected_num_purchases
     prev_expected_num_purchases = _squeeze_dims(
-        transaction_model.expected_num_purchases(
+        expected_purchases(
             customer_id=customer_id,
             frequency=frequency,
             recency=recency,
@@ -110,7 +132,7 @@ def _squeeze_dims(x: xarray.DataArray):
     for i in steps * factor:
         # since the prediction of number of transactions is cumulative, we have to subtract off the previous periods
         new_expected_num_purchases = _squeeze_dims(
-            transaction_model.expected_num_purchases(
+            expected_purchases(
                 customer_id=customer_id,
                 frequency=frequency,
                 recency=recency,

diff --git a/pymc_marketing/mmm/transformers.py b/pymc_marketing/mmm/transformers.py
@@ -413,6 +413,60 @@ def logistic_saturation(x, lam: Union[npt.NDArray[np.float_], float] = 0.5):
     return (1 - pt.exp(-lam * x)) / (1 + pt.exp(-lam * x))
 
 
+def scale_preserving_logistic_saturation(x, m: Union[npt.NDArray[np.float_], float]):
+    """Scale preserving logistic transformation.
+
+    This single-parameter saturation function maps its input to the range
+    :math:`[0, m]`, where :math:`f(x) \leq x` for all :math:`x`. It can be interpreted as mapping
+    from spend to "effective spend".
+
+    Properties:
+
+    * Output is on scale with input values. Simplifies prior specification.
+    * Intuitive parameter interpretation: `m` is the "maximum achievable effect".
+    * Slope of curve never exceeds 1. Thus "effective spend" <= "actual spend".
+
+    .. math::
+        f(x) = m \\left( \\frac{1 - e^{-\\frac{2}{m} x}}{1 + e^{-\\frac{2}{m} x}} \\right)
+
+    .. plot::
+        :context: close-figs
+
+        import matplotlib.pyplot as plt
+        import numpy as np
+        import arviz as az
+        from pymc_marketing.mmm.transformers import scale_preserving_logistic_saturation
+        plt.style.use('arviz-darkgrid')
+        m = np.array([500, 1000, 2000, 4000, 8000])
+        x = np.linspace(0, 10000, 400)
+        ax = plt.subplot(111)
+        for l in m:
+            y = scale_preserving_logistic_saturation(x, m=l).eval()
+            plt.plot(x, y, label=f'm = {l}')
+        plt.xlabel('x', fontsize=12)
+        plt.ylabel('f(x)', fontsize=12)
+        box = ax.get_position()
+        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+        plt.show()
+
+    Parameters
+    ----------
+    x : tensor
+        Input tensor.
+    m : float or array-like
+        Saturation parameter, characterizing the asymptote of the function value.
+
+    Returns
+    -------
+    tensor
+        Transformed tensor.
+    """  # noqa: W605
+    if m == 0:
+        return pt.zeros_like(x)
+    return m * (1 - pt.exp(-2 / m * x)) / (1 + pt.exp(-2 / m * x))
+
+
 class TanhSaturationParameters(NamedTuple):
     b: pt.TensorLike
     """Saturation.

diff --git a/pymc_marketing/version.txt b/pymc_marketing/version.txt
@@ -1 +1 @@
-0.3.3
+0.4.0
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,7 @@ maintainers = [{ name = "PyMC Labs", email = "[email protected]" }]
 dependencies = [
     "arviz>=0.13.0",
     "matplotlib>=3.5.1",
-    "numpy>=1.17,<1.25",
+    "numpy>=1.17",
     "pandas",
     # NOTE: Keep minimum pymc version in sync with ci.yml `OLDEST_PYMC_VERSION`
     "pymc>=5.8.2",

diff --git a/tests/clv/test_utils.py b/tests/clv/test_utils.py
@@ -7,7 +7,7 @@
 import xarray
 from pandas.testing import assert_frame_equal
 
-from pymc_marketing.clv import BetaGeoModel, GammaGammaModel
+from pymc_marketing.clv import BetaGeoModel, GammaGammaModel, ParetoNBDModel
 from pymc_marketing.clv.utils import (
     _find_first_transactions,
     clv_summary,
@@ -45,6 +45,7 @@ def test_summary_data() -> pd.DataFrame:
     rng = np.random.default_rng(14)
     df = pd.read_csv("tests/clv/datasets/test_summary_data.csv", index_col=0)
     df["monetary_value"] = rng.lognormal(size=(len(df)))
+    df["customer_id"] = df.index
     return df
 
 
@@ -82,6 +83,35 @@ def fitted_bg(test_summary_data) -> BetaGeoModel:
     return model
 
 
+@pytest.fixture(scope="module")
+def fitted_pnbd(test_summary_data) -> ParetoNBDModel:
+    rng = np.random.default_rng(45)
+
+    model_config = {
+        # Narrow Gaussian centered at MLE params from lifetimes ParetoNBDFitter
+        "r_prior": {"dist": "DiracDelta", "kwargs": {"c": 0.5534}},
+        "alpha_prior": {"dist": "DiracDelta", "kwargs": {"c": 10.5802}},
+        "s_prior": {"dist": "DiracDelta", "kwargs": {"c": 0.6061}},
+        "beta_prior": {"dist": "DiracDelta", "kwargs": {"c": 11.6562}},
+    }
+    pnbd_model = ParetoNBDModel(
+        data=test_summary_data,
+        model_config=model_config,
+    )
+    pnbd_model.build_model()
+
+    # Mock an idata object for tests requiring a fitted model
+    fake_fit = pm.sample_prior_predictive(
+        samples=50, model=pnbd_model.model, random_seed=rng
+    )
+    fake_fit.add_groups(dict(posterior=fake_fit.prior))
+    pnbd_model.idata = fake_fit
+    pnbd_model.set_idata_attrs(pnbd_model.idata)
+    pnbd_model._add_fit_data_group(pnbd_model.data)
+
+    return pnbd_model
+
+
 @pytest.fixture(scope="module")
 def fitted_gg(test_summary_data) -> GammaGammaModel:
     rng = np.random.default_rng(40)
@@ -136,7 +166,7 @@ def transaction_data() -> pd.DataFrame:
 
 
 class TestCustomerLifetimeValue:
-    def test_customer_lifetime_value_with_known_values(
+    def test_customer_lifetime_value_bg_with_known_values(
         self, test_summary_data, fitted_bg
     ):
         # Test borrowed from
@@ -159,6 +189,7 @@ def test_customer_lifetime_value_with_known_values(
             discount_rate=0.0,
         ).mean(("chain", "draw"))
         np.testing.assert_almost_equal(clv_d0, expected, decimal=5)
+
         # discount_rate=1 means the clv will halve over a period
         clv_d1 = (
             customer_lifetime_value(
@@ -210,23 +241,25 @@ def test_customer_lifetime_value_with_known_values(
         )
         np.testing.assert_allclose(clv_t2_d1, expected / 2.0 + expected / 4.0, rtol=0.1)
 
-    def test_customer_lifetime_value_gg_with_bgf(
-        self, test_summary_data, fitted_gg, fitted_bg
+    @pytest.mark.parametrize("transaction_model", ("fitted_bg", "fitted_pnbd"))
+    def test_customer_lifetime_value_as_gg_method(
+        self, request, test_summary_data, fitted_gg, transaction_model
     ):
+        transaction_model = request.getfixturevalue(transaction_model)
         t = test_summary_data.head()
 
         ggf_clv = fitted_gg.expected_customer_lifetime_value(
-            transaction_model=fitted_bg,
-            customer_id=t.index,
+            transaction_model=transaction_model,
+            customer_id=t["customer_id"],
             frequency=t["frequency"],
             recency=t["recency"],
             T=t["T"],
             mean_transaction_value=t["monetary_value"],
         )
 
         utils_clv = customer_lifetime_value(
-            transaction_model=fitted_bg,
-            customer_id=t.index,
+            transaction_model=transaction_model,
+            customer_id=t["customer_id"],
             frequency=t["frequency"],
             recency=t["recency"],
             T=t["T"],
@@ -238,36 +271,40 @@ def test_customer_lifetime_value_gg_with_bgf(
         )
         np.testing.assert_equal(ggf_clv.values, utils_clv.values)
 
-    @pytest.mark.parametrize("bg_map", (True, False))
     @pytest.mark.parametrize("gg_map", (True, False))
+    @pytest.mark.parametrize("transaction_model_map", (True, False))
+    @pytest.mark.parametrize("transaction_model", ("fitted_bg", "fitted_pnbd"))
     def test_map_posterior_mix_fit_types(
-        self, fitted_bg, fitted_gg, test_summary_data, bg_map, gg_map
+        self,
+        request,
+        test_summary_data,
+        fitted_gg,
+        gg_map,
+        transaction_model_map,
+        transaction_model,
     ):
         """Test we can mix a model that was fit with MAP and one that was fit with sample."""
+        transaction_model = request.getfixturevalue(transaction_model)
         t = test_summary_data.head()
 
         # Copy model with thinned chain/draw as would be obtained from MAP
-        if bg_map:
-            bg = BetaGeoModel(fitted_bg.data, model_config=fitted_bg.model_config)
-            bg.build_model()
-            bg.idata = fitted_bg.idata.sel(chain=slice(0, 1), draw=slice(0, 1))
-        else:
-            bg = fitted_bg
+        if transaction_model_map:
+            transaction_model = transaction_model._build_with_idata(
+                transaction_model.idata.sel(chain=slice(0, 1), draw=slice(0, 1))
+            )
 
         if gg_map:
-            gg = GammaGammaModel(fitted_gg.data, model_config=fitted_gg.model_config)
-            gg.build_model()
-            gg.idata = fitted_gg.idata.sel(chain=slice(0, 1), draw=slice(0, 1))
-        else:
-            gg = fitted_gg
+            fitted_gg = fitted_gg._build_with_idata(
+                fitted_gg.idata.sel(chain=slice(0, 1), draw=slice(0, 1))
+            )
 
         res = customer_lifetime_value(
-            transaction_model=bg,
-            customer_id=t.index,
+            transaction_model=transaction_model,
+            customer_id=t["customer_id"],
             frequency=t["frequency"],
             recency=t["recency"],
             T=t["T"],
-            monetary_value=gg.expected_customer_spend(
+            monetary_value=fitted_gg.expected_customer_spend(
                 t.index,
                 mean_transaction_value=t["monetary_value"],
                 frequency=t["frequency"],
@@ -276,11 +313,19 @@ def test_map_posterior_mix_fit_types(
 
         assert res.dims == ("chain", "draw", "customer_id")
 
-    def test_clv_after_thinning(self, test_summary_data, fitted_gg, fitted_bg):
+    @pytest.mark.parametrize("transaction_model", ("fitted_bg", "fitted_pnbd"))
+    def test_clv_after_thinning(
+        self,
+        request,
+        test_summary_data,
+        fitted_gg,
+        transaction_model,
+    ):
+        transaction_model = request.getfixturevalue(transaction_model)
         t = test_summary_data.head()
 
         ggf_clv = fitted_gg.expected_customer_lifetime_value(
-            transaction_model=fitted_bg,
+            transaction_model=transaction_model,
             customer_id=t.index,
             frequency=t["frequency"],
             recency=t["recency"],
@@ -289,9 +334,9 @@ def test_clv_after_thinning(self, test_summary_data, fitted_gg, fitted_bg):
         )
 
         fitted_gg_thinned = fitted_gg.thin_fit_result(keep_every=10)
-        fitted_bg_thinned = fitted_bg.thin_fit_result(keep_every=10)
+        transaction_model_thinned = transaction_model.thin_fit_result(keep_every=10)
         ggf_clv_thinned = fitted_gg_thinned.expected_customer_lifetime_value(
-            transaction_model=fitted_bg_thinned,
+            transaction_model=transaction_model_thinned,
             customer_id=t.index,
             frequency=t["frequency"],
             recency=t["recency"],

diff --git a/tests/mmm/test_delayed_saturated_mmm.py b/tests/mmm/test_delayed_saturated_mmm.py
@@ -17,7 +17,7 @@
 rng: np.random.Generator = np.random.default_rng(seed=seed)
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def generate_data():
     def _generate_data(date_data: pd.DatetimeIndex) -> pd.DataFrame:
         n: int = date_data.size
@@ -37,7 +37,7 @@ def _generate_data(date_data: pd.DatetimeIndex) -> pd.DataFrame:
     return _generate_data
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def toy_X(generate_data) -> pd.DataFrame:
     date_data: pd.DatetimeIndex = pd.date_range(
         start="2019-06-01", end="2021-12-31", freq="W-MON"
@@ -80,12 +80,12 @@ def model_config_requiring_serialization() -> Dict:
     return model_config
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def toy_y(toy_X: pd.DataFrame) -> pd.Series:
     return pd.Series(data=rng.integers(low=0, high=100, size=toy_X.shape[0]))
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def mmm() -> DelayedSaturatedMMM:
     return DelayedSaturatedMMM(
         date_column="date",
@@ -95,7 +95,7 @@ def mmm() -> DelayedSaturatedMMM:
     )
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def mmm_with_fourier_features() -> DelayedSaturatedMMM:
     return DelayedSaturatedMMM(
         date_column="date",
@@ -106,15 +106,15 @@ def mmm_with_fourier_features() -> DelayedSaturatedMMM:
     )
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def mmm_fitted(
     mmm: DelayedSaturatedMMM, toy_X: pd.DataFrame, toy_y: pd.Series
 ) -> DelayedSaturatedMMM:
     mmm.fit(X=toy_X, y=toy_y, target_accept=0.8, draws=3, chains=2)
     return mmm
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def mmm_fitted_with_fourier_features(
     mmm_with_fourier_features: DelayedSaturatedMMM,
     toy_X: pd.DataFrame,