diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 96172c13..1bf0963b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,7 +12,7 @@ ci:
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.14
+    rev: v0.2.0
     hooks:
       - id: ruff
         args: ["--fix", "--show-source"]
diff --git a/pymc_marketing/clv/utils.py b/pymc_marketing/clv/utils.py
index 806a1f31..307b82d3 100644
--- a/pymc_marketing/clv/utils.py
+++ b/pymc_marketing/clv/utils.py
@@ -97,9 +97,31 @@ def _squeeze_dims(x: xarray.DataArray):
 
     clv = xarray.DataArray(0.0)
 
+    # FIXME: This is a hotfix for ParetoNBDModel, as it has a different API from BetaGeoModel.
+    # We should harmonize them!
+    from pymc_marketing.clv.models import ParetoNBDModel
+
+    if isinstance(transaction_model, ParetoNBDModel):
+        transaction_data = pd.DataFrame(
+            {
+                "customer_id": customer_id,
+                "frequency": frequency,
+                "recency": recency,
+                "T": T,
+            }
+        )
+
+        def expected_purchases(*, t, **kwargs):
+            return transaction_model.expected_purchases(
+                future_t=t,
+                data=transaction_data,
+            )
+    else:
+        expected_purchases = transaction_model.expected_num_purchases
+
     # TODO: Vectorize computation so that we perform a single call to expected_num_purchases
     prev_expected_num_purchases = _squeeze_dims(
-        transaction_model.expected_num_purchases(
+        expected_purchases(
             customer_id=customer_id,
             frequency=frequency,
             recency=recency,
@@ -110,7 +132,7 @@ def _squeeze_dims(x: xarray.DataArray):
     for i in steps * factor:
         # since the prediction of number of transactions is cumulative, we have to subtract off the previous periods
         new_expected_num_purchases = _squeeze_dims(
-            transaction_model.expected_num_purchases(
+            expected_purchases(
                 customer_id=customer_id,
                 frequency=frequency,
                 recency=recency,
diff --git a/pymc_marketing/mmm/transformers.py b/pymc_marketing/mmm/transformers.py
index ff8001af..04f3d2f0 100644
--- a/pymc_marketing/mmm/transformers.py
+++ b/pymc_marketing/mmm/transformers.py
@@ -413,6 +413,60 @@ def logistic_saturation(x, lam: Union[npt.NDArray[np.float_], float] = 0.5):
     return (1 - pt.exp(-lam * x)) / (1 + pt.exp(-lam * x))
 
 
+def scale_preserving_logistic_saturation(x, m: Union[npt.NDArray[np.float_], float]):
+    """Scale-preserving logistic saturation transformation.
+
+    This single-parameter saturation function maps its input to the range
+    :math:`[0, m]`, where :math:`f(x) \\leq x` for all :math:`x`. It can be
+    interpreted as mapping from spend to "effective spend".
+
+    Properties:
+
+    * Output is on the same scale as the input values, which simplifies prior specification.
+    * Intuitive parameter interpretation: `m` is the "maximum achievable effect".
+    * The slope of the curve never exceeds 1, so "effective spend" <= "actual spend".
+
+    .. math::
+        f(x) = m \\left( \\frac{1 - e^{-\\frac{2}{m} x}}{1 + e^{-\\frac{2}{m} x}} \\right)
+
+    .. plot::
+        :context: close-figs
+
+        import matplotlib.pyplot as plt
+        import numpy as np
+        import arviz as az
+        from pymc_marketing.mmm.transformers import scale_preserving_logistic_saturation
+        plt.style.use('arviz-darkgrid')
+        m = np.array([500, 1000, 2000, 4000, 8000])
+        x = np.linspace(0, 10000, 400)
+        ax = plt.subplot(111)
+        for m_i in m:
+            y = scale_preserving_logistic_saturation(x, m=m_i).eval()
+            plt.plot(x, y, label=f'm = {m_i}')
+        plt.xlabel('x', fontsize=12)
+        plt.ylabel('f(x)', fontsize=12)
+        box = ax.get_position()
+        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+        plt.show()
+
+    Parameters
+    ----------
+    x : tensor
+        Input tensor.
+    m : float or array-like
+        Saturation parameter, characterizing the asymptote (maximum value) of the function.
+
+    Returns
+    -------
+    tensor
+        Transformed tensor.
+    """  # noqa: W605
+    if m == 0:  # in the limit m -> 0, f(x) -> 0 everywhere
+        return pt.zeros_like(x)
+    return m * (1 - pt.exp(-2 / m * x)) / (1 + pt.exp(-2 / m * x))
+
+
 class TanhSaturationParameters(NamedTuple):
     b: pt.TensorLike
     """Saturation.
diff --git a/pymc_marketing/version.txt b/pymc_marketing/version.txt
index 1c09c74e..1d0ba9ea 100644
--- a/pymc_marketing/version.txt
+++ b/pymc_marketing/version.txt
@@ -1 +1 @@
-0.3.3
+0.4.0
diff --git a/pyproject.toml b/pyproject.toml
index 7bd7f4e6..7b0aed4f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ maintainers = [{ name = "PyMC Labs", email = "info@pymc-labs.io" }]
 dependencies = [
     "arviz>=0.13.0",
     "matplotlib>=3.5.1",
-    "numpy>=1.17,<1.25",
+    "numpy>=1.17",
     "pandas",
     # NOTE: Keep minimum pymc version in sync with ci.yml `OLDEST_PYMC_VERSION`
     "pymc>=5.8.2",
diff --git a/tests/clv/test_utils.py b/tests/clv/test_utils.py
index d41eae18..0ef2fd7a 100644
--- a/tests/clv/test_utils.py
+++ b/tests/clv/test_utils.py
@@ -7,7 +7,7 @@
 import xarray
 from pandas.testing import assert_frame_equal
 
-from pymc_marketing.clv import BetaGeoModel, GammaGammaModel
+from pymc_marketing.clv import BetaGeoModel, GammaGammaModel, ParetoNBDModel
 from pymc_marketing.clv.utils import (
     _find_first_transactions,
     clv_summary,
@@ -45,6 +45,7 @@ def test_summary_data() -> pd.DataFrame:
     rng = np.random.default_rng(14)
     df = pd.read_csv("tests/clv/datasets/test_summary_data.csv", index_col=0)
     df["monetary_value"] = rng.lognormal(size=(len(df)))
+    df["customer_id"] = df.index
     return df
 
 
@@ -82,6 +83,35 @@ def fitted_bg(test_summary_data) -> BetaGeoModel:
     return model
 
 
+@pytest.fixture(scope="module")
+def fitted_pnbd(test_summary_data) -> ParetoNBDModel:
+    rng = np.random.default_rng(45)
+
+    model_config = {
+        # DiracDelta point-mass priors at the MLE params from the lifetimes ParetoNBDFitter
+        "r_prior": {"dist": "DiracDelta", "kwargs": {"c": 0.5534}},
+        "alpha_prior": {"dist": "DiracDelta", "kwargs": {"c": 10.5802}},
+        "s_prior": {"dist": "DiracDelta", "kwargs": {"c": 0.6061}},
+        "beta_prior": {"dist": "DiracDelta", "kwargs": {"c": 11.6562}},
+    }
+    pnbd_model = ParetoNBDModel(
+        data=test_summary_data,
+        model_config=model_config,
+    )
+    pnbd_model.build_model()
+
+    # Mock an idata object for tests requiring a fitted model
+    fake_fit = pm.sample_prior_predictive(
+        samples=50, model=pnbd_model.model, random_seed=rng
+    )
+    fake_fit.add_groups(dict(posterior=fake_fit.prior))
+    pnbd_model.idata = fake_fit
+    pnbd_model.set_idata_attrs(pnbd_model.idata)
+    pnbd_model._add_fit_data_group(pnbd_model.data)
+
+    return pnbd_model
+
+
 @pytest.fixture(scope="module")
 def fitted_gg(test_summary_data) -> GammaGammaModel:
     rng = np.random.default_rng(40)
@@ -136,7 +166,7 @@ def transaction_data() -> pd.DataFrame:
 
 
 class TestCustomerLifetimeValue:
-    def test_customer_lifetime_value_with_known_values(
+    def test_customer_lifetime_value_bg_with_known_values(
         self, test_summary_data, fitted_bg
     ):
         # Test borrowed from
@@ -159,6 +189,7 @@
             discount_rate=0.0,
         ).mean(("chain", "draw"))
         np.testing.assert_almost_equal(clv_d0, expected, decimal=5)
+
         # discount_rate=1 means the clv will halve over a period
         clv_d1 = (
             customer_lifetime_value(
@@ -210,14 +241,16 @@
         )
         np.testing.assert_allclose(clv_t2_d1, expected / 2.0 + expected / 4.0, rtol=0.1)
 
-    def test_customer_lifetime_value_gg_with_bgf(
-        self, test_summary_data, fitted_gg, fitted_bg
+    @pytest.mark.parametrize("transaction_model", ("fitted_bg", "fitted_pnbd"))
+    def test_customer_lifetime_value_as_gg_method(
+        self, request, test_summary_data, fitted_gg, transaction_model
     ):
+        transaction_model = request.getfixturevalue(transaction_model)
         t = test_summary_data.head()
 
         ggf_clv = fitted_gg.expected_customer_lifetime_value(
-            transaction_model=fitted_bg,
-            customer_id=t.index,
+            transaction_model=transaction_model,
+            customer_id=t["customer_id"],
             frequency=t["frequency"],
             recency=t["recency"],
             T=t["T"],
@@ -225,8 +258,8 @@
         )
 
         utils_clv = customer_lifetime_value(
-            transaction_model=fitted_bg,
-            customer_id=t.index,
+            transaction_model=transaction_model,
+            customer_id=t["customer_id"],
             frequency=t["frequency"],
             recency=t["recency"],
             T=t["T"],
@@ -238,36 +271,40 @@
         )
         np.testing.assert_equal(ggf_clv.values, utils_clv.values)
 
-    @pytest.mark.parametrize("bg_map", (True, False))
     @pytest.mark.parametrize("gg_map", (True, False))
+    @pytest.mark.parametrize("transaction_model_map", (True, False))
+    @pytest.mark.parametrize("transaction_model", ("fitted_bg", "fitted_pnbd"))
     def test_map_posterior_mix_fit_types(
-        self, fitted_bg, fitted_gg, test_summary_data, bg_map, gg_map
+        self,
+        request,
+        test_summary_data,
+        fitted_gg,
+        gg_map,
+        transaction_model_map,
+        transaction_model,
     ):
         """Test we can mix a model that was fit with MAP and one that was fit with sample."""
+        transaction_model = request.getfixturevalue(transaction_model)
         t = test_summary_data.head()
 
         # Copy model with thinned chain/draw as would be obtained from MAP
-        if bg_map:
-            bg = BetaGeoModel(fitted_bg.data, model_config=fitted_bg.model_config)
-            bg.build_model()
-            bg.idata = fitted_bg.idata.sel(chain=slice(0, 1), draw=slice(0, 1))
-        else:
-            bg = fitted_bg
+        if transaction_model_map:
+            transaction_model = transaction_model._build_with_idata(
+                transaction_model.idata.sel(chain=slice(0, 1), draw=slice(0, 1))
+            )
 
         if gg_map:
-            gg = GammaGammaModel(fitted_gg.data, model_config=fitted_gg.model_config)
-            gg.build_model()
-            gg.idata = fitted_gg.idata.sel(chain=slice(0, 1), draw=slice(0, 1))
-        else:
-            gg = fitted_gg
+            fitted_gg = fitted_gg._build_with_idata(
+                fitted_gg.idata.sel(chain=slice(0, 1), draw=slice(0, 1))
+            )
 
         res = customer_lifetime_value(
-            transaction_model=bg,
-            customer_id=t.index,
+            transaction_model=transaction_model,
+            customer_id=t["customer_id"],
             frequency=t["frequency"],
             recency=t["recency"],
             T=t["T"],
-            monetary_value=gg.expected_customer_spend(
+            monetary_value=fitted_gg.expected_customer_spend(
                 t.index,
                 mean_transaction_value=t["monetary_value"],
                 frequency=t["frequency"],
@@ -276,11 +313,19 @@
         )
         assert res.dims == ("chain", "draw", "customer_id")
 
-    def test_clv_after_thinning(self, test_summary_data, fitted_gg, fitted_bg):
+    @pytest.mark.parametrize("transaction_model", ("fitted_bg", "fitted_pnbd"))
+    def test_clv_after_thinning(
+        self,
+        request,
+        test_summary_data,
+        fitted_gg,
+        transaction_model,
+    ):
+        transaction_model = request.getfixturevalue(transaction_model)
         t = test_summary_data.head()
 
         ggf_clv = fitted_gg.expected_customer_lifetime_value(
-            transaction_model=fitted_bg,
+            transaction_model=transaction_model,
             customer_id=t.index,
             frequency=t["frequency"],
             recency=t["recency"],
@@ -289,9 +334,9 @@
         )
 
         fitted_gg_thinned = fitted_gg.thin_fit_result(keep_every=10)
-        fitted_bg_thinned = fitted_bg.thin_fit_result(keep_every=10)
+        transaction_model_thinned = transaction_model.thin_fit_result(keep_every=10)
         ggf_clv_thinned = fitted_gg_thinned.expected_customer_lifetime_value(
-            transaction_model=fitted_bg_thinned,
+            transaction_model=transaction_model_thinned,
             customer_id=t.index,
             frequency=t["frequency"],
             recency=t["recency"],
diff --git a/tests/mmm/test_delayed_saturated_mmm.py b/tests/mmm/test_delayed_saturated_mmm.py
index ef2e6864..b65df3c8 100644
--- a/tests/mmm/test_delayed_saturated_mmm.py
+++ b/tests/mmm/test_delayed_saturated_mmm.py
@@ -17,7 +17,7 @@
 rng: np.random.Generator = np.random.default_rng(seed=seed)
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def generate_data():
     def _generate_data(date_data: pd.DatetimeIndex) -> pd.DataFrame:
         n: int = date_data.size
@@ -37,7 +37,7 @@ def _generate_data(date_data: pd.DatetimeIndex) -> pd.DataFrame:
     return _generate_data
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def toy_X(generate_data) -> pd.DataFrame:
     date_data: pd.DatetimeIndex = pd.date_range(
         start="2019-06-01", end="2021-12-31", freq="W-MON"
@@ -80,12 +80,12 @@ def model_config_requiring_serialization() -> Dict:
     return model_config
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def toy_y(toy_X: pd.DataFrame) -> pd.Series:
     return pd.Series(data=rng.integers(low=0, high=100, size=toy_X.shape[0]))
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def mmm() -> DelayedSaturatedMMM:
     return DelayedSaturatedMMM(
         date_column="date",
@@ -95,7 +95,7 @@ def mmm() -> DelayedSaturatedMMM:
     )
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def mmm_with_fourier_features() -> DelayedSaturatedMMM:
     return DelayedSaturatedMMM(
         date_column="date",
@@ -106,7 +106,7 @@ def mmm_with_fourier_features() -> DelayedSaturatedMMM:
     )
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def mmm_fitted(
     mmm: DelayedSaturatedMMM, toy_X: pd.DataFrame, toy_y: pd.Series
 ) -> DelayedSaturatedMMM:
@@ -114,7 +114,7 @@ def mmm_fitted(
     return mmm
 
 
-@pytest.fixture(scope="class")
+@pytest.fixture(scope="module")
 def mmm_fitted_with_fourier_features(
     mmm_with_fourier_features: DelayedSaturatedMMM,
     toy_X: pd.DataFrame,
diff --git a/tests/mmm/test_transformers.py b/tests/mmm/test_transformers.py
index 61595aa5..6e294bbe 100644
--- a/tests/mmm/test_transformers.py
+++ b/tests/mmm/test_transformers.py
@@ -15,6 +15,7 @@
     delayed_adstock,
     geometric_adstock,
     logistic_saturation,
+    scale_preserving_logistic_saturation,
     tanh_saturation,
     tanh_saturation_baselined,
     weibull_adstock,
@@ -299,6 +300,59 @@ def test_logistic_saturation_min_max_value(self, x, lam):
         assert y_eval.max() <= 1
         assert y_eval.min() >= 0
 
+    def test_scale_preserving_logistic_saturation_m_zero(self):
+        # All values should be zero
+        x = np.ones(shape=(100))
+        y = scale_preserving_logistic_saturation(x=x, m=0.0)
+        np.testing.assert_array_almost_equal(x=np.zeros(shape=(100)), y=y.eval())
+
+    def test_scale_preserving_logistic_saturation_m_eq_x(self):
+        # At x = m, the saturation f(x) reduces to
+        #     f(x) = m * (1 - exp(-2)) / (1 + exp(-2))
+        #          = m * 0.761594...
+        # This test asserts that this is the case
+        x = np.ones(shape=(100))
+        y = scale_preserving_logistic_saturation(x=x, m=1)
+        np.testing.assert_array_almost_equal(
+            x=np.ones(shape=(100)) * 0.761594, y=y.eval(), decimal=3
+        )
+
+    @pytest.mark.parametrize(
+        "x",
+        [
+            np.ones(shape=(100)),
+            np.linspace(start=0.0, stop=1.0, num=50),
+            np.linspace(start=200, stop=1000, num=50),
+        ],
+    )
+    def test_scale_preserving_logistic_saturation_m_large(self, x):
+        # When the asymptote is large, f(x) is approximately x
+        y = scale_preserving_logistic_saturation(x=x, m=1e9)
+        np.testing.assert_array_almost_equal(x=x, y=y.eval(), decimal=0)
+
+    @pytest.mark.parametrize(
+        "x, m",
+        [
+            (np.ones(shape=(100)), 30),
+            (np.linspace(start=0.0, stop=1.0, num=50), 90),
+            (np.linspace(start=200, stop=1000, num=50), 17),
+            (np.zeros(shape=(100)), 200),
+        ],
+    )
+    def test_scale_preserving_logistic_saturation_bounds(self, x, m):
+        # Check that the values are within the range [0, m]
+        y = scale_preserving_logistic_saturation(x=x, m=m)
+        assert y.eval().max() <= m
+        assert y.eval().min() >= 0
+
+    @pytest.mark.parametrize("m", [10, 100, 1000])
+    def test_scale_preserving_logistic_saturation_slope(self, m):
+        # Check that the slope never exceeds 1
+        x = np.linspace(0, 10, 100)
+        y = scale_preserving_logistic_saturation(x, m=m).eval()
+        dy_dx = np.diff(y) / np.diff(x)
+        assert np.all(dy_dx <= 1)
+
     @pytest.mark.parametrize(
         "x, b, c",
        [
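
Reviewer note (not part of the patch): algebraically, the new transform is m * tanh(x / m), since (1 - e^(-2u)) / (1 + e^(-2u)) = tanh(u). Below is a minimal NumPy sketch of the properties that the docstring and the tests above rely on; the helper name f is illustrative, not from the codebase:

    import numpy as np

    def f(x, m):
        # Same closed form as the pytensor implementation above: f(x) = m * tanh(x / m)
        return m * (1 - np.exp(-2 / m * x)) / (1 + np.exp(-2 / m * x))

    x = np.linspace(0, 10_000, 1_000)
    m = 2_000.0

    np.testing.assert_allclose(f(x, m), m * np.tanh(x / m))  # tanh identity
    assert f(x, m).max() <= m                      # bounded by the asymptote m
    assert np.all(f(x, m) <= x + 1e-9)             # "effective spend" <= "actual spend"
    assert np.all(np.diff(f(x, m)) <= np.diff(x))  # slope never exceeds 1

The 2/m rate in the exponent makes the slope exactly 1 at the origin, which is what makes the transform scale-preserving: for spend well below m, effective spend is approximately equal to actual spend.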