diff --git a/.nojekyll b/.nojekyll
index c8b36ba6..246e4379 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-8764a1e0
\ No newline at end of file
+41d81d09
\ No newline at end of file

diff --git a/notebooks/horseshoe_prior.html b/notebooks/horseshoe_prior.html
new file mode 100644
index 00000000..54512e35
--- /dev/null
+++ b/notebooks/horseshoe_prior.html
@@ -0,0 +1,537 @@
Horseshoe Prior

In this example, we will use the Horseshoe Prior (Carvalho et al., 2009) to fit a regression with a large number of predictor variables, only a few of which have slopes meaningfully different from zero.

import arviz as az
import bambi as bmb
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt

from matplotlib import pyplot as plt

Here is what we did to simulate the data:

  • We defined an intercept.
  • We defined a vector of 50 betas, 5 of which were drawn from a Normal(5, 1) distribution and then assigned a random sign.
  • We created the design matrix with Normal(0, 1) entries and set \(\sigma\) to 1.
  • We calculated the deterministic means \(\mu\) using the intercept and the design matrix multiplied by the betas.
  • We simulated 100 response values (observations) from a normal distribution with mean \(\mu\) and standard deviation \(\sigma\).

Next, we proceeded with the Bayesian estimation of the model using the horseshoe prior. The model and its priors are specified as follows:

\[\mu_i = \alpha + \beta_1 x_{1i} + \beta_2 x_{2i} + \cdots + \beta_p x_{pi}\]

\[y_i \sim N(\mu_i, \sigma^2)\]

\[\alpha \sim N(0, 1)\]

\[\beta_j \sim N(0, \lambda_j^2 \tau^2)\]

\[\lambda_j \sim C^+(0, 1)\]

\[\tau \sim T^+(\mathrm{df}=3)\]

\[\sigma^2 \sim N^+(0, 1)\]
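
To get a feel for what these priors imply, consider a minimal sketch that draws slopes directly from the horseshoe: most land very close to zero while a few can be very large. The global scale is fixed at \(\tau = 0.1\) purely for illustration (in the model \(\tau\) has its own prior), and the *_demo names are ours.

# Minimal sketch of the shrinkage the horseshoe prior encodes.
# tau is fixed at 0.1 here only for illustration; the model places a prior on it.
rng_demo = np.random.default_rng(0)
tau_demo = 0.1
lam_demo = np.abs(rng_demo.standard_cauchy(10_000))  # lambda_j ~ C+(0, 1)
beta_demo = rng_demo.normal(0, lam_demo * tau_demo)  # beta_j ~ N(0, lambda_j^2 tau^2)
print((np.abs(beta_demo) < 0.05).mean(), np.abs(beta_demo).max())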
D = 50
D0 = 5

SEED = 123456789  # for reproducibility

rng = np.random.default_rng(SEED)

INTERCEPT = rng.uniform(-3, 3)  # simulate an intercept

COEF = np.zeros(D)
# Simulate the slopes for the significant variables
COEF[:D0] = rng.choice([-1, 1], size=D0) * rng.normal(5, 1, size=D0)

N = 100
X = rng.normal(size=(N, D))
SIGMA = 1.0
# Simulate the data
y = INTERCEPT + X.dot(COEF) + rng.normal(0, SIGMA, size=N)
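
As a rough point of reference (a sketch using only numpy; X_design and ols_coef are our own names), ordinary least squares on this 100 × 50 design returns nonzero estimates for all 50 slopes, including the 45 that are truly zero — exactly the setting a sparsity-inducing prior such as the horseshoe is meant to handle.

# Unpenalized least-squares baseline on the same simulated design.
X_design = np.column_stack([np.ones(N), X])           # prepend an intercept column
ols_coef, *_ = np.linalg.lstsq(X_design, y, rcond=None)
print(np.abs(ols_coef[1 + D0:]).max())                # largest |estimate| among the truly-zero slopes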

Here we create the dataframe and build the term name for the set of variables, which we then use to define the model formula. Wrapping the 50 predictors in c() groups them into a single term, so later we can assign one prior to all of their slopes at once.

df = pd.DataFrame(X)
df.columns = [f"x{i}" for i in range(X.shape[1])]
df["y"] = y


term_name = "c(" + ", ".join([f"x{i}" for i in range(X.shape[1])]) + ")"
formula = f"y ~ {term_name}"
formula

'y ~ c(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49)'

Finally, we assign the Horseshoe prior to the grouped term and create the model.

priors = {
    term_name: bmb.Prior("Horseshoe"),
}
model = bmb.Model(formula, df, priors=priors)
model.set_alias({term_name: "predictors"})

model.build()
model.graph()

[Model graph: predictors_raw (Normal), predictors_lam (HalfStudentT), and predictors_tau (HalfStudentT) feed the Deterministic predictors term; Intercept (Normal) and predictors determine mu; mu and sigma (HalfStudentT) determine the Normal likelihood of y.]

idata = model.fit(target_accept=0.95, chains=2)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma, Intercept, predictors_tau, predictors_lam, predictors_raw]

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 444 seconds.
There were 64 divergences after tuning. Increase `target_accept` or reparameterize.
Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 1 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
We recommend running at least 4 chains for robust computation of convergence diagnostics
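
These warnings can also be inspected programmatically; for instance, a small check (ours, not part of the notebook's workflow as written) that counts the divergent transitions stored in the trace's sampling statistics:

# Number of divergent transitions across both chains.
int(idata.sample_stats["diverging"].sum())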

To make sampling easier, we refit the model with tau_nu=3 and lam_nu=3, which gives the local scales \(\lambda_j\) lighter Student-t tails than the half-Cauchy above, and we also increase target_accept.

priors = {
    term_name: bmb.Prior("Horseshoe", tau_nu=3, lam_nu=3),
}
model = bmb.Model(formula, df, priors=priors)
model.set_alias({term_name: "predictors"})

model.build()
model.graph()

[Model graph: same structure as above — predictors_raw, predictors_lam, and predictors_tau feed predictors; Intercept and predictors determine mu; mu and sigma determine y.]

idata = model.fit(target_accept=0.97, chains=2)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma, Intercept, predictors_tau, predictors_lam, predictors_raw]

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 293 seconds.
There were 29 divergences after tuning. Increase `target_accept` or reparameterize.
We recommend running at least 4 chains for robust computation of convergence diagnostics

Sampling was faster and produced fewer divergences this time. Finally, we compare the posteriors of the five nonzero coefficients with the values used to simulate the data.

ax, = az.plot_forest(
    idata,
    var_names=["predictors"],
    coords={"predictors_dim": range(D0)},
    kind="ridgeplot",
    ridgeplot_truncate=False,
    ridgeplot_alpha=0.5,
    hdi_prob=0.95,
    combined=True,
    figsize=(8, 6)
)
ax.scatter(COEF[:D0][::-1], ax.get_yticks(), c="C1", label="Actual value")
ax.set_xlabel(r"$\beta_i$")
ax.set_ylim(bottom=None, top=1.55 * ax.get_yticks().max())
ax.set_yticklabels(range(D0)[::-1])
ax.set_ylabel(r"$i$")
ax.legend(loc="upper center")
ax.set_title("Posterior distribution of nonzero coefficients");

[Ridge plot: posterior distributions of the five nonzero coefficients, with the values used in the simulation marked as "Actual value".]
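
The forest plot covers only the five nonzero slopes. As a complementary check (a sketch of ours; zero_summary is our own name and predictors_dim is the coordinate used above), we can summarize the remaining 45 coefficients to confirm the horseshoe prior shrinks them toward zero:

# Posterior summaries of the 45 slopes whose true value is zero; their means
# should sit close to zero with narrow intervals.
zero_summary = az.summary(
    idata, var_names=["predictors"], coords={"predictors_dim": list(range(D0, D))}
)
print(zero_summary[["mean", "hdi_3%", "hdi_97%"]].describe())
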
%load_ext watermark
%watermark -n -u -v -iv -w

Last updated: Thu Aug 22 2024

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.24.0

pandas    : 2.2.2
numpy     : 1.26.4
bambi     : 0.14.1.dev12+g64e57423.d20240730
arviz     : 0.18.0
matplotlib: 3.8.4
pymc      : 5.16.1
pytensor  : 2.23.0

Watermark: 2.4.3
diff --git a/notebooks/horseshoe_prior_files/figure-html/cell-10-output-1.png b/notebooks/horseshoe_prior_files/figure-html/cell-10-output-1.png
new file mode 100644
index 00000000..c788744f
Binary files /dev/null and b/notebooks/horseshoe_prior_files/figure-html/cell-10-output-1.png differ

diff --git a/notebooks/horseshoe_prior_files/figure-html/cell-6-output-1.svg b/notebooks/horseshoe_prior_files/figure-html/cell-6-output-1.svg
new file mode 100644
index 00000000..ac2d728b
[SVG: the model graph shown in the notebook — predictors_dim (50) and __obs__ (100) plates; predictors_tau, predictors_lam, predictors_raw → predictors; Intercept, predictors → mu; mu, sigma → y.]

diff --git a/notebooks/horseshoe_prior_files/figure-html/cell-8-output-1.svg b/notebooks/horseshoe_prior_files/figure-html/cell-8-output-1.svg
new file mode 100644
index 00000000..ac2d728b
[SVG: identical model graph to cell-6-output-1.svg.]

diff --git a/search.json b/search.json
index 55333c11..dbd5288d 100644