diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 44f2fe8..f8e5e26 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -54,3 +54,9 @@ jobs:
       - name: Delta Binomial Testing
         run: |
           python test/delta_binomial_test.py
+      - name: Bayesian Logistic Regression1 Testing
+        run: |
+          python test/bayesian_logistic_regression1_test.py
+      - name: Bounded Normal Testing
+        run: |
+          python test/bounded_normal_test.py
diff --git a/bbai/glm/__init__.py b/bbai/glm/__init__.py
index 2935cc5..e42c699 100644
--- a/bbai/glm/__init__.py
+++ b/bbai/glm/__init__.py
@@ -2,10 +2,12 @@
 from ._logistic_regression_map import LogisticRegressionMAP
 from ._ridge_regression import RidgeRegression
 from ._bayesian_ridge_regression import BayesianRidgeRegression
+from ._bayesian_logistic_regression1 import BayesianLogisticRegression1
 
 __all__ = [
     'LogisticRegression',
     'LogisticRegressionMAP',
     'BayesianRidgeRegression',
+    'BayesianLogisticRegression1',
     'RidgeRegression',
 ]
diff --git a/bbai/glm/_bayesian_logistic_regression1.py b/bbai/glm/_bayesian_logistic_regression1.py
new file mode 100644
index 0000000..6d7927f
--- /dev/null
+++ b/bbai/glm/_bayesian_logistic_regression1.py
@@ -0,0 +1,83 @@
+from .._computation._bridge import glmlg1
+
+import numpy as np
+
+class _Prediction:
+    def __init__(self, model, xp):
+        self.model_ = model
+        self.xp_ = xp
+
+    def pdf(self, p):
+        """Probability density function for the prediction posterior in p"""
+        return glmlg1.prediction_pdf(self.model_, self.xp_, p)
+
+    def cdf(self, p):
+        """Cumulative density function for the prediction posterior in p"""
+        return glmlg1.prediction_cdf(self.model_, self.xp_, p)
+
+    def ppf(self, t):
+        """Percentage point function for the prediction posterior in p"""
+        return glmlg1.prediction_ppf(self.model_, self.xp_, t)
+
+class BayesianLogisticRegression1:
+    """Model logistic regression with a single unknown weight variable and the reference prior.
+
+    Note: Here the reference prior works out to be the same as Jeffreys prior.
+
+    Additionally, using the parameters w_min and w_max the prior can be restricted to an
+    interval.
+
+    Parameters
+    ----------
+    w_min : default='-∞'
+        A minimum weight restriction for the prior.
+
+    w_max : default='∞'
+        A maximum weight restriction for the prior.
+
+    Example
+    --------
+    >>> from bbai.glm import BayesianLogisticRegression1
+    >>>
+    >>> x = [-5, 2, 8, 1]
+    >>> y = [0, 1, 0, 1]
+    >>> model = BayesianLogisticRegression1()
+    >>>
+    >>> # Fit a posterior distribution for w with the logistic
+    >>> # regression reference prior
+    >>> model.fit(x, y)
+    >>>
+    >>> # Print the probability that w < 0.123
+    >>> print(model.cdf(0.123))
+    """
+    def __init__(self, w_min=-np.inf, w_max=np.inf):
+        self.w_min_ = w_min
+        self.w_max_ = w_max
+
+    def fit(self, x, y):
+        yp = np.zeros(len(y))
+        for i, yi in enumerate(y):
+            assert yi == 0 or yi == 1
+            yp[i] = -1.0 if yi == 0 else 1.0
+        self.model_ = glmlg1.logistic1_fit(x, yp, self.w_min_, self.w_max_)
+
+    def prior_pdf(self, w):
+        """Probability density function for the prior in w"""
+        return glmlg1.prior_pdf(self.model_, w)
+
+    def pdf(self, w):
+        """Probability density function for the posterior in w"""
+        return glmlg1.pdf(self.model_, w)
+
+    def cdf(self, w):
+        """Cumulative density function for the posterior in w"""
+        return glmlg1.cdf(self.model_, w)
+
+    def ppf(self, t):
+        """Percentage point function for the posterior in w"""
+        return glmlg1.ppf(self.model_, t)
+
+    def predict(self, xp):
+        """Return the posterior distribution for p where p denotes the probability 1 / (1 + exp(-xp * w))"""
+        assert xp != 0.0
+        return _Prediction(self.model_, xp)
diff --git a/bbai/model/__init__.py b/bbai/model/__init__.py
index 7c3e339..7e04d93 100644
--- a/bbai/model/__init__.py
+++ b/bbai/model/__init__.py
@@ -1,6 +1,10 @@
 from ._delta_binomial import DeltaBinomialModel
+from ._bounded_normal import BoundedNormal
+from ._restricted_t import RestrictedT
 
 __all__ = [
     'DeltaBinomialModel',
+    'BoundedNormal',
+    'RestrictedT',
 ]
 
diff --git a/bbai/model/_bounded_normal.py b/bbai/model/_bounded_normal.py
new file mode 100644
index 0000000..c58fe68
--- /dev/null
+++ b/bbai/model/_bounded_normal.py
@@ -0,0 +1,31 @@
+from bbai._computation._bridge import mdlbn
+
+import numpy as np
+
+class BoundedNormal:
+    def __init__(self, a, b):
+        assert a < b
+        self.a_ = a
+        self.b_ = b
+
+    def fit_mean_var(self, n, mean, variance):
+        self.model_ = mdlbn.bounded_normal_fit(self.a_, self.b_, n, mean, variance)
+
+    def fit(self, y):
+        n = len(y)
+        mean = np.mean(y)
+        sigma = np.std(y)
+        self.fit_mean_var(n, mean, sigma**2)
+
+    def pdf(self, x):
+        assert self.model_ is not None
+        return mdlbn.pdf(self.model_, x)
+
+    def cdf(self, x):
+        assert self.model_ is not None
+        return mdlbn.cdf(self.model_, x)
+
+    def ppf(self, t):
+        assert 0 <= t and t <= 1
+        assert self.model_ is not None
+        return mdlbn.ppf(self.model_, t)
diff --git a/bbai/model/_delta_binomial.py b/bbai/model/_delta_binomial.py
index aaf927c..d035c0b 100644
--- a/bbai/model/_delta_binomial.py
+++ b/bbai/model/_delta_binomial.py
@@ -64,7 +64,7 @@ def fit(self, a1, b1, a2, b2):
         self.model_ = mdldb.delta_binomial_fit(self.prior_, a1, b1, a2, b2)
 
     def cdf(self, t):
-        """Cumulative distribution function for the posterior in p1-p2"""
+        """Cumulative density function for the posterior in p1-p2"""
         assert -1 <= t and t <= 1
         assert self.model_ is not None
         return mdldb.delta_binomial_cdf(self.model_, t)
diff --git a/bbai/model/_restricted_t.py b/bbai/model/_restricted_t.py
new file mode 100644
index 0000000..a407dc6
--- /dev/null
+++ b/bbai/model/_restricted_t.py
@@ -0,0 +1,39 @@
+import numpy as np
+import scipy.stats
+
+class RestrictedT:
+    def __init__(self, a, b):
+        assert a < b
+        self.a_ = a
+        self.b_ = b
+
+    def fit_mean_var(self, n, mean, variance):
+        s = np.sqrt(variance * n / (n - 1))
+        s /= np.sqrt(n)
+        self.model_ = scipy.stats.t(n-1, loc=mean, scale=s)
+
+        self.cdf_a_ = self.model_.cdf(self.a_)
+        self.norm_ = self.model_.cdf(self.b_) - self.cdf_a_
+
+    def fit(self, y):
+        mean = np.mean(y)
+        var = np.std(y)**2
+        self.fit_mean_var(len(y), mean, var)
+
+    def pdf(self, x):
+        if x < self.a_ or x > self.b_:
+            return 0
+        return self.model_.pdf(x) / self.norm_
+
+    def cdf(self, x):
+        if x < self.a_:
+            return 0
+        if x > self.b_:
+            return 1
+        return (self.model_.cdf(x) - self.cdf_a_) / self.norm_
+
+    def ppf(self, t):
+        assert 0 <= t and t <= 1
+        t *= self.norm_
+        t += self.cdf_a_
+        return self.model_.ppf(t)
diff --git a/test/bayesian_logistic_regression1_test.py b/test/bayesian_logistic_regression1_test.py
new file mode 100644
index 0000000..da55f9a
--- /dev/null
+++ b/test/bayesian_logistic_regression1_test.py
@@ -0,0 +1,115 @@
+import pytest
+import numpy as np
+from bbai.glm import BayesianLogisticRegression1
+import scipy
+
+def log_prior(x, w):
+    s = 0
+    for xi in x:
+        p = 1 / (1 + np.exp(-xi * w))
+        q = 1 - p
+        s += xi**2 * p * q
+    return 0.5 * np.log(s)
+
+def log_like(x, y, w):
+    res = 0
+    for xi, yi in zip(x, y):
+        mult = 2 * yi - 1
+        res += np.log(1 / (1 + np.exp(-xi * mult * w)))
+    return res
+
+def log_post(x, y, w):
+    return log_like(x, y, w) + log_prior(x, w)
+
+def test_bayesian_logistic_regression1():
+    x = [1.2, -3.4, 5.1]
+    y = [1.0, 1.0, 0.0]
+    model = BayesianLogisticRegression1()
+    model.fit(x, y)
+
+    # test relative pdfs
+    w1 = 0.123
+    w2 = 0.321
+    relative_pdf = model.pdf(w2) / model.pdf(w1)
+    expected = np.exp(log_post(x, y, w2) - log_post(x, y, w1))
+    assert relative_pdf == pytest.approx(expected)
+
+    # pdf integrates to 1
+    N = 1000
+    wmin = -10
+    wmax = 10
+    step = (wmax - wmin) / N
+    integral = 0
+    for i in range(N):
+        integral += model.pdf(wmin + i * step) * step
+    assert integral == pytest.approx(1.0)
+
+    # cdf matches the integral of pdf
+    N = 2000
+    wmin = -10
+    wmax = 0.123
+    step = (wmax - wmin) / N
+    integral = 0
+    for i in range(N):
+        integral += model.pdf(wmin + i * step) * step
+    assert integral == pytest.approx(model.cdf(wmax), rel=1.0e-3)
+
+    # cdf and ppf match up
+    assert 0.123 == pytest.approx(model.ppf(model.cdf(0.123)), rel=1.0e-3)
+
+    # prediction pdf matches the transformed posterior
+    xp = 0.67
+    pred = model.predict(xp)
+    def phi(p):
+        return np.log(p / (1 - p)) / xp
+    eps = 1.0e-9
+    p = 0.123
+    w = phi(p)
+    phi_dot = (phi(p + eps) - phi(p)) / eps
+    assert pred.pdf(p) == pytest.approx(model.pdf(w) * phi_dot)
+
+    # prediction cdf matches w cdf
+    p = 0.123
+    xp = 0.67
+    pred = model.predict(xp)
+    w = np.log(p / (1 - p)) / xp
+    assert p == pytest.approx(1 / (1 + np.exp(-xp * w)))
+    median = pred.cdf(p)
+    expected = model.cdf(w)
+    assert median == pytest.approx(expected)
+
+    # prediction cdf and ppf match up
+    xp = 0.67
+    pred = model.predict(xp)
+    p = pred.ppf(0.123)
+    assert pred.cdf(p) == pytest.approx(0.123)
+
+    # we properly handle endpoints
+    assert pred.pdf(0.0) == 0.0
+    assert pred.pdf(1.0) == 0.0
+
+    # prediction pdf integrates to 1
+    N = 1000
+    pred = model.predict(0.123)
+    pmin = 0.0
+    pmax = 1.0
+    step = (pmax - pmin) / N
+    integral = 0
+    for i in range(N):
+        integral += pred.pdf(pmin + i * step) * step
+    assert integral == pytest.approx(1.0)
+
+def test_bayesian_logistic_regression1_separable():
+    x = [1.2, -3.4, 5.1]
+    y = [1.0, 0.0, 1.0]
+    model = BayesianLogisticRegression1()
+    model.fit(x, y)
+
+    xp = 0.67
+    pred = model.predict(xp)
+
+    assert pred.pdf(0.0) == 0.0
+    assert pred.pdf(1.0) == np.inf
+
+if __name__ == "__main__":
"__main__": + raise SystemExit(pytest.main([__file__])) diff --git a/test/bounded_normal_test.py b/test/bounded_normal_test.py new file mode 100644 index 0000000..32c48e8 --- /dev/null +++ b/test/bounded_normal_test.py @@ -0,0 +1,43 @@ +import pytest +import numpy as np +from bbai.model import BoundedNormal +import scipy + +def test_bounded_normal_model(): + a = -0.123 + b = 4.5 + model = BoundedNormal(a, b) + y = [0, 3, 4] + model.fit(y) + + # PDF goes to zero at the endpoints + assert model.pdf(a) == 0.0 + assert model.pdf(b) == 0.0 + + # PDF is zero outside of the boundaries + assert model.pdf(a - 0.01) == 0.0 + assert model.pdf(b + 0.01) == 0.0 + + # PDF integrates to 1 + S = 0 + N = 2000 + step = (b - a) / N + for i in range(N): + S += model.pdf(a + i * step) * step + assert S == pytest.approx(1.0) + + # cdf matches up with numeric integration + t = 1.23 + S = 0 + step = (t - a) / N + for i in range(N): + S += model.pdf(a + i * step) * step + assert S == pytest.approx(model.cdf(t), rel=1.0e-3) + + # cdf and ppf match up + median = model.ppf(0.5) + assert model.cdf(median) == pytest.approx(0.5, rel=1.0e-3) + +if __name__ == "__main__": + raise SystemExit(pytest.main([__file__])) + diff --git a/wheel/bbai-1.7.1-py3-none-manylinux1_x86_64.whl b/wheel/bbai-1.8.0-py3-none-manylinux1_x86_64.whl similarity index 67% rename from wheel/bbai-1.7.1-py3-none-manylinux1_x86_64.whl rename to wheel/bbai-1.8.0-py3-none-manylinux1_x86_64.whl index 388a81c..28a5c46 100644 Binary files a/wheel/bbai-1.7.1-py3-none-manylinux1_x86_64.whl and b/wheel/bbai-1.8.0-py3-none-manylinux1_x86_64.whl differ