add bayesian logistic regression

rnburn · Oct 21, 2024 · 4cb212d · 4cb212d
1 parent 442db8a
commit 4cb212d
Show file tree

Hide file tree

Showing 10 changed files with 324 additions and 1 deletion.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -54,3 +54,9 @@ jobs:
     - name: Delta Binomial Testing
       run: |
         python test/delta_binomial_test.py
+    - name: Bayesian Logistic Regression1 Testing
+      run: |
+        python test/bayesian_logistic_regression1_test.py
+    - name: Bounded Normal Testing
+      run: |
+        python test/bounded_normal_test.py
diff --git a/bbai/glm/__init__.py b/bbai/glm/__init__.py
@@ -2,10 +2,12 @@
 from ._logistic_regression_map import LogisticRegressionMAP
 from ._ridge_regression import RidgeRegression
 from ._bayesian_ridge_regression import BayesianRidgeRegression
+from ._bayesian_logistic_regression1 import BayesianLogisticRegression1
 
 __all__ = [
         'LogisticRegression',
         'LogisticRegressionMAP',
         'BayesianRidgeRegression',
+        'BayesianLogisticRegression1',
         'RidgeRegression',
 ]
diff --git a/bbai/glm/_bayesian_logistic_regression1.py b/bbai/glm/_bayesian_logistic_regression1.py
@@ -0,0 +1,83 @@
+from .._computation._bridge import glmlg1
+
+import numpy as np
+
+class _Prediction:
+    def __init__(self, model, xp): 
+        self.model_ = model
+        self.xp_ = xp
+
+    def pdf(self, p):
+        """Probability density function for the prediction posterior in p"""
+        return glmlg1.prediction_pdf(self.model_, self.xp_, p)
+
+    def cdf(self, p):
+        """Cumulative density function for the prediction posterior in p"""
+        return glmlg1.prediction_cdf(self.model_, self.xp_, p)
+
+    def ppf(self, t):
+        """Percentage point function for the prediction posterior in p"""
+        return glmlg1.prediction_ppf(self.model_, self.xp_, t)
+
+class BayesianLogisticRegression1:
+    """Model logistic regression with a single unknown weight variable and the reference prior.
+
+    Note: Here the reference prior works out to be the same as Jeffreys prior.
+
+    Additionally, using the parameters w_min and w_max the prior can be restricted to an
+    interval.
+    
+    Parameters
+    ----------
+    w_min : default='-∞'
+        A minimum weight restriction for the prior.
+
+    w_max : default='∞'
+        A maximum weight restriction for the prior.
+
+    Example
+    --------
+    >>> from bbai.glm import BayesianLogisticRegression1
+    >>> 
+    >>> x = [-5, 2, 8, 1]
+    >>> y = [0, 1, 0, 1]
+    >>> model = BayesianLogisticRegression1()
+    >>>
+    >>> # Fit a posterior distribution for w with the logistic
+    >>> # regression reference prior
+    >>> model.fit(x, y)
+    >>>
+    >>> # Print the probability that w < 0.123
+    >>> print(model.cdf(0.123))
+    """
+    def __init__(self, w_min=-np.inf, w_max=np.inf):
+        self.w_min_ = w_min
+        self.w_max_ = w_max
+
+    def fit(self, x, y):
+        yp = np.zeros(len(y))
+        for i, yi in enumerate(y):
+            assert yi == 0 or yi == 1
+            yp[i] = -1.0 if yi == 0 else 1.0
+        self.model_ = glmlg1.logistic1_fit(x, yp, self.w_min_, self.w_max_)
+
+    def prior_pdf(self, w):
+        """Probability density function for the prior in w"""
+        return glmlg1.prior_pdf(self.model_, w)
+
+    def pdf(self, w):
+        """Probability density function for the posterior in w"""
+        return glmlg1.pdf(self.model_, w)
+
+    def cdf(self, w):
+        """Cumulative density function for the posterior in w"""
+        return glmlg1.cdf(self.model_, w)
+
+    def ppf(self, t):
+        """Percentage point function for the posterior in w"""
+        return glmlg1.ppf(self.model_, t)
+
+    def predict(self, xp):
+        """Return the posterior distribution for p where p denotes the probability 1 / (1 + exp(-xp * w))"""
+        assert xp != 0.0
+        return _Prediction(self.model_, xp)
diff --git a/bbai/model/__init__.py b/bbai/model/__init__.py
@@ -1,6 +1,10 @@
 from ._delta_binomial import DeltaBinomialModel
+from ._bounded_normal import BoundedNormal
+from ._restricted_t import RestrictedT
 
 __all__ = [
         'DeltaBinomialModel',
+        'BoundedNormal',
+        'RestrictedT',
 ]
 
diff --git a/bbai/model/_bounded_normal.py b/bbai/model/_bounded_normal.py
@@ -0,0 +1,31 @@
+from bbai._computation._bridge import mdlbn
+
+import numpy as np
+
+class BoundedNormal:
+    def __init__(self, a, b):
+        assert a < b
+        self.a_ = a
+        self.b_ = b
+
+    def fit_mean_var(self, n, mean, variance):
+        self.model_ = mdlbn.bounded_normal_fit(self.a_, self.b_, n, mean, variance)
+
+    def fit(self, y):
+        n = len(y)
+        mean = np.mean(y)
+        sigma = np.std(y)
+        self.fit_mean_var(n, mean, sigma**2)
+
+    def pdf(self, x):
+        assert self.model_ is not None
+        return mdlbn.pdf(self.model_, x)
+
+    def cdf(self, x):
+        assert self.model_ is not None
+        return mdlbn.cdf(self.model_, x)
+
+    def ppf(self, t):
+        assert 0 <= t and t <= 1
+        assert self.model_ is not None
+        return mdlbn.ppf(self.model_, t)
diff --git a/bbai/model/_delta_binomial.py b/bbai/model/_delta_binomial.py
@@ -64,7 +64,7 @@ def fit(self, a1, b1, a2, b2):
         self.model_ = mdldb.delta_binomial_fit(self.prior_, a1, b1, a2, b2)
 
     def cdf(self, t):
-        """Cumulative distribution function for the posterior in p1-p2"""
+        """Cumulative density function for the posterior in p1-p2"""
         assert -1 <= t and t <= 1
         assert self.model_ is not None
         return mdldb.delta_binomial_cdf(self.model_, t)

diff --git a/bbai/model/_restricted_t.py b/bbai/model/_restricted_t.py
@@ -0,0 +1,39 @@
+import numpy as np
+import scipy
+
+class RestrictedT:
+    def __init__(self, a, b):
+        assert a < b
+        self.a_ = a
+        self.b_ = b
+
+    def fit_mean_var(self, n, mean, variance):
+        s = np.sqrt(variance * n / (n - 1))
+        s /= np.sqrt(n)
+        self.model_ = scipy.stats.t(n-1, loc=mean, scale=s)
+
+        self.cdf_a_ = self.model_.cdf(self.a_)
+        self.norm_ = self.model_.cdf(self.b_) - self.cdf_a_
+
+    def fit(self, y):
+        mean = np.mean(y)
+        var = np.std(y)**2
+        self.fit_mean_var(len(y), mean, var)
+
+    def pdf(self, x):
+        if x < self.a_ or x > self.b_:
+            return 0
+        return self.model_.pdf(x) / self.norm_
+
+    def cdf(self, x):
+        if x < self.a_:
+            return 0
+        if x > self.b_:
+            return 1
+        return (self.model_.cdf(x) - self.cdf_a) / self.norm_
+
+    def ppf(self, t):
+        assert 0 <= t and t <= 1
+        t *= self.norm_
+        t += self.cdf_a_
+        return self.model_.ppf(t)
diff --git a/test/bayesian_logistic_regression1_test.py b/test/bayesian_logistic_regression1_test.py
@@ -0,0 +1,115 @@
+import pytest
+import numpy as np
+from bbai.glm import BayesianLogisticRegression1
+import scipy
+
+def log_prior(x, w):
+    s = 0
+    for xi in x:
+        p = 1 / (1 + np.exp(-xi * w))
+        q = 1 - p
+        s += xi**2 * p * q
+    return 0.5 * np.log(s)
+
+def log_like(x, y, w):
+    res = 0
+    for xi, yi in zip(x, y):
+        mult = 2 * yi - 1
+        res += np.log(1 / (1 + np.exp(-xi * mult * w)))
+    return res
+
+def log_post(x, y, w):
+    return log_like(x, y, w) + log_prior(x, w)
+
+def test_bayesian_logistic_regression1():
+    x = [1.2, -3.4, 5.1]
+    y = [1.0, 1.0, 0.0]
+    model = BayesianLogisticRegression1()
+    model.fit(x, y)
+
+    # test relative pdfs
+    w1 = 0.123
+    w2 = 0.321
+    relative_pdf = model.pdf(w2) / model.pdf(w1)
+    expected = np.exp(log_post(x, y, w2) - log_post(x, y, w1))
+    assert relative_pdf == pytest.approx(expected)
+
+    # pdf integrates to 1
+    N = 1000
+    wmin = -10
+    wmax = 10
+    step = (wmax - wmin) / N
+    integral = 0
+    for i in range(N):
+        integral += model.pdf(wmin + i * step) * step
+    assert integral == pytest.approx(1.0)
+
+    # cdf matches the integral of pdf
+    N = 2000
+    wmin = -10
+    wmax = 0.123
+    step = (wmax - wmin) / N
+    integral = 0
+    for i in range(N):
+        integral += model.pdf(wmin + i * step) * step
+    assert integral == pytest.approx(model.cdf(wmax), rel=1.0e-3)
+
+    # cdf and ppf match up
+    assert 0.123 == pytest.approx(model.ppf(model.cdf(0.123)), rel=1.0e-3)
+
+    # prediction pdf matches the transformed posterior
+    xp = 0.67
+    pred = model.predict(xp)
+    def phi(p):
+        return np.log(p / (1 - p)) / xp
+    eps = 1.0e-9
+    p = 0.123
+    w = phi(p)
+    phi_dot = (phi(p + eps) - phi(p)) / eps
+    assert pred.pdf(p) == pytest.approx(model.pdf(w) * phi_dot)
+
+    # prediction cdf matches w cdf
+    p = 0.123
+    xp = 0.67
+    pred = model.predict(xp)
+    w = np.log(p / (1 - p)) / xp
+    assert p == pytest.approx(1 / (1 + np.exp(-xp * w)))
+    median = pred.cdf(p)
+    expected = model.cdf(w)
+    assert median == pytest.approx(expected)
+
+    # prediction cdf and ppf match up
+    xp = 0.67
+    pred = model.predict(xp)
+    p = pred.ppf(0.123)
+    assert pred.cdf(p) == pytest.approx(0.123)
+
+    # we properly handle endpoints
+    assert pred.pdf(0.0) == 0.0
+    assert pred.pdf(1.0) == 0.0
+
+    # prediction pdf integrates to 1
+    N = 1000
+    pred = model.predict(0.123)
+    pmin = 0.0
+    pmax = 1.0
+    step = (pmax - pmin) / N
+    integral = 0
+    for i in range(N):
+        integral += pred.pdf(pmin + i * step) * step
+    assert integral == pytest.approx(1.0)
+
+def test_bayesian_logistic_regression1_separable():
+    x = [1.2, -3.4, 5.1]
+    y = [1.0, 0.0, 1.0]
+    model = BayesianLogisticRegression1()
+    model.fit(x, y)
+
+    xp = 0.67
+    pred = model.predict(xp)
+
+    assert pred.pdf(0.0) == 0.0
+    assert pred.pdf(1.0) == np.inf
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__]))
diff --git a/test/bounded_normal_test.py b/test/bounded_normal_test.py
@@ -0,0 +1,43 @@
+import pytest
+import numpy as np
+from bbai.model import BoundedNormal
+import scipy
+
+def test_bounded_normal_model():
+    a = -0.123
+    b = 4.5
+    model = BoundedNormal(a, b)
+    y = [0, 3, 4]
+    model.fit(y)
+
+    # PDF goes to zero at the endpoints
+    assert model.pdf(a) == 0.0
+    assert model.pdf(b) == 0.0
+
+    # PDF is zero outside of the boundaries
+    assert model.pdf(a - 0.01) == 0.0
+    assert model.pdf(b + 0.01) == 0.0
+
+    # PDF integrates to 1
+    S = 0
+    N = 2000
+    step = (b - a) / N
+    for i in range(N):
+        S += model.pdf(a + i * step) * step
+    assert S == pytest.approx(1.0)
+
+    # cdf matches up with numeric integration
+    t = 1.23
+    S = 0
+    step = (t - a) / N
+    for i in range(N):
+        S += model.pdf(a + i * step) * step
+    assert S == pytest.approx(model.cdf(t), rel=1.0e-3)
+
+    # cdf and ppf match up
+    median = model.ppf(0.5)
+    assert model.cdf(median) == pytest.approx(0.5, rel=1.0e-3)
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__]))
+
diff --git a/...bbai-1.7.1-py3-none-manylinux1_x86_64.whl → ...bbai-1.8.0-py3-none-manylinux1_x86_64.whl b/...bbai-1.7.1-py3-none-manylinux1_x86_64.whl → ...bbai-1.8.0-py3-none-manylinux1_x86_64.whl