Skip to content

Commit

Permalink
add bayesian logistic regression
Browse files Browse the repository at this point in the history
  • Loading branch information
rnburn committed Oct 21, 2024
1 parent 442db8a commit 4cb212d
Show file tree
Hide file tree
Showing 10 changed files with 324 additions and 1 deletion.
6 changes: 6 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,9 @@ jobs:
- name: Delta Binomial Testing
run: |
python test/delta_binomial_test.py
- name: Bayesian Logistic Regression1 Testing
run: |
python test/bayesian_logistic_regression1_test.py
- name: Bounded Normal Testing
run: |
python test/bounded_normal_test.py
2 changes: 2 additions & 0 deletions bbai/glm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
from ._logistic_regression_map import LogisticRegressionMAP
from ._ridge_regression import RidgeRegression
from ._bayesian_ridge_regression import BayesianRidgeRegression
from ._bayesian_logistic_regression1 import BayesianLogisticRegression1

__all__ = [
'LogisticRegression',
'LogisticRegressionMAP',
'BayesianRidgeRegression',
'BayesianLogisticRegression1',
'RidgeRegression',
]
83 changes: 83 additions & 0 deletions bbai/glm/_bayesian_logistic_regression1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from .._computation._bridge import glmlg1

import numpy as np

class _Prediction:
def __init__(self, model, xp):
self.model_ = model
self.xp_ = xp

def pdf(self, p):
"""Probability density function for the prediction posterior in p"""
return glmlg1.prediction_pdf(self.model_, self.xp_, p)

def cdf(self, p):
"""Cumulative density function for the prediction posterior in p"""
return glmlg1.prediction_cdf(self.model_, self.xp_, p)

def ppf(self, t):
"""Percentage point function for the prediction posterior in p"""
return glmlg1.prediction_ppf(self.model_, self.xp_, t)

class BayesianLogisticRegression1:
"""Model logistic regression with a single unknown weight variable and the reference prior.
Note: Here the reference prior works out to be the same as Jeffreys prior.
Additionally, using the parameters w_min and w_max the prior can be restricted to an
interval.
Parameters
----------
w_min : default='-∞'
A minimum weight restriction for the prior.
w_max : default='∞'
A maximum weight restriction for the prior.
Example
--------
>>> from bbai.glm import BayesianLogisticRegression1
>>>
>>> x = [-5, 2, 8, 1]
>>> y = [0, 1, 0, 1]
>>> model = BayesianLogisticRegression1()
>>>
>>> # Fit a posterior distribution for w with the logistic
>>> # regression reference prior
>>> model.fit(x, y)
>>>
>>> # Print the probability that w < 0.123
>>> print(model.cdf(0.123))
"""
def __init__(self, w_min=-np.inf, w_max=np.inf):
self.w_min_ = w_min
self.w_max_ = w_max

def fit(self, x, y):
yp = np.zeros(len(y))
for i, yi in enumerate(y):
assert yi == 0 or yi == 1
yp[i] = -1.0 if yi == 0 else 1.0
self.model_ = glmlg1.logistic1_fit(x, yp, self.w_min_, self.w_max_)

def prior_pdf(self, w):
"""Probability density function for the prior in w"""
return glmlg1.prior_pdf(self.model_, w)

def pdf(self, w):
"""Probability density function for the posterior in w"""
return glmlg1.pdf(self.model_, w)

def cdf(self, w):
"""Cumulative density function for the posterior in w"""
return glmlg1.cdf(self.model_, w)

def ppf(self, t):
"""Percentage point function for the posterior in w"""
return glmlg1.ppf(self.model_, t)

def predict(self, xp):
"""Return the posterior distribution for p where p denotes the probability 1 / (1 + exp(-xp * w))"""
assert xp != 0.0
return _Prediction(self.model_, xp)
4 changes: 4 additions & 0 deletions bbai/model/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from ._delta_binomial import DeltaBinomialModel
from ._bounded_normal import BoundedNormal
from ._restricted_t import RestrictedT

__all__ = [
'DeltaBinomialModel',
'BoundedNormal',
'RestrictedT',
]

31 changes: 31 additions & 0 deletions bbai/model/_bounded_normal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from bbai._computation._bridge import mdlbn

import numpy as np

class BoundedNormal:
def __init__(self, a, b):
assert a < b
self.a_ = a
self.b_ = b

def fit_mean_var(self, n, mean, variance):
self.model_ = mdlbn.bounded_normal_fit(self.a_, self.b_, n, mean, variance)

def fit(self, y):
n = len(y)
mean = np.mean(y)
sigma = np.std(y)
self.fit_mean_var(n, mean, sigma**2)

def pdf(self, x):
assert self.model_ is not None
return mdlbn.pdf(self.model_, x)

def cdf(self, x):
assert self.model_ is not None
return mdlbn.cdf(self.model_, x)

def ppf(self, t):
assert 0 <= t and t <= 1
assert self.model_ is not None
return mdlbn.ppf(self.model_, t)
2 changes: 1 addition & 1 deletion bbai/model/_delta_binomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def fit(self, a1, b1, a2, b2):
self.model_ = mdldb.delta_binomial_fit(self.prior_, a1, b1, a2, b2)

def cdf(self, t):
"""Cumulative distribution function for the posterior in p1-p2"""
"""Cumulative density function for the posterior in p1-p2"""
assert -1 <= t and t <= 1
assert self.model_ is not None
return mdldb.delta_binomial_cdf(self.model_, t)
Expand Down
39 changes: 39 additions & 0 deletions bbai/model/_restricted_t.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import numpy as np
import scipy

class RestrictedT:
def __init__(self, a, b):
assert a < b
self.a_ = a
self.b_ = b

def fit_mean_var(self, n, mean, variance):
s = np.sqrt(variance * n / (n - 1))
s /= np.sqrt(n)
self.model_ = scipy.stats.t(n-1, loc=mean, scale=s)

self.cdf_a_ = self.model_.cdf(self.a_)
self.norm_ = self.model_.cdf(self.b_) - self.cdf_a_

def fit(self, y):
mean = np.mean(y)
var = np.std(y)**2
self.fit_mean_var(len(y), mean, var)

def pdf(self, x):
if x < self.a_ or x > self.b_:
return 0
return self.model_.pdf(x) / self.norm_

def cdf(self, x):
if x < self.a_:
return 0
if x > self.b_:
return 1
return (self.model_.cdf(x) - self.cdf_a) / self.norm_

def ppf(self, t):
assert 0 <= t and t <= 1
t *= self.norm_
t += self.cdf_a_
return self.model_.ppf(t)
115 changes: 115 additions & 0 deletions test/bayesian_logistic_regression1_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import pytest
import numpy as np
from bbai.glm import BayesianLogisticRegression1
import scipy

def log_prior(x, w):
s = 0
for xi in x:
p = 1 / (1 + np.exp(-xi * w))
q = 1 - p
s += xi**2 * p * q
return 0.5 * np.log(s)

def log_like(x, y, w):
res = 0
for xi, yi in zip(x, y):
mult = 2 * yi - 1
res += np.log(1 / (1 + np.exp(-xi * mult * w)))
return res

def log_post(x, y, w):
return log_like(x, y, w) + log_prior(x, w)

def test_bayesian_logistic_regression1():
x = [1.2, -3.4, 5.1]
y = [1.0, 1.0, 0.0]
model = BayesianLogisticRegression1()
model.fit(x, y)

# test relative pdfs
w1 = 0.123
w2 = 0.321
relative_pdf = model.pdf(w2) / model.pdf(w1)
expected = np.exp(log_post(x, y, w2) - log_post(x, y, w1))
assert relative_pdf == pytest.approx(expected)

# pdf integrates to 1
N = 1000
wmin = -10
wmax = 10
step = (wmax - wmin) / N
integral = 0
for i in range(N):
integral += model.pdf(wmin + i * step) * step
assert integral == pytest.approx(1.0)

# cdf matches the integral of pdf
N = 2000
wmin = -10
wmax = 0.123
step = (wmax - wmin) / N
integral = 0
for i in range(N):
integral += model.pdf(wmin + i * step) * step
assert integral == pytest.approx(model.cdf(wmax), rel=1.0e-3)

# cdf and ppf match up
assert 0.123 == pytest.approx(model.ppf(model.cdf(0.123)), rel=1.0e-3)

# prediction pdf matches the transformed posterior
xp = 0.67
pred = model.predict(xp)
def phi(p):
return np.log(p / (1 - p)) / xp
eps = 1.0e-9
p = 0.123
w = phi(p)
phi_dot = (phi(p + eps) - phi(p)) / eps
assert pred.pdf(p) == pytest.approx(model.pdf(w) * phi_dot)

# prediction cdf matches w cdf
p = 0.123
xp = 0.67
pred = model.predict(xp)
w = np.log(p / (1 - p)) / xp
assert p == pytest.approx(1 / (1 + np.exp(-xp * w)))
median = pred.cdf(p)
expected = model.cdf(w)
assert median == pytest.approx(expected)

# prediction cdf and ppf match up
xp = 0.67
pred = model.predict(xp)
p = pred.ppf(0.123)
assert pred.cdf(p) == pytest.approx(0.123)

# we properly handle endpoints
assert pred.pdf(0.0) == 0.0
assert pred.pdf(1.0) == 0.0

# prediction pdf integrates to 1
N = 1000
pred = model.predict(0.123)
pmin = 0.0
pmax = 1.0
step = (pmax - pmin) / N
integral = 0
for i in range(N):
integral += pred.pdf(pmin + i * step) * step
assert integral == pytest.approx(1.0)

def test_bayesian_logistic_regression1_separable():
x = [1.2, -3.4, 5.1]
y = [1.0, 0.0, 1.0]
model = BayesianLogisticRegression1()
model.fit(x, y)

xp = 0.67
pred = model.predict(xp)

assert pred.pdf(0.0) == 0.0
assert pred.pdf(1.0) == np.inf

if __name__ == "__main__":
raise SystemExit(pytest.main([__file__]))
43 changes: 43 additions & 0 deletions test/bounded_normal_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pytest
import numpy as np
from bbai.model import BoundedNormal
import scipy

def test_bounded_normal_model():
a = -0.123
b = 4.5
model = BoundedNormal(a, b)
y = [0, 3, 4]
model.fit(y)

# PDF goes to zero at the endpoints
assert model.pdf(a) == 0.0
assert model.pdf(b) == 0.0

# PDF is zero outside of the boundaries
assert model.pdf(a - 0.01) == 0.0
assert model.pdf(b + 0.01) == 0.0

# PDF integrates to 1
S = 0
N = 2000
step = (b - a) / N
for i in range(N):
S += model.pdf(a + i * step) * step
assert S == pytest.approx(1.0)

# cdf matches up with numeric integration
t = 1.23
S = 0
step = (t - a) / N
for i in range(N):
S += model.pdf(a + i * step) * step
assert S == pytest.approx(model.cdf(t), rel=1.0e-3)

# cdf and ppf match up
median = model.ppf(0.5)
assert model.cdf(median) == pytest.approx(0.5, rel=1.0e-3)

if __name__ == "__main__":
raise SystemExit(pytest.main([__file__]))

Binary file not shown.

0 comments on commit 4cb212d

Please sign in to comment.