tune pooling est and rebuild
JohnMount committed Jun 13, 2024
1 parent d4c5bcd commit bd61ab0
Showing 9 changed files with 427 additions and 393 deletions.
512 changes: 256 additions & 256 deletions Examples/KDD2009Example/KDD2009Example.ipynb

Large diffs are not rendered by default.

62 changes: 31 additions & 31 deletions Examples/Pooling/PartialPoolingExample.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions coverage.txt
@@ -70,7 +70,7 @@ pkg/vtreat/__init__.py 6 0 100%
pkg/vtreat/cross_plan.py 50 1 98%
pkg/vtreat/da_adapter.py 109 1 99%
pkg/vtreat/effect_scaler.py 59 4 93%
-pkg/vtreat/partial_pooling_estimator.py 32 0 100%
+pkg/vtreat/partial_pooling_estimator.py 35 0 100%
pkg/vtreat/stats_utils.py 132 0 100%
pkg/vtreat/test_util.py 84 18 79%
pkg/vtreat/transform.py 14 0 100%
@@ -79,6 +79,6 @@ pkg/vtreat/vtreat_api.py 285 34 88%
pkg/vtreat/vtreat_db_adapter.py 1 0 100%
pkg/vtreat/vtreat_impl.py 711 61 91%
-------------------------------------------------------------
-TOTAL 1632 127 92%
+TOTAL 1635 127 92%

-================= 47 passed, 81 warnings in 111.33s (0:01:51) ==================
+================= 47 passed, 81 warnings in 104.41s (0:01:44) ==================
190 changes: 103 additions & 87 deletions docs/vtreat/partial_pooling_estimator.html

Large diffs are not rendered by default.

22 changes: 15 additions & 7 deletions pkg/build/lib/vtreat/partial_pooling_estimator.py
@@ -24,7 +24,10 @@ def standard_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
means.sort_values(["location_id"], inplace=True, ignore_index=True)
means['estimate'] = means['mean']
means["grand_mean"] = np.mean(observations["observation"])
means["impact"] = means["estimate"] - means["grand_mean"]
means["impact"] = means["estimate"]
means["impact"] = (
means["impact"]
- np.sum(means['size'] * means['impact']) / np.sum(means['size']))
means.sort_values(["location_id"], inplace=True, ignore_index=True)
return means

@@ -50,9 +53,11 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
# get counts per group
n_j = estimated_centers["size"]
per_location_observation_var = estimated_centers['var'].copy()
-# inflate a bit
per_location_observation_var[pd.isnull(per_location_observation_var)] = 0
-per_location_observation_var = (n_j * per_location_observation_var + np.var(observations['observation'])) / (n_j + 1)
+# inflate per-loc a bit
+per_location_observation_var = (
+    (n_j * per_location_observation_var + np.var(observations['observation']))
+    / (n_j + 1))
# get the observed variance between locations
between_location_var = np.var(estimated_centers["estimate"], ddof=1)
# get v, the pooling coefficient
@@ -62,10 +67,13 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
# as between_location_var > 0 and per_location_observation_var > 0 here
# v will be in the range 0 to 1
v = 1 / (1 + per_location_observation_var / (n_j * between_location_var))
-# this quantity can be improved using knowledge of the variances
-grand_mean = estimated_centers['grand_mean']
+v[n_j <= 1] = 0  # no information in size one items
+v[pd.isnull(v)] = 0
# build the pooled estimate
-pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * grand_mean
+pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * estimated_centers["grand_mean"]
estimated_centers["estimate"] = pooled_estimate
-estimated_centers['impact'] = pooled_estimate - grand_mean
+estimated_centers["impact"] = estimated_centers["estimate"]
+estimated_centers["impact"] = (
+    estimated_centers["impact"]
+    - np.sum(estimated_centers['size'] * estimated_centers['impact']) / np.sum(estimated_centers['size']))
return estimated_centers
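
The substantive change in both copies of partial_pooling_estimator.py is the centering of the impact column: instead of subtracting the grand mean, impacts are now centered so that their size-weighted sum is zero. A minimal standalone sketch of the new centering step, using hypothetical toy values rather than code from this commit:

    import numpy as np
    import pandas as pd

    # hypothetical per-location estimates and group sizes
    means = pd.DataFrame({
        "location_id": ["a", "b", "c"],
        "estimate": [2.0, 4.5, 6.0],
        "size": [3, 2, 1],
    })

    # new centering: subtract the size-weighted mean of the estimates,
    # so that sum(size * impact) == 0 by construction
    means["impact"] = (
        means["estimate"]
        - np.sum(means["size"] * means["estimate"]) / np.sum(means["size"]))

    assert np.abs(np.sum(means["size"] * means["impact"])) < 1e-8

For unpooled per-location means the size-weighted mean already equals the grand mean of the observations, so this centering only changes results once pooling has shrunk the estimates.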
Binary file modified pkg/dist/vtreat-1.3.1-py3-none-any.whl
Binary file not shown.
Binary file modified pkg/dist/vtreat-1.3.1.tar.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions pkg/tests/test_partial_pooling.py
@@ -10,6 +10,7 @@ def test_standard_effect_estimate():
'observation': [ 1, 2, 3, 4, 5, 6],
})
r = standard_effect_estimate(d)
+assert np.abs(np.sum(r['size'] * r['impact'])) < 1e-5
expect = pd.DataFrame({
'location_id': ['a', 'b', 'c'],
'mean': [2.0, 4.5, 6.0],
@@ -28,14 +29,15 @@ def test_pooled_effect_estimate():
'observation': [ 1, 2, 3, 4, 5, 6],
})
r = pooled_effect_estimate(d)
+assert np.abs(np.sum(r['size'] * r['impact'])) < 1e-5
expect = pd.DataFrame({
'location_id': ['a', 'b', 'c'],
'mean': [2.0, 4.5, 6.0],
'var': [1.0, 0.5, np.nan],
'size': [3, 2, 1],
-'estimate': [2.161608, 4.362170, 5.342105],
+'estimate': [2.161608, 4.362170, 3.500000],
'grand_mean': [3.5, 3.5, 3.5],
-'impact': [-1.338392, 0.862170, 1.842105],
+'impact': [-0.956586, 1.243976, 0.381806],
})
assert r.shape == expect.shape
assert np.all(r.columns == expect.columns)
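
The updated expectations follow from two changes in the estimator: the size-one location 'c' now has its pooling coefficient forced to v = 0, so its estimate collapses to the grand mean 3.5, and impacts are re-centered by the size-weighted mean of the pooled estimates. A quick arithmetic check against the expected values above:

    import numpy as np

    estimate = np.array([2.161608, 4.362170, 3.500000])  # pooled estimates; 'c' fully shrunk
    size = np.array([3, 2, 1])

    # size-weighted mean of the pooled estimates, about 3.118194
    weighted_mean = np.sum(size * estimate) / np.sum(size)

    impact = estimate - weighted_mean
    print(impact)  # about [-0.956586, 1.243976, 0.381806], matching the expected frame
    assert np.abs(np.sum(size * impact)) < 1e-5  # the invariant asserted in both tests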
22 changes: 15 additions & 7 deletions pkg/vtreat/partial_pooling_estimator.py
@@ -24,7 +24,10 @@ def standard_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
means.sort_values(["location_id"], inplace=True, ignore_index=True)
means['estimate'] = means['mean']
means["grand_mean"] = np.mean(observations["observation"])
means["impact"] = means["estimate"] - means["grand_mean"]
means["impact"] = means["estimate"]
means["impact"] = (
means["impact"]
- np.sum(means['size'] * means['impact']) / np.sum(means['size']))
means.sort_values(["location_id"], inplace=True, ignore_index=True)
return means

@@ -50,9 +53,11 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
# get counts per group
n_j = estimated_centers["size"]
per_location_observation_var = estimated_centers['var'].copy()
-# inflate a bit
per_location_observation_var[pd.isnull(per_location_observation_var)] = 0
-per_location_observation_var = (n_j * per_location_observation_var + np.var(observations['observation'])) / (n_j + 1)
+# inflate per-loc a bit
+per_location_observation_var = (
+    (n_j * per_location_observation_var + np.var(observations['observation']))
+    / (n_j + 1))
# get the observed variance between locations
between_location_var = np.var(estimated_centers["estimate"], ddof=1)
# get v, the pooling coefficient
@@ -62,10 +67,13 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
# as between_location_var > 0 and per_location_observation_var > 0 here
# v will be in the range 0 to 1
v = 1 / (1 + per_location_observation_var / (n_j * between_location_var))
-# this quantity can be improved using knowledge of the variances
-grand_mean = estimated_centers['grand_mean']
+v[n_j <= 1] = 0  # no information in size one items
+v[pd.isnull(v)] = 0
# build the pooled estimate
-pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * grand_mean
+pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * estimated_centers["grand_mean"]
estimated_centers["estimate"] = pooled_estimate
-estimated_centers['impact'] = pooled_estimate - grand_mean
+estimated_centers["impact"] = estimated_centers["estimate"]
+estimated_centers["impact"] = (
+    estimated_centers["impact"]
+    - np.sum(estimated_centers['size'] * estimated_centers['impact']) / np.sum(estimated_centers['size']))
return estimated_centers
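
As a usage sketch (assuming the rebuilt vtreat 1.3.1 wheel from this commit is installed; the import path and the location_id/observation column names follow the files shown above):

    import numpy as np
    import pandas as pd
    from vtreat.partial_pooling_estimator import (
        pooled_effect_estimate,
        standard_effect_estimate,
    )

    # the same grouped observations used in the tests
    d = pd.DataFrame({
        "location_id": ["a", "a", "a", "b", "b", "c"],
        "observation": [1, 2, 3, 4, 5, 6],
    })

    unpooled = standard_effect_estimate(d)
    pooled = pooled_effect_estimate(d)

    # after this commit, both results satisfy the size-weighted centering invariant
    for r in (unpooled, pooled):
        assert np.abs(np.sum(r["size"] * r["impact"])) < 1e-5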
