tune pooling est and rebuild
JohnMount committed Jun 13, 2024
1 parent d4c5bcd commit bd61ab0
Showing 9 changed files with 427 additions and 393 deletions.
512 changes: 256 additions & 256 deletions Examples/KDD2009Example/KDD2009Example.ipynb

Large diffs are not rendered by default.

62 changes: 31 additions & 31 deletions Examples/Pooling/PartialPoolingExample.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions coverage.txt
@@ -70,7 +70,7 @@ pkg/vtreat/__init__.py 6 0 100%
pkg/vtreat/cross_plan.py 50 1 98%
pkg/vtreat/da_adapter.py 109 1 99%
pkg/vtreat/effect_scaler.py 59 4 93%
-pkg/vtreat/partial_pooling_estimator.py 32 0 100%
+pkg/vtreat/partial_pooling_estimator.py 35 0 100%
pkg/vtreat/stats_utils.py 132 0 100%
pkg/vtreat/test_util.py 84 18 79%
pkg/vtreat/transform.py 14 0 100%
@@ -79,6 +79,6 @@ pkg/vtreat/vtreat_api.py 285 34 88%
pkg/vtreat/vtreat_db_adapter.py 1 0 100%
pkg/vtreat/vtreat_impl.py 711 61 91%
-------------------------------------------------------------
-TOTAL 1632 127 92%
+TOTAL 1635 127 92%

-================= 47 passed, 81 warnings in 111.33s (0:01:51) ==================
+================= 47 passed, 81 warnings in 104.41s (0:01:44) ==================
190 changes: 103 additions & 87 deletions docs/vtreat/partial_pooling_estimator.html

Large diffs are not rendered by default.

22 changes: 15 additions & 7 deletions pkg/build/lib/vtreat/partial_pooling_estimator.py
@@ -24,7 +24,10 @@ def standard_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
means.sort_values(["location_id"], inplace=True, ignore_index=True)
means['estimate'] = means['mean']
means["grand_mean"] = np.mean(observations["observation"])
means["impact"] = means["estimate"] - means["grand_mean"]
means["impact"] = means["estimate"]
means["impact"] = (
means["impact"]
- np.sum(means['size'] * means['impact']) / np.sum(means['size']))
means.sort_values(["location_id"], inplace=True, ignore_index=True)
return means

@@ -50,9 +53,11 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
# get counts per group
n_j = estimated_centers["size"]
per_location_observation_var = estimated_centers['var'].copy()
-# inflate a bit
per_location_observation_var[pd.isnull(per_location_observation_var)] = 0
-per_location_observation_var = (n_j * per_location_observation_var + np.var(observations['observation'])) / (n_j + 1)
+# inflate per-loc a bit
+per_location_observation_var = (
+    (n_j * per_location_observation_var + np.var(observations['observation']))
+    / (n_j + 1))
# get the observed variance between locations
between_location_var = np.var(estimated_centers["estimate"], ddof=1)
# get v, the pooling coefficient
@@ -62,10 +67,13 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
# as between_location_var > 0 and per_location_observation_var > 0 here
# v will be in the range 0 to 1
v = 1 / (1 + per_location_observation_var / (n_j * between_location_var))
-# this quantity can be improved using knowledge of the variances
-grand_mean = estimated_centers['grand_mean']
+v[n_j <= 1] = 0  # no information in size one items
+v[pd.isnull(v)] = 0
# build the pooled estimate
-pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * grand_mean
+pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * estimated_centers["grand_mean"]
estimated_centers["estimate"] = pooled_estimate
-estimated_centers['impact'] = pooled_estimate - grand_mean
+estimated_centers["impact"] = estimated_centers["estimate"]
+estimated_centers["impact"] = (
+    estimated_centers["impact"]
+    - np.sum(estimated_centers['size'] * estimated_centers['impact']) / np.sum(estimated_centers['size']))
return estimated_centers
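
The substantive change in both copies of partial_pooling_estimator.py is the centering of the impact column: instead of subtracting the grand mean, impacts are now centered so that their size-weighted sum is zero. A minimal standalone sketch of the new centering step, using hypothetical toy values rather than code from this commit:

    import numpy as np
    import pandas as pd

    # hypothetical per-location estimates and group sizes
    means = pd.DataFrame({
        "location_id": ["a", "b", "c"],
        "estimate": [2.0, 4.5, 6.0],
        "size": [3, 2, 1],
    })

    # new centering: subtract the size-weighted mean of the estimates,
    # so that sum(size * impact) == 0 by construction
    means["impact"] = (
        means["estimate"]
        - np.sum(means["size"] * means["estimate"]) / np.sum(means["size"]))

    assert np.abs(np.sum(means["size"] * means["impact"])) < 1e-8

For unpooled per-location means the size-weighted mean already equals the grand mean of the observations, so this centering only changes results once pooling has shrunk the estimates.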
Binary file modified pkg/dist/vtreat-1.3.1-py3-none-any.whl
Binary file not shown.
Binary file modified pkg/dist/vtreat-1.3.1.tar.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions pkg/tests/test_partial_pooling.py
@@ -10,6 +10,7 @@ def test_standard_effect_estimate():
'observation': [ 1, 2, 3, 4, 5, 6],
})
r = standard_effect_estimate(d)
+assert np.abs(np.sum(r['size'] * r['impact'])) < 1e-5
expect = pd.DataFrame({
'location_id': ['a', 'b', 'c'],
'mean': [2.0, 4.5, 6.0],
@@ -28,14 +29,15 @@ def test_pooled_effect_estimate():
'observation': [ 1, 2, 3, 4, 5, 6],
})
r = pooled_effect_estimate(d)
+assert np.abs(np.sum(r['size'] * r['impact'])) < 1e-5
expect = pd.DataFrame({
'location_id': ['a', 'b', 'c'],
'mean': [2.0, 4.5, 6.0],
'var': [1.0, 0.5, np.nan],
'size': [3, 2, 1],
-'estimate': [2.161608, 4.362170, 5.342105],
+'estimate': [2.161608, 4.362170, 3.500000],
'grand_mean': [3.5, 3.5, 3.5],
-'impact': [-1.338392, 0.862170, 1.842105],
+'impact': [-0.956586, 1.243976, 0.381806],
})
assert r.shape == expect.shape
assert np.all(r.columns == expect.columns)
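
The updated expectations follow from two changes in the estimator: the size-one location 'c' now has its pooling coefficient forced to v = 0, so its estimate collapses to the grand mean 3.5, and impacts are re-centered by the size-weighted mean of the pooled estimates. A quick arithmetic check against the expected values above:

    import numpy as np

    estimate = np.array([2.161608, 4.362170, 3.500000])  # pooled estimates; 'c' fully shrunk
    size = np.array([3, 2, 1])

    # size-weighted mean of the pooled estimates, about 3.118194
    weighted_mean = np.sum(size * estimate) / np.sum(size)

    impact = estimate - weighted_mean
    print(impact)  # about [-0.956586, 1.243976, 0.381806], matching the expected frame
    assert np.abs(np.sum(size * impact)) < 1e-5  # the invariant asserted in both tests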
22 changes: 15 additions & 7 deletions pkg/vtreat/partial_pooling_estimator.py
@@ -24,7 +24,10 @@ def standard_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
means.sort_values(["location_id"], inplace=True, ignore_index=True)
means['estimate'] = means['mean']
means["grand_mean"] = np.mean(observations["observation"])
means["impact"] = means["estimate"] - means["grand_mean"]
means["impact"] = means["estimate"]
means["impact"] = (
means["impact"]
- np.sum(means['size'] * means['impact']) / np.sum(means['size']))
means.sort_values(["location_id"], inplace=True, ignore_index=True)
return means

@@ -50,9 +53,11 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
# get counts per group
n_j = estimated_centers["size"]
per_location_observation_var = estimated_centers['var'].copy()
-# inflate a bit
per_location_observation_var[pd.isnull(per_location_observation_var)] = 0
-per_location_observation_var = (n_j * per_location_observation_var + np.var(observations['observation'])) / (n_j + 1)
+# inflate per-loc a bit
+per_location_observation_var = (
+    (n_j * per_location_observation_var + np.var(observations['observation']))
+    / (n_j + 1))
# get the observed variance between locations
between_location_var = np.var(estimated_centers["estimate"], ddof=1)
# get v, the pooling coefficient
@@ -62,10 +67,13 @@ def pooled_effect_estimate(observations: pd.DataFrame) -> pd.DataFrame:
# as between_location_var > 0 and per_location_observation_var > 0 here
# v will be in the range 0 to 1
v = 1 / (1 + per_location_observation_var / (n_j * between_location_var))
-# this quantity can be improved using knowledge of the variances
-grand_mean = estimated_centers['grand_mean']
+v[n_j <= 1] = 0  # no information in size one items
+v[pd.isnull(v)] = 0
# build the pooled estimate
-pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * grand_mean
+pooled_estimate = v * estimated_centers["estimate"] + (1 - v) * estimated_centers["grand_mean"]
estimated_centers["estimate"] = pooled_estimate
-estimated_centers['impact'] = pooled_estimate - grand_mean
+estimated_centers["impact"] = estimated_centers["estimate"]
+estimated_centers["impact"] = (
+    estimated_centers["impact"]
+    - np.sum(estimated_centers['size'] * estimated_centers['impact']) / np.sum(estimated_centers['size']))
return estimated_centers
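
As a usage sketch (assuming the rebuilt vtreat 1.3.1 wheel from this commit is installed; the import path and the location_id/observation column names follow the files shown above):

    import numpy as np
    import pandas as pd
    from vtreat.partial_pooling_estimator import (
        pooled_effect_estimate,
        standard_effect_estimate,
    )

    # the same grouped observations used in the tests
    d = pd.DataFrame({
        "location_id": ["a", "a", "a", "b", "b", "c"],
        "observation": [1, 2, 3, 4, 5, 6],
    })

    unpooled = standard_effect_estimate(d)
    pooled = pooled_effect_estimate(d)

    # after this commit, both results satisfy the size-weighted centering invariant
    for r in (unpooled, pooled):
        assert np.abs(np.sum(r["size"] * r["impact"])) < 1e-5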
