v1.1.1 patches (#140)

Resolve #139 - implements the changes described in the latest comment Resolve #137 - adds `tabulate` to the installation requirements
JacksonBurns · Jul 2, 2023 · 4d1f0c7 · 4d1f0c7
2 parents 4e1b143 + 9de8085
commit 4d1f0c7
Show file tree

Hide file tree

Showing 7 changed files with 208 additions and 8 deletions.
diff --git a/.github/workflows/reproduce_paper.yml b/.github/workflows/reproduce_paper.yml
@@ -8,12 +8,10 @@ on:
     - cron: "0 8 * * 1"
   push:
     branches: [main]
-  pull_request:
-    branches: [main]
   workflow_dispatch:
 
 concurrency:
-  group: actions-id-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: actions-id-${{ github.workflow }}
   cancel-in-progress: true
 
 jobs:
@@ -25,6 +23,7 @@ jobs:
     name: Reproduce Paper Data Splits
     steps:
       - uses: actions/checkout@v3
+      - run: git checkout v1.0.0
       - uses: mamba-org/setup-micromamba@main
         with:
           environment-name: temp
@@ -38,6 +37,7 @@ jobs:
       - name: Install Dependencies
         run: |
           python -m pip install -e .[molecules]
+          python -m pip install scikit-learn==1.2.2
           python -m pip install notebook
       - name: Backup Reference Splits
         run: |

diff --git a/README.md b/README.md
@@ -127,11 +127,15 @@ To that end, the default behavior of `astartes` is to use `42` as the random see
 Running `astartes` with the default settings will always produce the exact same results.
 We have verified this behavior on Debian Ubuntu, Windows, and Intel Macs from Python versions 3.7 through 3.11 (with appropriate dependencies for each version).
 
+#### Known Reproducibility Limitations
+
 > **Note**
 > We are limited in our ability to test on M1 Macs, but from our limited manual testing we achieve perfect reproducbility in all cases _except occasionally_ with `KMeans` on Apple silicon.
 It has produced _slightly_ different results between platforms regardless of `random_state`, with up to two clusters being assigned differently resulting in data splits which are >99% identical.
 `astartes` is still consistent between runs on the same platform in all cases, and other samplers are not impacted by this apparent bug.
 
+ - `sklearn` v1.3.0 introduced backwards-incompatible changes in the `KMeans` sampler that changed how the random initialization affects the results, even given the same random seed. Different version of `sklearn` will affect the performance of `astartes` and we recommend including the exact version of `scikit-learn` and `astartes` used, when applicable.
+
 ## Evaluate the Impact of Splitting Algorithms
 The `generate_regression_results_dict` function allows users to quickly evaluate the impact of different splitting techniques on any model supported by `sklearn`. All results are stored in a dictionary format and can be displayed in a neatly formatted table using the optional `print_results` argument.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "astartes"
-version = "1.1.0"
+version = "1.1.1"
 authors = [
     { name = "Jackson Burns", email = "[email protected]" },
     { name = "Himaghna Bhattacharjee", email = "[email protected]" },
@@ -21,7 +21,7 @@ classifiers = [
 ]
 urls = { Homepage = "https://github.com/JacksonBurns/astartes" }
 requires-python = ">=3.7"
-dependencies = ["scikit_learn", "numpy", "scipy", "pandas"]
+dependencies = ["scikit_learn", "tabulate", "numpy", "scipy", "pandas"]
 
 [project.optional-dependencies]
 molecules = ["aimsim"]

diff --git a/...ion/reference_splits/kmeans_reference.pkl → ...erence_splits/kmeans-v1.2.2_reference.pkl b/...ion/reference_splits/kmeans_reference.pkl → ...erence_splits/kmeans-v1.2.2_reference.pkl
diff --git a/test/regression/reference_splits/kmeans-v1.3_reference.pkl b/test/regression/reference_splits/kmeans-v1.3_reference.pkl
diff --git a/test/regression/test_regression.py b/test/regression/test_regression.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 
 import numpy as np
+import pkg_resources
 
 from astartes import train_val_test_split
 from astartes.samplers import (
@@ -12,6 +13,10 @@
     IMPLEMENTED_INTERPOLATION_SAMPLERS,
 )
 
+SKLEARN_GEQ_13 = (  # get the sklearn version
+    int(pkg_resources.get_distribution("scikit-learn").version.split(".")[1]) >= 3
+)
+
 
 class Test_regression(unittest.TestCase):
     """
@@ -34,8 +39,20 @@ def setUpClass(self):
         self.reference_splits = {
             name: os.path.join(self.reference_splits_dir, name + "_reference.pkl")
             for name in ALL_SAMPLERS
-            if name not in ("scaffold",)
+            if name
+            not in (
+                "scaffold",
+                "kmeans",
+            )
         }
+        self.reference_splits["kmeans-v1.3"] = os.path.join(
+            self.reference_splits_dir,
+            "kmeans-v1.3_reference.pkl",
+        )
+        self.reference_splits["kmeans-v1.2.2"] = os.path.join(
+            self.reference_splits_dir,
+            "kmeans-v1.2.2_reference.pkl",
+        )
 
     def test_timebased_regression(self):
         """Regression test TimeBased, which has labels to check as well."""
@@ -94,7 +111,7 @@ def test_interpolation_regression(self):
     def test_extrapolation_regression(self):
         """Regression testing of extrapolative methods relative to static results."""
         for sampler_name in IMPLEMENTED_EXTRAPOLATION_SAMPLERS:
-            if sampler_name in ("scaffold", "time_based"):
+            if sampler_name in ("scaffold", "time_based", "kmeans"):
                 continue
             (
                 X_train,
@@ -133,6 +150,90 @@ def test_extrapolation_regression(self):
                     "Sampler {:s} failed regression testing.".format(sampler_name),
                 )
 
+    @unittest.skipUnless(
+        SKLEARN_GEQ_13,
+        "sklearn version less than 1.3 detected",
+    )
+    def test_kmeans_regression_sklearn_v13(self):
+        """Regression testing of KMeans in sklearn v1.3 or newer."""
+        (
+            X_train,
+            X_val,
+            X_test,
+            y_train,
+            y_val,
+            y_test,
+            clusters_train,
+            clusters_val,
+            clusters_test,
+        ) = train_val_test_split(
+            self.X,
+            self.y,
+            sampler="kmeans",
+            random_state=42,
+        )
+        all_output = [
+            X_train,
+            X_val,
+            X_test,
+            y_train,
+            y_val,
+            y_test,
+            clusters_train,
+            clusters_val,
+            clusters_test,
+        ]
+        with open(self.reference_splits["kmeans-v1.3"], "rb") as f:
+            reference_output = pkl.load(f)
+        for i, j in zip(all_output, reference_output):
+            np.testing.assert_array_equal(
+                i,
+                j,
+                "Sampler kmeans failed regression testing.",
+            )
+
+    @unittest.skipIf(
+        SKLEARN_GEQ_13,
+        "sklearn version 1.3 or newer detected",
+    )
+    def test_kmeans_regression_sklearn_v12(self):
+        """Regression testing of KMeans in sklearn v1.2 or earlier."""
+        (
+            X_train,
+            X_val,
+            X_test,
+            y_train,
+            y_val,
+            y_test,
+            clusters_train,
+            clusters_val,
+            clusters_test,
+        ) = train_val_test_split(
+            self.X,
+            self.y,
+            sampler="kmeans",
+            random_state=42,
+        )
+        all_output = [
+            X_train,
+            X_val,
+            X_test,
+            y_train,
+            y_val,
+            y_test,
+            clusters_train,
+            clusters_val,
+            clusters_test,
+        ]
+        with open(self.reference_splits["kmeans-v1.2.2"], "rb") as f:
+            reference_output = pkl.load(f)
+        for i, j in zip(all_output, reference_output):
+            np.testing.assert_array_equal(
+                i,
+                j,
+                "Sampler kmeans failed regression testing.",
+            )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/unit/samplers/extrapolative/test_kmeans.py b/test/unit/samplers/extrapolative/test_kmeans.py
@@ -1,11 +1,16 @@
 import unittest
 
 import numpy as np
+import pkg_resources
 
 from astartes import train_test_split
 from astartes.samplers import KMeans
 from astartes.utils.warnings import ImperfectSplittingWarning
 
+SKLEARN_GEQ_13 = (  # get the sklearn version
+    int(pkg_resources.get_distribution("scikit-learn").version.split(".")[1]) >= 3
+)
+
 
 class Test_kmeans(unittest.TestCase):
     """
@@ -35,7 +40,11 @@ def setUpClass(self):
             ]
         )
 
-    def test_kmeans_sampling(self):
+    @unittest.skipIf(
+        SKLEARN_GEQ_13,
+        "sklearn version 1.3 or newer detected",
+    )
+    def test_kmeans_sampling_v12(self):
         """Use kmeans in the train_test_split and verify results."""
         with self.assertWarns(ImperfectSplittingWarning):
             (
@@ -117,6 +126,92 @@ def test_kmeans_sampling(self):
             "Test clusters incorrect.",
         )
 
+    @unittest.skipUnless(
+        SKLEARN_GEQ_13,
+        "sklearn version less than 1.3 detected",
+    )
+    def test_kmeans_sampling_v13(self):
+        """Use kmeans in the train_test_split and verify results."""
+        with self.assertWarns(ImperfectSplittingWarning):
+            (
+                X_train,
+                X_test,
+                y_train,
+                y_test,
+                labels_train,
+                labels_test,
+                clusters_train,
+                clusters_test,
+            ) = train_test_split(
+                self.X,
+                self.y,
+                labels=self.labels,
+                test_size=0.75,
+                train_size=0.25,
+                sampler="kmeans",
+                random_state=42,
+                hopts={
+                    "n_clusters": 2,
+                },
+            )
+        # test that the known arrays equal the result from above
+        self.assertIsNone(
+            np.testing.assert_array_equal(
+                X_train,
+                np.array([[0, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 1, 0, 0, 0]]),
+            ),
+            "Train X incorrect.",
+        )
+        self.assertIsNone(
+            np.testing.assert_array_equal(
+                X_test,
+                np.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 0]]),
+            ),
+            "Test X incorrect.",
+        )
+        self.assertIsNone(
+            np.testing.assert_array_equal(
+                y_train,
+                np.array([1, 2, 3]),
+            ),
+            "Train y incorrect.",
+        )
+        self.assertIsNone(
+            np.testing.assert_array_equal(
+                y_test,
+                np.array([4, 5]),
+            ),
+            "Test y incorrect.",
+        )
+        self.assertIsNone(
+            np.testing.assert_array_equal(
+                labels_train,
+                np.array(["one", "two", "three"]),
+            ),
+            "Train labels incorrect.",
+        )
+        self.assertIsNone(
+            np.testing.assert_array_equal(
+                labels_test,
+                np.array(["four", "five"]),
+            ),
+            "Test labels incorrect.",
+        )
+        self.assertIsNone(
+            np.testing.assert_array_equal(
+                clusters_train,
+                np.array([0, 0, 0]),
+            ),
+            "Train clusters incorrect.",
+        )
+        self.assertIsNone(
+            np.testing.assert_array_equal(
+                clusters_test,
+                np.array([1, 1]),
+            ),
+            "Test clusters incorrect.",
+        )
+
     def test_kmeans(self):
         """Directly instantiate and test KMeans."""
         kmeans_instance = KMeans(