Skip to content

Commit

Permalink
v1.1.1 patches (#140)
Browse files Browse the repository at this point in the history
Resolve #139 - implements the changes described in the latest comment
Resolve #137 - adds `tabulate` to the installation requirements
  • Loading branch information
kspieks authored Jul 2, 2023
2 parents 4e1b143 + 9de8085 commit 4d1f0c7
Show file tree
Hide file tree
Showing 7 changed files with 208 additions and 8 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/reproduce_paper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,10 @@ on:
- cron: "0 8 * * 1"
push:
branches: [main]
pull_request:
branches: [main]
workflow_dispatch:

concurrency:
group: actions-id-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: actions-id-${{ github.workflow }}
cancel-in-progress: true

jobs:
Expand All @@ -25,6 +23,7 @@ jobs:
name: Reproduce Paper Data Splits
steps:
- uses: actions/checkout@v3
- run: git checkout v1.0.0
- uses: mamba-org/setup-micromamba@main
with:
environment-name: temp
Expand All @@ -38,6 +37,7 @@ jobs:
- name: Install Dependencies
run: |
python -m pip install -e .[molecules]
python -m pip install scikit-learn==1.2.2
python -m pip install notebook
- name: Backup Reference Splits
run: |
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,15 @@ To that end, the default behavior of `astartes` is to use `42` as the random see
Running `astartes` with the default settings will always produce the exact same results.
We have verified this behavior on Debian Ubuntu, Windows, and Intel Macs from Python versions 3.7 through 3.11 (with appropriate dependencies for each version).

#### Known Reproducibility Limitations

> **Note**
> We are limited in our ability to test on M1 Macs, but from our limited manual testing we achieve perfect reproducbility in all cases _except occasionally_ with `KMeans` on Apple silicon.
It has produced _slightly_ different results between platforms regardless of `random_state`, with up to two clusters being assigned differently resulting in data splits which are >99% identical.
`astartes` is still consistent between runs on the same platform in all cases, and other samplers are not impacted by this apparent bug.

- `sklearn` v1.3.0 introduced backwards-incompatible changes in the `KMeans` sampler that changed how the random initialization affects the results, even given the same random seed. Different version of `sklearn` will affect the performance of `astartes` and we recommend including the exact version of `scikit-learn` and `astartes` used, when applicable.

## Evaluate the Impact of Splitting Algorithms
The `generate_regression_results_dict` function allows users to quickly evaluate the impact of different splitting techniques on any model supported by `sklearn`. All results are stored in a dictionary format and can be displayed in a neatly formatted table using the optional `print_results` argument.

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "astartes"
version = "1.1.0"
version = "1.1.1"
authors = [
{ name = "Jackson Burns", email = "[email protected]" },
{ name = "Himaghna Bhattacharjee", email = "[email protected]" },
Expand All @@ -21,7 +21,7 @@ classifiers = [
]
urls = { Homepage = "https://github.com/JacksonBurns/astartes" }
requires-python = ">=3.7"
dependencies = ["scikit_learn", "numpy", "scipy", "pandas"]
dependencies = ["scikit_learn", "tabulate", "numpy", "scipy", "pandas"]

[project.optional-dependencies]
molecules = ["aimsim"]
Expand Down
Binary file not shown.
105 changes: 103 additions & 2 deletions test/regression/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from datetime import datetime

import numpy as np
import pkg_resources

from astartes import train_val_test_split
from astartes.samplers import (
Expand All @@ -12,6 +13,10 @@
IMPLEMENTED_INTERPOLATION_SAMPLERS,
)

SKLEARN_GEQ_13 = ( # get the sklearn version
int(pkg_resources.get_distribution("scikit-learn").version.split(".")[1]) >= 3
)


class Test_regression(unittest.TestCase):
"""
Expand All @@ -34,8 +39,20 @@ def setUpClass(self):
self.reference_splits = {
name: os.path.join(self.reference_splits_dir, name + "_reference.pkl")
for name in ALL_SAMPLERS
if name not in ("scaffold",)
if name
not in (
"scaffold",
"kmeans",
)
}
self.reference_splits["kmeans-v1.3"] = os.path.join(
self.reference_splits_dir,
"kmeans-v1.3_reference.pkl",
)
self.reference_splits["kmeans-v1.2.2"] = os.path.join(
self.reference_splits_dir,
"kmeans-v1.2.2_reference.pkl",
)

def test_timebased_regression(self):
"""Regression test TimeBased, which has labels to check as well."""
Expand Down Expand Up @@ -94,7 +111,7 @@ def test_interpolation_regression(self):
def test_extrapolation_regression(self):
"""Regression testing of extrapolative methods relative to static results."""
for sampler_name in IMPLEMENTED_EXTRAPOLATION_SAMPLERS:
if sampler_name in ("scaffold", "time_based"):
if sampler_name in ("scaffold", "time_based", "kmeans"):
continue
(
X_train,
Expand Down Expand Up @@ -133,6 +150,90 @@ def test_extrapolation_regression(self):
"Sampler {:s} failed regression testing.".format(sampler_name),
)

@unittest.skipUnless(
SKLEARN_GEQ_13,
"sklearn version less than 1.3 detected",
)
def test_kmeans_regression_sklearn_v13(self):
"""Regression testing of KMeans in sklearn v1.3 or newer."""
(
X_train,
X_val,
X_test,
y_train,
y_val,
y_test,
clusters_train,
clusters_val,
clusters_test,
) = train_val_test_split(
self.X,
self.y,
sampler="kmeans",
random_state=42,
)
all_output = [
X_train,
X_val,
X_test,
y_train,
y_val,
y_test,
clusters_train,
clusters_val,
clusters_test,
]
with open(self.reference_splits["kmeans-v1.3"], "rb") as f:
reference_output = pkl.load(f)
for i, j in zip(all_output, reference_output):
np.testing.assert_array_equal(
i,
j,
"Sampler kmeans failed regression testing.",
)

@unittest.skipIf(
SKLEARN_GEQ_13,
"sklearn version 1.3 or newer detected",
)
def test_kmeans_regression_sklearn_v12(self):
"""Regression testing of KMeans in sklearn v1.2 or earlier."""
(
X_train,
X_val,
X_test,
y_train,
y_val,
y_test,
clusters_train,
clusters_val,
clusters_test,
) = train_val_test_split(
self.X,
self.y,
sampler="kmeans",
random_state=42,
)
all_output = [
X_train,
X_val,
X_test,
y_train,
y_val,
y_test,
clusters_train,
clusters_val,
clusters_test,
]
with open(self.reference_splits["kmeans-v1.2.2"], "rb") as f:
reference_output = pkl.load(f)
for i, j in zip(all_output, reference_output):
np.testing.assert_array_equal(
i,
j,
"Sampler kmeans failed regression testing.",
)


if __name__ == "__main__":
unittest.main()
97 changes: 96 additions & 1 deletion test/unit/samplers/extrapolative/test_kmeans.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import unittest

import numpy as np
import pkg_resources

from astartes import train_test_split
from astartes.samplers import KMeans
from astartes.utils.warnings import ImperfectSplittingWarning

SKLEARN_GEQ_13 = ( # get the sklearn version
int(pkg_resources.get_distribution("scikit-learn").version.split(".")[1]) >= 3
)


class Test_kmeans(unittest.TestCase):
"""
Expand Down Expand Up @@ -35,7 +40,11 @@ def setUpClass(self):
]
)

def test_kmeans_sampling(self):
@unittest.skipIf(
SKLEARN_GEQ_13,
"sklearn version 1.3 or newer detected",
)
def test_kmeans_sampling_v12(self):
"""Use kmeans in the train_test_split and verify results."""
with self.assertWarns(ImperfectSplittingWarning):
(
Expand Down Expand Up @@ -117,6 +126,92 @@ def test_kmeans_sampling(self):
"Test clusters incorrect.",
)

@unittest.skipUnless(
SKLEARN_GEQ_13,
"sklearn version less than 1.3 detected",
)
def test_kmeans_sampling_v13(self):
"""Use kmeans in the train_test_split and verify results."""
with self.assertWarns(ImperfectSplittingWarning):
(
X_train,
X_test,
y_train,
y_test,
labels_train,
labels_test,
clusters_train,
clusters_test,
) = train_test_split(
self.X,
self.y,
labels=self.labels,
test_size=0.75,
train_size=0.25,
sampler="kmeans",
random_state=42,
hopts={
"n_clusters": 2,
},
)
# test that the known arrays equal the result from above
self.assertIsNone(
np.testing.assert_array_equal(
X_train,
np.array([[0, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 1, 0, 0, 0]]),
),
"Train X incorrect.",
)
self.assertIsNone(
np.testing.assert_array_equal(
X_test,
np.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 0]]),
),
"Test X incorrect.",
)
self.assertIsNone(
np.testing.assert_array_equal(
y_train,
np.array([1, 2, 3]),
),
"Train y incorrect.",
)
self.assertIsNone(
np.testing.assert_array_equal(
y_test,
np.array([4, 5]),
),
"Test y incorrect.",
)
self.assertIsNone(
np.testing.assert_array_equal(
labels_train,
np.array(["one", "two", "three"]),
),
"Train labels incorrect.",
)
self.assertIsNone(
np.testing.assert_array_equal(
labels_test,
np.array(["four", "five"]),
),
"Test labels incorrect.",
)
self.assertIsNone(
np.testing.assert_array_equal(
clusters_train,
np.array([0, 0, 0]),
),
"Train clusters incorrect.",
)
self.assertIsNone(
np.testing.assert_array_equal(
clusters_test,
np.array([1, 1]),
),
"Test clusters incorrect.",
)

def test_kmeans(self):
"""Directly instantiate and test KMeans."""
kmeans_instance = KMeans(
Expand Down

0 comments on commit 4d1f0c7

Please sign in to comment.