From b05cec30de1946a5027ab1364c282a4aa8d2c71e Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Mon, 19 Jun 2023 22:58:01 -0400 Subject: [PATCH 01/11] first pass at MLM --- astartes/samplers/interpolation/mlm.py | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 astartes/samplers/interpolation/mlm.py diff --git a/astartes/samplers/interpolation/mlm.py b/astartes/samplers/interpolation/mlm.py new file mode 100644 index 0000000..273d47e --- /dev/null +++ b/astartes/samplers/interpolation/mlm.py @@ -0,0 +1,46 @@ +from typing import overload + +import numpy as np + +from astartes.samplers.interpolation import KennardStone + + +class MLM(KennardStone): + # could be convenient to know size of train and test during init... + @overload + def get_sample_idxs(self, n_samples): + """Overload the KennardStone method to permute 10% of samples from train + + Args: + n_samples (int): Number of samples to retrieve. + + Returns: + np.array: The selected indices + """ + if self._current_sample_idx == 0: # permute indexes on the first call + train_idxs = self._samples_idxs[ + self._current_sample_idx : self._current_sample_idx + n_samples + ] + other_idxs = self._samples_idxs[self._current_sample_idx + n_samples : -1] + + # set RNG + rng = np.random.default_rng(seed=self.get_config("random_state")) + n_to_permute = np.floor(0.1 * len(train_idxs)) + train_permute_idxs = rng.choice(train_idxs, n_to_permute) + remaining_train_idxs = [ + i for i in train_idxs if i not in train_permute_idxs + ] + other_permute_idxs = rng.choice(other_idxs, n_to_permute) + remaining_other_idxs = [ + i for i in other_idxs if i not in other_permute_idxs + ] + # reassamble the indexes + self._samples_idxs = np.hstack( + ( + remaining_train_idxs, + other_permute_idxs, + remaining_other_idxs, + train_permute_idxs, + ) + ) + return super().get_sample_idxs(n_samples) From c892b21fb16337309c9607bcb08d07e07b65b8a5 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 17:40:30 -0400 Subject: [PATCH 02/11] bump minor version for minor release --- astartes/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/astartes/__init__.py b/astartes/__init__.py index 1512803..8edfdf6 100644 --- a/astartes/__init__.py +++ b/astartes/__init__.py @@ -1,7 +1,7 @@ # convenience import to enable 'from astartes import train_test_split' from .main import train_test_split, train_val_test_split -__version__ = "1.2.2" +__version__ = "1.3.0" # DO NOT do this: # from .molecules import train_test_split_molecules From 522acd7d5a00adbf806246b519ce2691c9be4d0d Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 17:40:38 -0400 Subject: [PATCH 03/11] rename code of conduct --- CODE_OF_CONDUCT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 67a4078..d57cadf 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,4 +1,4 @@ -# RMG Code of Conduct +# `astartes` Code of Conduct ## Our Pledge From e65a97ad34cde4385c031db7d05d5f9620278eb7 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 17:49:27 -0400 Subject: [PATCH 04/11] print the validation sizes for validation set, not the test typo --- astartes/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/astartes/main.py b/astartes/main.py index 56121f4..0deb0b0 100644 --- a/astartes/main.py +++ b/astartes/main.py @@ -313,8 +313,8 @@ def _check_actual_split( ) if actual_val_size != requested_val_size: msg += "Requested validation size of {:.2f}, got {:.2f}. ".format( - requested_test_size, - actual_test_size, + requested_val_size, + actual_val_size, ) if actual_test_size != requested_test_size: msg += "Requested test size of {:.2f}, got {:.2f}. ".format( From 2a31d591021fe4ad3609212d7541d4099f02b450 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 17:51:05 -0400 Subject: [PATCH 05/11] fix broken badge in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a17bca..df24f28 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PyPI - License - Test Status + Test Status DOI From 9f65cafeefb535aca2bb202d53279679cba2f895 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 17:58:55 -0400 Subject: [PATCH 06/11] fix formatting and version location in contributing guidelines --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 898baa1..538e590 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,8 +16,8 @@ To contribute to the `astartes` source code, start by forking and then cloning t ### Version Checking -`astartes` uses `pyproject.toml` to specify all metadata, but the version is also specified in `astartes/__init__.py` (via `__version__`) for backwards compatibility with Python 3.7. -To check which version of `astartes` you have installed, you can run `python -c "import astartes; print(astartes.__version__)"` on Python 3.7 or `python -c "from importlib.metadata import version; version('astartes')" on Python 3.8 or newer. +`astartes` uses `pyproject.toml` to specify all metadata except the version, which is specified in `astartes/__init__.py` (via `__version__`) for backwards compatibility with Python 3.7. +To check which version of `astartes` you have installed, you can run `python -c "import astartes; print(astartes.__version__)"` on Python 3.7 or `python -c "from importlib.metadata import version; version('astartes')"` on Python 3.8 or newer. ### Testing All of the tests in `astartes` are written using the built-in python `unittest` module (to allow running without `pytest`) but we _highly_ recommend using `pytest`. From 011216d887dcdecadc935c620d1f977a4b9984c0 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 18:02:14 -0400 Subject: [PATCH 07/11] remove duplicated kwargs --- test/functional/test_molecules.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/functional/test_molecules.py b/test/functional/test_molecules.py index 2942067..b84decb 100644 --- a/test/functional/test_molecules.py +++ b/test/functional/test_molecules.py @@ -126,8 +126,6 @@ def test_fprint_hopts(self): sampler="random", fingerprint="topological_fingerprint", fprints_hopts={ - "minPath": 2, - "maxPath": 5, "fpSize": 200, "bitsPerHash": 4, "useHs": 1, @@ -163,8 +161,6 @@ def test_maximum_call(self): train_size=0.2, fingerprint="topological_fingerprint", fprints_hopts={ - "minPath": 2, - "maxPath": 5, "fpSize": 200, "bitsPerHash": 4, "useHs": 1, From 80213251f9f4a8fdfefdfc9ed5c2ed4dc887f023 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 18:03:15 -0400 Subject: [PATCH 08/11] remove disallowed argument from readme --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index df24f28..624855e 100644 --- a/README.md +++ b/README.md @@ -260,8 +260,6 @@ train_test_split_molecules( train_size=0.8, fingerprint="daylight_fingerprint", fprints_hopts={ - "minPath": 2, - "maxPath": 5, "fpSize": 200, "bitsPerHash": 4, "useHs": 1, From 28e8d9cd99f431153333262575891f447fd933ea Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 18:08:05 -0400 Subject: [PATCH 09/11] remove deprecated rdkit args --- README.md | 6 ++---- test/functional/test_molecules.py | 12 ++++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 624855e..f77b303 100644 --- a/README.md +++ b/README.md @@ -261,10 +261,8 @@ train_test_split_molecules( fingerprint="daylight_fingerprint", fprints_hopts={ "fpSize": 200, - "bitsPerHash": 4, - "useHs": 1, - "tgtDensity": 0.4, - "minSize": 64, + "numBitsPerFeature": 4, + "useHs": True, }, sampler="random", random_state=42, diff --git a/test/functional/test_molecules.py b/test/functional/test_molecules.py index b84decb..a737926 100644 --- a/test/functional/test_molecules.py +++ b/test/functional/test_molecules.py @@ -127,10 +127,8 @@ def test_fprint_hopts(self): fingerprint="topological_fingerprint", fprints_hopts={ "fpSize": 200, - "bitsPerHash": 4, - "useHs": 1, - "tgtDensity": 0.4, - "minSize": 64, + "numBitsPerFeature": 4, + "useHs": True, }, ) @@ -162,10 +160,8 @@ def test_maximum_call(self): fingerprint="topological_fingerprint", fprints_hopts={ "fpSize": 200, - "bitsPerHash": 4, - "useHs": 1, - "tgtDensity": 0.4, - "minSize": 64, + "numBitsPerFeature": 2, + "useHs": True, }, sampler="random", random_state=42, From b794dffc2f1bbe39d091f856404434cb3ea66354 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Tue, 13 Aug 2024 18:24:22 -0400 Subject: [PATCH 10/11] move the MLM sampler into a demo notebook --- astartes/samplers/interpolation/mlm.py | 46 ---------- .../mlm_sampler.ipynb | 90 +++++++++++++++++++ 2 files changed, 90 insertions(+), 46 deletions(-) delete mode 100644 astartes/samplers/interpolation/mlm.py create mode 100644 examples/morais_lima_martin_sampling/mlm_sampler.ipynb diff --git a/astartes/samplers/interpolation/mlm.py b/astartes/samplers/interpolation/mlm.py deleted file mode 100644 index 273d47e..0000000 --- a/astartes/samplers/interpolation/mlm.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import overload - -import numpy as np - -from astartes.samplers.interpolation import KennardStone - - -class MLM(KennardStone): - # could be convenient to know size of train and test during init... - @overload - def get_sample_idxs(self, n_samples): - """Overload the KennardStone method to permute 10% of samples from train - - Args: - n_samples (int): Number of samples to retrieve. - - Returns: - np.array: The selected indices - """ - if self._current_sample_idx == 0: # permute indexes on the first call - train_idxs = self._samples_idxs[ - self._current_sample_idx : self._current_sample_idx + n_samples - ] - other_idxs = self._samples_idxs[self._current_sample_idx + n_samples : -1] - - # set RNG - rng = np.random.default_rng(seed=self.get_config("random_state")) - n_to_permute = np.floor(0.1 * len(train_idxs)) - train_permute_idxs = rng.choice(train_idxs, n_to_permute) - remaining_train_idxs = [ - i for i in train_idxs if i not in train_permute_idxs - ] - other_permute_idxs = rng.choice(other_idxs, n_to_permute) - remaining_other_idxs = [ - i for i in other_idxs if i not in other_permute_idxs - ] - # reassamble the indexes - self._samples_idxs = np.hstack( - ( - remaining_train_idxs, - other_permute_idxs, - remaining_other_idxs, - train_permute_idxs, - ) - ) - return super().get_sample_idxs(n_samples) diff --git a/examples/morais_lima_martin_sampling/mlm_sampler.ipynb b/examples/morais_lima_martin_sampling/mlm_sampler.ipynb new file mode 100644 index 0000000..c8f0f87 --- /dev/null +++ b/examples/morais_lima_martin_sampling/mlm_sampler.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implementing the Morais-Lima-Martin (MLM) Sampler\n", + "The notebook shows a brief demonstration of using the built in utilities in `astartes` to implement the Morais-Lima-Martin sampler, which you can read about [here](https://academic.oup.com/bioinformatics/article/35/24/5257/5497250)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`astartes` has a very fast implementation of the Kennard-Stone algorithm, on which the MLM sampler is based, available in its `utils`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from astartes.utils.fast_kennard_stone import fast_kennard_stone" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The MLM sampler can then be implemented as shown below.\n", + "The `mlm_sampler` functions takes a 2D array and splits it first using the Kennard-Stone algorithm, then permutes the indices according to the MLM algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.spatial.distance import pdist, squareform\n", + "import numpy as np\n", + "\n", + "from astartes.samplers.interpolation import KennardStone\n", + "\n", + "\n", + "def mlm_split(X: np.ndarray, *, train_size: float = 0.8, val_size: float = 0.1, test_size: float = 0.1, random_state: int = 42):\n", + " # calculate the distance matrix\n", + " ks_indexes = fast_kennard_stone(squareform(pdist(X, \"euclidean\")))\n", + " pivot = int(len(ks_indexes) * train_size)\n", + " train_idxs = ks_indexes[0:pivot]\n", + " other_idxs = ks_indexes[pivot:]\n", + "\n", + " # set RNG\n", + " rng = np.random.default_rng(seed=random_state)\n", + " \n", + " # choose 10% of train to switch with 10% of val/test\n", + " n_to_permute = np.floor(0.1 * len(train_idxs))\n", + " train_permute_idxs = rng.choice(train_idxs, n_to_permute)\n", + " remaining_train_idxs = filter(lambda i: i not in train_permute_idxs, train_idxs)\n", + " other_permute_idxs = rng.choice(other_idxs, n_to_permute)\n", + " remaining_other_idxs = filter(lambda i: i not in other_permute_idxs, other_idxs)\n", + "\n", + " # reassemble the new lists of indexes\n", + " new_train_idxs = np.concatenate(remaining_train_idxs, other_permute_idxs)\n", + " new_other_idxs = np.concatenate(train_permute_idxs, remaining_other_idxs)\n", + " n_val = int(len(new_other_idxs) * (val_size / (val_size + test_size)))\n", + " val_indexes = new_other_idxs[0:n_val]\n", + " test_indexes = new_other_idxs[n_val:]\n", + " \n", + " # return the split up array\n", + " return X[train_idxs], X[val_indexes], X[test_indexes]\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fprop", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 4fcbcf2038d03411c92c8d141fc5630c80891eba Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Wed, 14 Aug 2024 09:07:29 -0400 Subject: [PATCH 11/11] add a section of packages using `astartes` --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index f77b303..5ac345f 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,11 @@ Click the badges in the table below to be taken to a live, interactive demo of ` To execute these notebooks locally, clone this repository (i.e. `git clone https://github.com/JacksonBurns/astartes.git`), navigate to the `astartes` directory, run `pip install .[demos]`, then open and run the notebooks in your preferred editor. You do _not_ need to execute the cells prefixed with `%%capture` - they are only present for compatibility with Google Colab. +#### Packages Using `astartes` + - [Chemprop](https://github.com/chemprop/chemprop), a machine learning library for chemical property prediction, uses `astartes` in the backend for splitting molecular structures. + - [`fastprop`](https://github.com/JacksonBurns/fastprop), a descriptor-based property prediction library, uses `astartes`. + - [Google Scholar of articles citing the JOSS paper for `astartes`](https://scholar.google.com/scholar?cites=4693802000464819413&as_sdt=40000005&sciodt=0,22&hl=en) + ### Withhold Testing Data with `train_val_test_split` For rigorous ML research, it is critical to withhold some data during training to use a `test` set. The model should _never_ see this data during training (unlike the validation set) so that we can get an accurate measurement of its performance.