diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 67a4078..d57cadf 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -1,4 +1,4 @@
-# RMG Code of Conduct
+# `astartes` Code of Conduct
 
 ## Our Pledge
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 898baa1..538e590 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -16,8 +16,8 @@ To contribute to the `astartes` source code, start by forking and then cloning t
 ### Version Checking
 
-`astartes` uses `pyproject.toml` to specify all metadata, but the version is also specified in `astartes/__init__.py` (via `__version__`) for backwards compatibility with Python 3.7.
-To check which version of `astartes` you have installed, you can run `python -c "import astartes; print(astartes.__version__)"` on Python 3.7 or `python -c "from importlib.metadata import version; version('astartes')" on Python 3.8 or newer.
+`astartes` uses `pyproject.toml` to specify all metadata except the version, which is specified in `astartes/__init__.py` (via `__version__`) for backwards compatibility with Python 3.7.
+To check which version of `astartes` you have installed, you can run `python -c "import astartes; print(astartes.__version__)"` on Python 3.7 or `python -c "from importlib.metadata import version; version('astartes')"` on Python 3.8 or newer.
 
 ### Testing
 
 All of the tests in `astartes` are written using the built-in python `unittest` module (to allow running without `pytest`) but we _highly_ recommend using `pytest`.
diff --git a/README.md b/README.md
index 5a17bca..5ac345f 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
     PyPI - License
-    Test Status
+    Test Status
     DOI
@@ -121,6 +121,11 @@ Click the badges in the table below to be taken to a live, interactive demo of `
 To execute these notebooks locally, clone this repository (i.e. `git clone https://github.com/JacksonBurns/astartes.git`), navigate to the `astartes` directory, run `pip install .[demos]`, then open and run the notebooks in your preferred editor. You do _not_ need to execute the cells prefixed with `%%capture` - they are only present for compatibility with Google Colab.
 
+#### Packages Using `astartes`
+ - [Chemprop](https://github.com/chemprop/chemprop), a machine learning library for chemical property prediction, uses `astartes` in the backend for splitting molecular structures.
+ - [`fastprop`](https://github.com/JacksonBurns/fastprop), a descriptor-based property prediction library, uses `astartes`.
+ - [Google Scholar of articles citing the JOSS paper for `astartes`](https://scholar.google.com/scholar?cites=4693802000464819413&as_sdt=40000005&sciodt=0,22&hl=en)
+
 ### Withhold Testing Data with `train_val_test_split`
 
 For rigorous ML research, it is critical to withhold some data during training to use a `test` set. The model should _never_ see this data during training (unlike the validation set) so that we can get an accurate measurement of its performance.
@@ -260,13 +265,9 @@ train_test_split_molecules(
     train_size=0.8,
     fingerprint="daylight_fingerprint",
     fprints_hopts={
-        "minPath": 2,
-        "maxPath": 5,
         "fpSize": 200,
-        "bitsPerHash": 4,
-        "useHs": 1,
-        "tgtDensity": 0.4,
-        "minSize": 64,
+        "numBitsPerFeature": 4,
+        "useHs": True,
     },
     sampler="random",
     random_state=42,
diff --git a/astartes/__init__.py b/astartes/__init__.py
index 1512803..8edfdf6 100644
--- a/astartes/__init__.py
+++ b/astartes/__init__.py
@@ -1,7 +1,7 @@
 # convenience import to enable 'from astartes import train_test_split'
 from .main import train_test_split, train_val_test_split
 
-__version__ = "1.2.2"
+__version__ = "1.3.0"
 
 # DO NOT do this:
 # from .molecules import train_test_split_molecules
diff --git a/astartes/main.py b/astartes/main.py
index 56121f4..0deb0b0 100644
--- a/astartes/main.py
+++ b/astartes/main.py
@@ -313,8 +313,8 @@ def _check_actual_split(
         )
     if actual_val_size != requested_val_size:
         msg += "Requested validation size of {:.2f}, got {:.2f}. ".format(
-            requested_test_size,
-            actual_test_size,
+            requested_val_size,
+            actual_val_size,
         )
     if actual_test_size != requested_test_size:
         msg += "Requested test size of {:.2f}, got {:.2f}. ".format(
diff --git a/examples/morais_lima_martin_sampling/mlm_sampler.ipynb b/examples/morais_lima_martin_sampling/mlm_sampler.ipynb
new file mode 100644
index 0000000..c8f0f87
--- /dev/null
+++ b/examples/morais_lima_martin_sampling/mlm_sampler.ipynb
@@ -0,0 +1,90 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Implementing the Morais-Lima-Martin (MLM) Sampler\n",
+    "This notebook shows a brief demonstration of using the built-in utilities in `astartes` to implement the Morais-Lima-Martin sampler, which you can read about [here](https://academic.oup.com/bioinformatics/article/35/24/5257/5497250)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`astartes` has a very fast implementation of the Kennard-Stone algorithm, on which the MLM sampler is based, available in its `utils`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from astartes.utils.fast_kennard_stone import fast_kennard_stone"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The MLM sampler can then be implemented as shown below.\n",
+    "The `mlm_split` function takes a 2D array and splits it first using the Kennard-Stone algorithm, then permutes the indices according to the MLM algorithm."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy.spatial.distance import pdist, squareform\n",
+    "import numpy as np\n",
+    "\n",
+    "from astartes.samplers.interpolation import KennardStone\n",
+    "\n",
+    "\n",
+    "def mlm_split(X: np.ndarray, *, train_size: float = 0.8, val_size: float = 0.1, test_size: float = 0.1, random_state: int = 42):\n",
+    "    # calculate the distance matrix and rank the samples with Kennard-Stone\n",
+    "    ks_indexes = fast_kennard_stone(squareform(pdist(X, \"euclidean\")))\n",
+    "    pivot = int(len(ks_indexes) * train_size)\n",
+    "    train_idxs = ks_indexes[0:pivot]\n",
+    "    other_idxs = ks_indexes[pivot:]\n",
+    "\n",
+    "    # set RNG\n",
+    "    rng = np.random.default_rng(seed=random_state)\n",
+    "\n",
+    "    # choose 10% of train to switch with 10% of val/test (without replacement)\n",
+    "    n_to_permute = int(np.floor(0.1 * len(train_idxs)))\n",
+    "    train_permute_idxs = rng.choice(train_idxs, n_to_permute, replace=False)\n",
+    "    remaining_train_idxs = train_idxs[~np.isin(train_idxs, train_permute_idxs)]\n",
+    "    other_permute_idxs = rng.choice(other_idxs, n_to_permute, replace=False)\n",
+    "    remaining_other_idxs = other_idxs[~np.isin(other_idxs, other_permute_idxs)]\n",
+    "\n",
+    "    # reassemble the new lists of indexes\n",
+    "    new_train_idxs = np.concatenate((remaining_train_idxs, other_permute_idxs))\n",
+    "    new_other_idxs = np.concatenate((train_permute_idxs, remaining_other_idxs))\n",
+    "    n_val = int(len(new_other_idxs) * (val_size / (val_size + test_size)))\n",
+    "    val_indexes = new_other_idxs[0:n_val]\n",
+    "    test_indexes = new_other_idxs[n_val:]\n",
+    "\n",
+    "    # return the split up array\n",
+    "    return X[new_train_idxs], X[val_indexes], X[test_indexes]\n",
+    "    "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "fprop",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/test/functional/test_molecules.py b/test/functional/test_molecules.py
index 2942067..a737926 100644
--- a/test/functional/test_molecules.py
+++ b/test/functional/test_molecules.py
@@ -126,13 +126,9 @@ def test_fprint_hopts(self):
             sampler="random",
             fingerprint="topological_fingerprint",
             fprints_hopts={
-                "minPath": 2,
-                "maxPath": 5,
                 "fpSize": 200,
-                "bitsPerHash": 4,
-                "useHs": 1,
-                "tgtDensity": 0.4,
-                "minSize": 64,
+                "numBitsPerFeature": 4,
+                "useHs": True,
             },
         )
 
@@ -163,13 +159,9 @@ def test_maximum_call(self):
             train_size=0.2,
             fingerprint="topological_fingerprint",
             fprints_hopts={
-                "minPath": 2,
-                "maxPath": 5,
                 "fpSize": 200,
-                "bitsPerHash": 4,
-                "useHs": 1,
-                "tgtDensity": 0.4,
-                "minSize": 64,
+                "numBitsPerFeature": 2,
+                "useHs": True,
             },
             sampler="random",
             random_state=42,
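
For reference, a minimal sketch of what a user-facing call looks like with the renamed RDKit fingerprint hyperparameters used throughout this diff (`fpSize`, `numBitsPerFeature`, `useHs`). The SMILES inputs and variable names below are illustrative assumptions, not code taken from the repository:

```python
import numpy as np

# train_test_split_molecules lives in astartes.molecules; it is deliberately
# not re-exported from the package root (see the comment in __init__.py)
from astartes.molecules import train_test_split_molecules

# hypothetical example molecules as SMILES strings
molecules = np.array(
    ["CCO", "CCN", "CCC", "c1ccccc1", "CC(=O)O", "CCCl", "CCBr", "CC=O", "CC#N", "COC"]
)

# the fprints_hopts keys match the updated hyperparameter names exercised
# in test_molecules.py and shown in the README after this change
splits = train_test_split_molecules(
    molecules=molecules,
    train_size=0.8,
    fingerprint="topological_fingerprint",
    fprints_hopts={
        "fpSize": 200,
        "numBitsPerFeature": 4,
        "useHs": True,
    },
    sampler="random",
    random_state=42,
)
```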