diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 67a4078..d57cadf 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -1,4 +1,4 @@
-# RMG Code of Conduct
+# `astartes` Code of Conduct
## Our Pledge
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 898baa1..538e590 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -16,8 +16,8 @@ To contribute to the `astartes` source code, start by forking and then cloning t
### Version Checking
-`astartes` uses `pyproject.toml` to specify all metadata, but the version is also specified in `astartes/__init__.py` (via `__version__`) for backwards compatibility with Python 3.7.
-To check which version of `astartes` you have installed, you can run `python -c "import astartes; print(astartes.__version__)"` on Python 3.7 or `python -c "from importlib.metadata import version; version('astartes')" on Python 3.8 or newer.
+`astartes` uses `pyproject.toml` to specify all metadata except the version, which is specified in `astartes/__init__.py` (via `__version__`) for backwards compatibility with Python 3.7.
+To check which version of `astartes` you have installed, you can run `python -c "import astartes; print(astartes.__version__)"` on Python 3.7 or `python -c "from importlib.metadata import version; version('astartes')"` on Python 3.8 or newer.
### Testing
All of the tests in `astartes` are written using the built-in python `unittest` module (to allow running without `pytest`) but we _highly_ recommend using `pytest`.
diff --git a/README.md b/README.md
index 5a17bca..5ac345f 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
|
- |
+ |
|
@@ -121,6 +121,11 @@ Click the badges in the table below to be taken to a live, interactive demo of `
To execute these notebooks locally, clone this repository (i.e. `git clone https://github.com/JacksonBurns/astartes.git`), navigate to the `astartes` directory, run `pip install .[demos]`, then open and run the notebooks in your preferred editor.
You do _not_ need to execute the cells prefixed with `%%capture` - they are only present for compatibility with Google Colab.
+#### Packages Using `astartes`
+ - [Chemprop](https://github.com/chemprop/chemprop), a machine learning library for chemical property prediction, uses `astartes` in the backend for splitting molecular structures.
+ - [`fastprop`](https://github.com/JacksonBurns/fastprop), a descriptor-based property prediction library, uses `astartes`.
+ - [Google Scholar of articles citing the JOSS paper for `astartes`](https://scholar.google.com/scholar?cites=4693802000464819413&as_sdt=40000005&sciodt=0,22&hl=en)
+
### Withhold Testing Data with `train_val_test_split`
For rigorous ML research, it is critical to withhold some data during training to use a `test` set.
The model should _never_ see this data during training (unlike the validation set) so that we can get an accurate measurement of its performance.
@@ -260,13 +265,9 @@ train_test_split_molecules(
train_size=0.8,
fingerprint="daylight_fingerprint",
fprints_hopts={
- "minPath": 2,
- "maxPath": 5,
"fpSize": 200,
- "bitsPerHash": 4,
- "useHs": 1,
- "tgtDensity": 0.4,
- "minSize": 64,
+ "numBitsPerFeature": 4,
+ "useHs": True,
},
sampler="random",
random_state=42,
diff --git a/astartes/__init__.py b/astartes/__init__.py
index 1512803..8edfdf6 100644
--- a/astartes/__init__.py
+++ b/astartes/__init__.py
@@ -1,7 +1,7 @@
# convenience import to enable 'from astartes import train_test_split'
from .main import train_test_split, train_val_test_split
-__version__ = "1.2.2"
+__version__ = "1.3.0"
# DO NOT do this:
# from .molecules import train_test_split_molecules
diff --git a/astartes/main.py b/astartes/main.py
index 56121f4..0deb0b0 100644
--- a/astartes/main.py
+++ b/astartes/main.py
@@ -313,8 +313,8 @@ def _check_actual_split(
)
if actual_val_size != requested_val_size:
msg += "Requested validation size of {:.2f}, got {:.2f}. ".format(
- requested_test_size,
- actual_test_size,
+ requested_val_size,
+ actual_val_size,
)
if actual_test_size != requested_test_size:
msg += "Requested test size of {:.2f}, got {:.2f}. ".format(
diff --git a/examples/morais_lima_martin_sampling/mlm_sampler.ipynb b/examples/morais_lima_martin_sampling/mlm_sampler.ipynb
new file mode 100644
index 0000000..c8f0f87
--- /dev/null
+++ b/examples/morais_lima_martin_sampling/mlm_sampler.ipynb
@@ -0,0 +1,90 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Implementing the Morais-Lima-Martin (MLM) Sampler\n",
+ "The notebook shows a brief demonstration of using the built in utilities in `astartes` to implement the Morais-Lima-Martin sampler, which you can read about [here](https://academic.oup.com/bioinformatics/article/35/24/5257/5497250)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`astartes` has a very fast implementation of the Kennard-Stone algorithm, on which the MLM sampler is based, available in its `utils`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from astartes.utils.fast_kennard_stone import fast_kennard_stone"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The MLM sampler can then be implemented as shown below.\n",
+ "The `mlm_split` function takes a 2D array and splits it first using the Kennard-Stone algorithm, then permutes the indices according to the MLM algorithm."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.spatial.distance import pdist, squareform\n",
+ "import numpy as np\n",
+ "\n",
+ "from astartes.samplers.interpolation import KennardStone\n",
+ "\n",
+ "\n",
+ "def mlm_split(X: np.ndarray, *, train_size: float = 0.8, val_size: float = 0.1, test_size: float = 0.1, random_state: int = 42):\n",
+ "    # rank all samples with Kennard-Stone on the pairwise Euclidean distance matrix\n",
+ "    ks_indexes = fast_kennard_stone(squareform(pdist(X, \"euclidean\")))\n",
+ "    pivot = int(len(ks_indexes) * train_size)\n",
+ "    train_idxs = ks_indexes[0:pivot]\n",
+ "    other_idxs = ks_indexes[pivot:]\n",
+ "\n",
+ "    # set RNG\n",
+ "    rng = np.random.default_rng(seed=random_state)\n",
+ "\n",
+ "    # choose 10% of train to switch with 10% of val/test; int size and replace=False so distinct samples are swapped\n",
+ "    n_to_permute = int(np.floor(0.1 * len(train_idxs)))\n",
+ "    train_permute_idxs = rng.choice(train_idxs, n_to_permute, replace=False)\n",
+ "    remaining_train_idxs = train_idxs[~np.isin(train_idxs, train_permute_idxs)]\n",
+ "    other_permute_idxs = rng.choice(other_idxs, n_to_permute, replace=False)\n",
+ "    remaining_other_idxs = other_idxs[~np.isin(other_idxs, other_permute_idxs)]\n",
+ "\n",
+ "    # reassemble the new lists of indexes (np.concatenate takes a tuple of arrays)\n",
+ "    new_train_idxs = np.concatenate((remaining_train_idxs, other_permute_idxs))\n",
+ "    new_other_idxs = np.concatenate((train_permute_idxs, remaining_other_idxs))\n",
+ "    n_val = int(len(new_other_idxs) * (val_size / (val_size + test_size)))\n",
+ "    val_indexes = new_other_idxs[0:n_val]\n",
+ "    test_indexes = new_other_idxs[n_val:]\n",
+ "\n",
+ "    # return the split up array, using the permuted train indexes\n",
+ "    return X[new_train_idxs], X[val_indexes], X[test_indexes]\n",
+ "    "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "fprop",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/test/functional/test_molecules.py b/test/functional/test_molecules.py
index 2942067..a737926 100644
--- a/test/functional/test_molecules.py
+++ b/test/functional/test_molecules.py
@@ -126,13 +126,9 @@ def test_fprint_hopts(self):
sampler="random",
fingerprint="topological_fingerprint",
fprints_hopts={
- "minPath": 2,
- "maxPath": 5,
"fpSize": 200,
- "bitsPerHash": 4,
- "useHs": 1,
- "tgtDensity": 0.4,
- "minSize": 64,
+ "numBitsPerFeature": 4,
+ "useHs": True,
},
)
@@ -163,13 +159,9 @@ def test_maximum_call(self):
train_size=0.2,
fingerprint="topological_fingerprint",
fprints_hopts={
- "minPath": 2,
- "maxPath": 5,
"fpSize": 200,
- "bitsPerHash": 4,
- "useHs": 1,
- "tgtDensity": 0.4,
- "minSize": 64,
+ "numBitsPerFeature": 2,
+ "useHs": True,
},
sampler="random",
random_state=42,