This repository has been archived by the owner on Apr 24, 2024. It is now read-only.

tests for StandardScaler with_mean/with_std options
tests for all combinations of with_mean=[True, False] and
with_std=[True, False] for the StandardScaler have been added;
fixes the small bugs that raised errors for the new tests
agoscinski committed Feb 28, 2023
1 parent c921c5f commit 4d478aa
Showing 3 changed files with 129 additions and 91 deletions.
47 changes: 13 additions & 34 deletions src/equisolve/numpy/preprocessing/_base.py
@@ -16,7 +16,9 @@ class StandardScaler:
``"cell"`` or a combination of these.
:param with_mean:
If ``True``, center the data before scaling. If ``False``,
keep the mean intact
keep the mean intact. Because all operations are consistent
with respect to the derivative, the mean is only taken from the
values, not from the gradients, since ∇(X - mean) = ∇X.
:param with_std:
If ``True``, scale the data to unit variance. If ``False``,
keep the variance intact
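
As a side note to the docstring above: a minimal numpy sketch (illustrative only, not part of this diff) of why subtracting a constant mean leaves derivatives untouched:

```python
import numpy as np

x = np.linspace(0.0, 1.0, 101)
values = x**2                             # toy values X(x)
grad = np.gradient(values, x)             # numerical derivative of X

centered = values - values.mean()         # subtract the (constant) mean
grad_centered = np.gradient(centered, x)  # derivative of X - mean

# ∇(X - mean) = ∇X, since the mean is a constant
assert np.allclose(grad, grad_centered)
```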
@@ -90,24 +92,15 @@ def fit(
components=[],
properties=Labels.single(),
)
n_properties_block = TensorBlock(
values=np.array([len(X_block.properties)], dtype=np.int32).reshape(
1, 1
),
samples=Labels.single(),
components=[],
properties=Labels.single(),
)

# TODO use this in transform function to do a check
self.n_components_ = TensorMap(X.keys, [n_components_block])
self.n_properties_ = TensorMap(X.keys, [n_properties_block])
self.n_properties_ = len(X_block.properties)

mean_blocks = []
scale_blocks = []

# this is how it will look when we replace with equistore operations
# if self.with_mean:
#     self.mean_ = equistore.operations.mean_over_samples(X, X.samples_names)
# replace with equistore operations, see issue #18
for key, X_block in X:
# if values not in parameter_keys, we create empty tensor block to
# attach gradients
@@ -121,9 +114,7 @@
if self.with_mean:
mean_values = np.average(X_mat, weights=sample_weights, axis=0)
else:
mean_values = np.zeros(
self.n_properties_.block(key).values.flatten()
)
mean_values = np.zeros(self.n_properties_)

mean_block = TensorBlock(
values=mean_values.reshape((1,) + mean_values.shape),
@@ -177,26 +168,18 @@ def fit(
properties=Labels.single(),
)

for parameter in self.parameter_keys:
if parameter == "values":
continue

for parameter in X_block.gradients_list():
X_mat = block_to_array(X_block, [parameter])
# X_gradient = X_block.gradient(parameter)

if sample_weights is not None:
sw_block = sample_weights.block(key)
sample_weights = block_to_array(sw_block, [parameter])
if self.with_mean:
if self.with_mean and (parameter in self.parameter_keys):
mean_values = np.average(X_mat, weights=sample_weights, axis=0)
mean_values = mean_values.reshape((1, 1) + mean_values.shape)
else:
n_properties = (
self.n_properties_.block(key)
.gradient(parameter)
.data.flatten()[0]
)
mean_values = np.zeros((1, 1, n_properties))
mean_values = np.zeros((1, 1, self.n_properties_))

mean_block.add_gradient(
parameter,
@@ -205,7 +188,7 @@ def fit(
[Labels.single()],
)

if self.with_std:
if self.with_std and (parameter in self.parameter_keys):
X_mean = np.average(X_mat, weights=sample_weights, axis=0)
var = np.average((X_mat - X_mean) ** 2, axis=0)

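For reference, a plain-numpy sketch of the statistics computed in this hunk, assuming a 2-D values array and `column_wise=False` (a single scale per block, matching the `sqrt(sum(var))` checks in the tests below):

```python
import numpy as np

X_mat = np.random.rand(10, 3)      # (samples, properties)
sample_weights = np.ones(10)       # optional per-sample weights

# per-property weighted mean and variance, as in fit() above
mean = np.average(X_mat, weights=sample_weights, axis=0)
var = np.average((X_mat - mean) ** 2, axis=0)

scale = np.sqrt(var.sum())         # one scale for the whole block
X_scaled = (X_mat - mean) / scale

# after scaling, the per-property variances sum to one
assert np.isclose(np.var(X_scaled, axis=0).sum(), 1.0)
```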
@@ -279,9 +262,7 @@ def transform(self, X: TensorMap, y: TensorMap = None):
properties=X_block.properties,
)

for parameter in self.parameter_keys:
if parameter == "values":
continue
for parameter in X_block.gradients_list():
X_gradient = X_block.gradient(parameter)

block.add_gradient(
@@ -331,9 +312,7 @@ def inverse_transform(self, X: TensorMap, y: TensorMap = None):
properties=X_block.properties,
)

for parameter in self.parameter_keys:
if parameter == "values":
continue
for parameter in X_block.gradients_list():
X_gradient = X_block.gradient(parameter)

block.add_gradient(
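
The `fit` hunk above leaves a TODO ("use this in transform function to do a check"). A hypothetical sketch of such a check, using only the `n_properties_` attribute set in `fit`; the method name and error message are illustrative, not part of this commit:

```python
# Hypothetical helper, not part of this commit
def _check_n_properties(self, X):
    for key, block in X:
        if len(block.properties) != self.n_properties_:
            raise ValueError(
                f"block {key} has {len(block.properties)} properties, but "
                f"the scaler was fitted on {self.n_properties_} properties"
            )
```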
118 changes: 61 additions & 57 deletions tests/equisolve_tests/numpy/preprocessing/test_base.py
@@ -6,89 +6,93 @@
# Released under the BSD 3-Clause "New" or "Revised" License
# SPDX-License-Identifier: BSD-3-Clause

import os

import ase.io
import numpy as np
import pytest
from numpy.testing import assert_allclose
from rascaline import SoapPowerSpectrum

from equisolve.numpy.preprocessing import StandardScaler


ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from ..utils import random_single_block_no_components_tensor_map


class TestStandardScaler:
"""Test the StandardScaler."""

rng = np.random.default_rng(0x122578748CFF12AD12)
n_strucs = 5
frames = ase.io.read(os.path.join(ROOT, "./examples/dataset.xyz"), f":{n_strucs}")

hypers = {
"cutoff": 5.0,
"max_radial": 2,
"max_angular": 2,
"atomic_gaussian_width": 0.3,
"center_atom_weight": 1.0,
"radial_basis": {
"Gto": {},
},
"cutoff_function": {
"ShiftedCosine": {"width": 0.5},
},
}

calculator = SoapPowerSpectrum(**hypers)

descriptor = calculator.compute(frames, gradients=["positions"])
# Move all keys into properties
descriptor = descriptor.keys_to_properties(
["species_center", "species_neighbor_1", "species_neighbor_2"]
)
# ``X`` contains a representation with respect to central atoms. However,
# our target energies are per structure. Therefore, we convert the
# central-atom representation into a structure-wise representation by
# summing all properties per structure.
from equistore.operations import sum_over_samples

X = sum_over_samples(descriptor, ["center"])

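A conceptual numpy sketch (not the equistore implementation) of the per-structure summation that `sum_over_samples` performs in the removed setup above:

```python
import numpy as np

structure = np.array([0, 0, 1, 1, 1])  # structure id of each center atom
per_atom = np.random.rand(5, 4)        # one feature row per atom

# sum the atom rows belonging to the same structure into one row
per_structure = np.zeros((2, 4))
np.add.at(per_structure, structure, per_atom)
```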
@pytest.fixture
def energies(self):
return [i for i in self.rng.random(self.n_strucs)]

def test_standard_scaler_transform(self):
st = StandardScaler(["values", "positions"]).fit(self.X)
X = random_single_block_no_components_tensor_map()

absolute_tolerance = 1e-14
relative_tolerance = 1e-14

@pytest.mark.parametrize("with_mean", [True, False])
@pytest.mark.parametrize("with_std", [True, False])
def test_standard_scaler_transform(self, with_mean, with_std):
parameter_keys = ["values", "positions"]
st = StandardScaler(
parameter_keys=parameter_keys,
with_mean=with_mean,
with_std=with_std,
column_wise=False,
).fit(self.X)
X_t = st.transform(self.X)

X_values = X_t.block().values
assert_allclose(np.mean(X_values, axis=0), 0, atol=1e-14, rtol=1e-14)
assert_allclose(
np.sqrt(np.sum(np.var(X_values, axis=0))), 1, atol=1e-14, rtol=1e-14
)

for _, X_grad in X_t.block().gradients():
X_grad = X_grad.data.reshape(-1, X_grad.data.shape[-1])
assert_allclose(np.mean(X_grad, axis=0), 0, atol=1e-14, rtol=1e-14)
if with_mean:
assert_allclose(
np.sqrt(np.sum(np.var(X_grad, axis=0))), 1, atol=1e-14, rtol=1e-14
np.mean(X_values, axis=0),
0,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)
if with_std:
assert_allclose(
np.sqrt(np.sum(np.var(X_values, axis=0))),
1,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)

def test_standard_scaler_inverse_transform(self):
st = StandardScaler(["values", "positions"]).fit(self.X)
parameter_keys.remove("values")
for parameter in parameter_keys:
X_grad = X_t.block().gradient(parameter)
X_grad = X_grad.data.reshape(-1, X_grad.data.shape[-1])
if with_mean:
assert_allclose(np.mean(X_grad, axis=0), 0, atol=1e-14, rtol=1e-14)
if with_std:
assert_allclose(
np.sqrt(np.sum(np.var(X_grad, axis=0))),
1,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)

@pytest.mark.parametrize("with_mean", [True, False])
@pytest.mark.parametrize("with_std", [True, False])
def test_standard_scaler_inverse_transform(self, with_mean, with_std):
parameter_keys = ["values", "positions"]
st = StandardScaler(
parameter_keys=parameter_keys,
with_mean=with_mean,
with_std=with_std,
column_wise=False,
).fit(self.X)
X_t_inv_t = st.inverse_transform(st.transform(self.X))

assert_allclose(
self.X.block().values, X_t_inv_t.block().values, atol=1e-14, rtol=1e-14
self.X.block().values,
X_t_inv_t.block().values,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)

for parameter, X_grad in self.X.block().gradients():
parameter_keys.remove("values")
for parameter in parameter_keys:
X_grad = self.X.block().gradient(parameter)
assert_allclose(
X_grad.data,
X_t_inv_t.block().gradient(parameter).data,
atol=1e-14,
rtol=1e-14,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)
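
The gradient assertions above flatten the component axes before taking statistics. A short sketch of that reshape, using the gradient shape from the helper in `utils.py` below:

```python
import numpy as np

# positions gradient data: (gradient samples, direction component, properties)
grad_data = np.random.rand(7, 3, 2)

# fold the component axis into the sample axis, so per-property statistics
# can be taken over axis 0 exactly as for the values
flat = grad_data.reshape(-1, grad_data.shape[-1])  # -> (21, 2)
per_property_mean = np.mean(flat, axis=0)          # shape (2,)
```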
55 changes: 55 additions & 0 deletions tests/equisolve_tests/numpy/utils.py
@@ -0,0 +1,55 @@
import numpy as np
from equistore import Labels, TensorBlock, TensorMap


def random_single_block_no_components_tensor_map():
"""
Create a dummy tensor map to be used in tests. This is the same
tensor map used in the `tensor.rs` tests.
"""
block_1 = TensorBlock(
values=np.random.rand(4, 2),
samples=Labels(
["sample", "structure"],
np.array([[0, 0], [1, 1], [2, 2], [3, 3]], dtype=np.int32),
),
components=[],
properties=Labels(["properties"], np.array([[0], [1]], dtype=np.int32)),
)
block_1.add_gradient(
"positions",
data=np.random.rand(7, 3, 2),
samples=Labels(
["sample", "structure", "center"],
np.array(
[
[0, 0, 1],
[0, 0, 2],
[1, 1, 0],
[1, 1, 1],
[1, 1, 2],
[2, 2, 0],
[3, 3, 0],
],
dtype=np.int32,
),
),
components=[Labels(["direction"], np.array([[0], [1], [2]], dtype=np.int32))],
)

block_1.add_gradient(
"cell",
data=np.random.rand(4, 6, 2),
samples=Labels(
["sample", "structure"],
np.array([[0, 0], [1, 1], [2, 2], [3, 3]], dtype=np.int32),
),
components=[
Labels(
["direction_xx_yy_zz_yz_xz_xy"],
np.array([[0], [1], [2], [3], [4], [5]], dtype=np.int32),
)
],
)

return TensorMap(Labels.single(), [block_1])
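
A hypothetical usage of this helper in a test, relying only on the API already exercised in this diff (`block()`, `gradients_list()`, `gradient()`):

```python
X = random_single_block_no_components_tensor_map()
block = X.block()

assert block.values.shape == (4, 2)
assert set(block.gradients_list()) == {"positions", "cell"}
assert block.gradient("positions").data.shape == (7, 3, 2)
```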
