This repository has been archived by the owner on Apr 24, 2024. It is now read-only.

tests for StandardScaler with_mean/with_std options
tests for all combinations of with_mean=[True, False] and
with_std=[True, False] for the StandardScaler have been added;
fixes the small bugs that raised errors for the new tests
agoscinski committed Feb 28, 2023
1 parent c921c5f commit 4d478aa
Showing 3 changed files with 129 additions and 91 deletions.
47 changes: 13 additions & 34 deletions src/equisolve/numpy/preprocessing/_base.py
@@ -16,7 +16,9 @@ class StandardScaler:
``"cell"`` or a combination of these.
:param with_mean:
If ``True``, center the data before scaling. If ``False``,
keep the mean intact
keep the mean intact. Because all operations are consistent
with respect to the derivative, the mean is only taken from the
values, not from the gradients, since ∇(X - mean) = ∇X.
:param with_std:
If ``True``, scale the data to unit variance. If ``False``,
keep the variance intact
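
As a side note to the docstring above: a minimal numpy sketch (illustrative only, not part of this diff) of why subtracting a constant mean leaves derivatives untouched:

```python
import numpy as np

x = np.linspace(0.0, 1.0, 101)
values = x**2                             # toy values X(x)
grad = np.gradient(values, x)             # numerical derivative of X

centered = values - values.mean()         # subtract the (constant) mean
grad_centered = np.gradient(centered, x)  # derivative of X - mean

# ∇(X - mean) = ∇X, since the mean is a constant
assert np.allclose(grad, grad_centered)
```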
@@ -90,24 +92,15 @@ def fit(
components=[],
properties=Labels.single(),
)
n_properties_block = TensorBlock(
values=np.array([len(X_block.properties)], dtype=np.int32).reshape(
1, 1
),
samples=Labels.single(),
components=[],
properties=Labels.single(),
)

# TODO use this in transform function to do a check
self.n_components_ = TensorMap(X.keys, [n_components_block])
self.n_properties_ = TensorMap(X.keys, [n_properties_block])
self.n_properties_ = len(X_block.properties)

mean_blocks = []
scale_blocks = []

# this is how it will look when we replace with equistore operations
# if self.with_mean:
#     self.mean_ = equistore.operations.mean_over_samples(X, X.samples_names)
# replace with equistore operations, see issue #18
for key, X_block in X:
# if values not in parameter_keys, we create empty tensor block to
# attach gradients
@@ -121,9 +114,7 @@
if self.with_mean:
mean_values = np.average(X_mat, weights=sample_weights, axis=0)
else:
mean_values = np.zeros(
self.n_properties_.block(key).values.flatten()
)
mean_values = np.zeros(self.n_properties_)

mean_block = TensorBlock(
values=mean_values.reshape((1,) + mean_values.shape),
@@ -177,26 +168,18 @@ def fit(
properties=Labels.single(),
)

for parameter in self.parameter_keys:
if parameter == "values":
continue

for parameter in X_block.gradients_list():
X_mat = block_to_array(X_block, [parameter])
# X_gradient = X_block.gradient(parameter)

if sample_weights is not None:
sw_block = sample_weights.block(key)
sample_weights = block_to_array(sw_block, [parameter])
if self.with_mean:
if self.with_mean and (parameter in self.parameter_keys):
mean_values = np.average(X_mat, weights=sample_weights, axis=0)
mean_values = mean_values.reshape((1, 1) + mean_values.shape)
else:
n_properties = (
self.n_properties_.block(key)
.gradient(parameter)
.data.flatten()[0]
)
mean_values = np.zeros((1, 1, n_properties))
mean_values = np.zeros((1, 1, self.n_properties_))

mean_block.add_gradient(
parameter,
@@ -205,7 +188,7 @@ def fit(
[Labels.single()],
)

if self.with_std:
if self.with_std and (parameter in self.parameter_keys):
X_mean = np.average(X_mat, weights=sample_weights, axis=0)
var = np.average((X_mat - X_mean) ** 2, axis=0)

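For reference, a plain-numpy sketch of the statistics computed in this hunk, assuming a 2-D values array and `column_wise=False` (a single scale per block, matching the `sqrt(sum(var))` checks in the tests below):

```python
import numpy as np

X_mat = np.random.rand(10, 3)      # (samples, properties)
sample_weights = np.ones(10)       # optional per-sample weights

# per-property weighted mean and variance, as in fit() above
mean = np.average(X_mat, weights=sample_weights, axis=0)
var = np.average((X_mat - mean) ** 2, axis=0)

scale = np.sqrt(var.sum())         # one scale for the whole block
X_scaled = (X_mat - mean) / scale

# after scaling, the per-property variances sum to one
assert np.isclose(np.var(X_scaled, axis=0).sum(), 1.0)
```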
@@ -279,9 +262,7 @@ def transform(self, X: TensorMap, y: TensorMap = None):
properties=X_block.properties,
)

for parameter in self.parameter_keys:
if parameter == "values":
continue
for parameter in X_block.gradients_list():
X_gradient = X_block.gradient(parameter)

block.add_gradient(
@@ -331,9 +312,7 @@ def inverse_transform(self, X: TensorMap, y: TensorMap = None):
properties=X_block.properties,
)

for parameter in self.parameter_keys:
if parameter == "values":
continue
for parameter in X_block.gradients_list():
X_gradient = X_block.gradient(parameter)

block.add_gradient(
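
The `fit` hunk above leaves a TODO ("use this in transform function to do a check"). A hypothetical sketch of such a check, using only the `n_properties_` attribute set in `fit`; the method name and error message are illustrative, not part of this commit:

```python
# Hypothetical helper, not part of this commit
def _check_n_properties(self, X):
    for key, block in X:
        if len(block.properties) != self.n_properties_:
            raise ValueError(
                f"block {key} has {len(block.properties)} properties, but "
                f"the scaler was fitted on {self.n_properties_} properties"
            )
```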
118 changes: 61 additions & 57 deletions tests/equisolve_tests/numpy/preprocessing/test_base.py
@@ -6,89 +6,93 @@
# Released under the BSD 3-Clause "New" or "Revised" License
# SPDX-License-Identifier: BSD-3-Clause

import os

import ase.io
import numpy as np
import pytest
from numpy.testing import assert_allclose
from rascaline import SoapPowerSpectrum

from equisolve.numpy.preprocessing import StandardScaler


ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from ..utils import random_single_block_no_components_tensor_map


class TestStandardScaler:
"""Test the StandardScaler."""

rng = np.random.default_rng(0x122578748CFF12AD12)
n_strucs = 5
frames = ase.io.read(os.path.join(ROOT, "./examples/dataset.xyz"), f":{n_strucs}")

hypers = {
"cutoff": 5.0,
"max_radial": 2,
"max_angular": 2,
"atomic_gaussian_width": 0.3,
"center_atom_weight": 1.0,
"radial_basis": {
"Gto": {},
},
"cutoff_function": {
"ShiftedCosine": {"width": 0.5},
},
}

calculator = SoapPowerSpectrum(**hypers)

descriptor = calculator.compute(frames, gradients=["positions"])
# Move all keys into properties
descriptor = descriptor.keys_to_properties(
["species_center", "species_neighbor_1", "species_neighbor_2"]
)
# ``X`` contains a representation with respect to central atoms. However,
# our target energies are per structure. Therefore, we convert the
# central-atom representation into a structure-wise representation by
# summing all properties per structure.
from equistore.operations import sum_over_samples

X = sum_over_samples(descriptor, ["center"])

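A conceptual numpy sketch (not the equistore implementation) of the per-structure summation that `sum_over_samples` performs in the removed setup above:

```python
import numpy as np

structure = np.array([0, 0, 1, 1, 1])  # structure id of each center atom
per_atom = np.random.rand(5, 4)        # one feature row per atom

# sum the atom rows belonging to the same structure into one row
per_structure = np.zeros((2, 4))
np.add.at(per_structure, structure, per_atom)
```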
@pytest.fixture
def energies(self):
return [i for i in self.rng.random(self.n_strucs)]

def test_standard_scaler_transform(self):
st = StandardScaler(["values", "positions"]).fit(self.X)
X = random_single_block_no_components_tensor_map()

absolute_tolerance = 1e-14
relative_tolerance = 1e-14

@pytest.mark.parametrize("with_mean", [True, False])
@pytest.mark.parametrize("with_std", [True, False])
def test_standard_scaler_transform(self, with_mean, with_std):
parameter_keys = ["values", "positions"]
st = StandardScaler(
parameter_keys=parameter_keys,
with_mean=with_mean,
with_std=with_std,
column_wise=False,
).fit(self.X)
X_t = st.transform(self.X)

X_values = X_t.block().values
assert_allclose(np.mean(X_values, axis=0), 0, atol=1e-14, rtol=1e-14)
assert_allclose(
np.sqrt(np.sum(np.var(X_values, axis=0))), 1, atol=1e-14, rtol=1e-14
)

for _, X_grad in X_t.block().gradients():
X_grad = X_grad.data.reshape(-1, X_grad.data.shape[-1])
assert_allclose(np.mean(X_grad, axis=0), 0, atol=1e-14, rtol=1e-14)
if with_mean:
assert_allclose(
np.sqrt(np.sum(np.var(X_grad, axis=0))), 1, atol=1e-14, rtol=1e-14
np.mean(X_values, axis=0),
0,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)
if with_std:
assert_allclose(
np.sqrt(np.sum(np.var(X_values, axis=0))),
1,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)

def test_standard_scaler_inverse_transform(self):
st = StandardScaler(["values", "positions"]).fit(self.X)
parameter_keys.remove("values")
for parameter in parameter_keys:
X_grad = X_t.block().gradient(parameter)
X_grad = X_grad.data.reshape(-1, X_grad.data.shape[-1])
if with_mean:
assert_allclose(np.mean(X_grad, axis=0), 0, atol=1e-14, rtol=1e-14)
if with_std:
assert_allclose(
np.sqrt(np.sum(np.var(X_grad, axis=0))),
1,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)

@pytest.mark.parametrize("with_mean", [True, False])
@pytest.mark.parametrize("with_std", [True, False])
def test_standard_scaler_inverse_transform(self, with_mean, with_std):
parameter_keys = ["values", "positions"]
st = StandardScaler(
parameter_keys=parameter_keys,
with_mean=with_mean,
with_std=with_std,
column_wise=False,
).fit(self.X)
X_t_inv_t = st.inverse_transform(st.transform(self.X))

assert_allclose(
self.X.block().values, X_t_inv_t.block().values, atol=1e-14, rtol=1e-14
self.X.block().values,
X_t_inv_t.block().values,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)

for parameter, X_grad in self.X.block().gradients():
parameter_keys.remove("values")
for parameter in parameter_keys:
X_grad = self.X.block().gradient(parameter)
assert_allclose(
X_grad.data,
X_t_inv_t.block().gradient(parameter).data,
atol=1e-14,
rtol=1e-14,
atol=self.absolute_tolerance,
rtol=self.relative_tolerance,
)
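
The gradient assertions above flatten the component axes before taking statistics. A short sketch of that reshape, using the gradient shape from the helper in `utils.py` below:

```python
import numpy as np

# positions gradient data: (gradient samples, direction component, properties)
grad_data = np.random.rand(7, 3, 2)

# fold the component axis into the sample axis, so per-property statistics
# can be taken over axis 0 exactly as for the values
flat = grad_data.reshape(-1, grad_data.shape[-1])  # -> (21, 2)
per_property_mean = np.mean(flat, axis=0)          # shape (2,)
```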
55 changes: 55 additions & 0 deletions tests/equisolve_tests/numpy/utils.py
@@ -0,0 +1,55 @@
import numpy as np
from equistore import Labels, TensorBlock, TensorMap


def random_single_block_no_components_tensor_map():
"""
Create a dummy tensor map to be used in tests. This is the same
tensor map used in the `tensor.rs` tests.
"""
block_1 = TensorBlock(
values=np.random.rand(4, 2),
samples=Labels(
["sample", "structure"],
np.array([[0, 0], [1, 1], [2, 2], [3, 3]], dtype=np.int32),
),
components=[],
properties=Labels(["properties"], np.array([[0], [1]], dtype=np.int32)),
)
block_1.add_gradient(
"positions",
data=np.random.rand(7, 3, 2),
samples=Labels(
["sample", "structure", "center"],
np.array(
[
[0, 0, 1],
[0, 0, 2],
[1, 1, 0],
[1, 1, 1],
[1, 1, 2],
[2, 2, 0],
[3, 3, 0],
],
dtype=np.int32,
),
),
components=[Labels(["direction"], np.array([[0], [1], [2]], dtype=np.int32))],
)

block_1.add_gradient(
"cell",
data=np.random.rand(4, 6, 2),
samples=Labels(
["sample", "structure"],
np.array([[0, 0], [1, 1], [2, 2], [3, 3]], dtype=np.int32),
),
components=[
Labels(
["direction_xx_yy_zz_yz_xz_xy"],
np.array([[0], [1], [2], [3], [4], [5]], dtype=np.int32),
)
],
)

return TensorMap(Labels.single(), [block_1])
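
A hypothetical usage of this helper in a test, relying only on the API already exercised in this diff (`block()`, `gradients_list()`, `gradient()`):

```python
X = random_single_block_no_components_tensor_map()
block = X.block()

assert block.values.shape == (4, 2)
assert set(block.gradients_list()) == {"positions", "cell"}
assert block.gradient("positions").data.shape == (7, 3, 2)
```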
