From e1b3b0b8c13ad581013deb8e4c7fc5470586491c Mon Sep 17 00:00:00 2001
From: Philip Loche <ploche@physik.fu-berlin.de>
Date: Tue, 4 Jun 2024 15:15:31 +0200
Subject: [PATCH] Cache creation of compound masks

---
 package/MDAnalysis/core/groups.py             | 81 +++++++++++--------
 .../MDAnalysisTests/core/test_accumulate.py   | 39 ++++++++-
 2 files changed, 83 insertions(+), 37 deletions(-)

diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py
index 1cb57f1f342..d168700114e 100644
--- a/package/MDAnalysis/core/groups.py
+++ b/package/MDAnalysis/core/groups.py
@@ -936,45 +936,56 @@ def _split_by_compound_indices(self, compound, stable_sort=False):
         n_compounds : int
             The number of individual compounds.
         """
-        # Caching would help here, especially when repeating the operation
-        # over different frames, since these masks are coordinate-independent.
-        # However, cache must be invalidated whenever new compound indices are
-        # modified, which is not yet implemented.
-        # Also, should we include here the grouping for 'group', which is
+        # Should we include here the grouping for 'group', which is
         # essentially a non-split?
 
+        cache_key = f"{compound}_masks"
         compound_indices = self._get_compound_indices(compound)
-        compound_sizes = np.bincount(compound_indices)
-        size_per_atom = compound_sizes[compound_indices]
-        compound_sizes = compound_sizes[compound_sizes != 0]
-        unique_compound_sizes = unique_int_1d(compound_sizes)
-
-        # Are we already sorted? argsorting and fancy-indexing can be expensive
-        # so we do a quick pre-check.
-        needs_sorting = np.any(np.diff(compound_indices) < 0)
-        if needs_sorting:
-            # stable sort ensures reproducibility, especially concerning who
-            # gets to be a compound's atom[0] and be a reference for unwrap.
-            if stable_sort:
-                sort_indices = np.argsort(compound_indices, kind='stable')
-            else:
-                # Quicksort
-                sort_indices = np.argsort(compound_indices)
-            # We must sort size_per_atom accordingly (Issue #3352).
-            size_per_atom = size_per_atom[sort_indices]
-
-        compound_masks = []
-        atom_masks = []
-        for compound_size in unique_compound_sizes:
-            compound_masks.append(compound_sizes == compound_size)
+
+        # create new cache or invalidate cache when compound indices changed
+        if (
+            cache_key not in self._cache
+            or np.all(self._cache[cache_key]["compound_indices"]
+                      != compound_indices)):
+            compound_sizes = np.bincount(compound_indices)
+            size_per_atom = compound_sizes[compound_indices]
+            compound_sizes = compound_sizes[compound_sizes != 0]
+            unique_compound_sizes = unique_int_1d(compound_sizes)
+
+            # Are we already sorted? argsorting and fancy-indexing can be
+            # expensive so we do a quick pre-check.
+            needs_sorting = np.any(np.diff(compound_indices) < 0)
             if needs_sorting:
-                atom_masks.append(sort_indices[size_per_atom == compound_size]
-                                   .reshape(-1, compound_size))
-            else:
-                atom_masks.append(np.where(size_per_atom == compound_size)[0]
-                                   .reshape(-1, compound_size))
+                # stable sort ensures reproducibility, especially concerning
+                # who gets to be a compound's atom[0] and be a reference for
+                # unwrap.
+                if stable_sort:
+                    sort_indices = np.argsort(compound_indices, kind='stable')
+                else:
+                    # Quicksort
+                    sort_indices = np.argsort(compound_indices)
+                # We must sort size_per_atom accordingly (Issue #3352).
+                size_per_atom = size_per_atom[sort_indices]
+
+            compound_masks = []
+            atom_masks = []
+            for compound_size in unique_compound_sizes:
+                compound_masks.append(compound_sizes == compound_size)
+                if needs_sorting:
+                    atom_masks.append(sort_indices[size_per_atom
+                                                   == compound_size]
+                                      .reshape(-1, compound_size))
+                else:
+                    atom_masks.append(np.where(size_per_atom
+                                               == compound_size)[0]
+                                      .reshape(-1, compound_size))
+
+            self._cache[cache_key] = {
+                "compound_indices": compound_indices,
+                "data": (atom_masks, compound_masks, len(compound_sizes))
+            }
 
-        return atom_masks, compound_masks, len(compound_sizes)
+        return self._cache[cache_key]["data"]
 
     @warn_if_not_unique
     @_pbc_to_wrap
@@ -3200,7 +3211,7 @@ def select_atoms(self, sel, *othersel, periodic=True, rtol=1e-05,
                     universe = mda.Universe(PSF, DCD)
                     guessed_elements = guess_types(universe.atoms.names)
                     universe.add_TopologyAttr('elements', guessed_elements)
-                    
+
                 .. doctest:: AtomGroup.select_atoms.smarts
 
                     >>> universe.select_atoms("smarts C", smarts_kwargs={"maxMatches": 100})
diff --git a/testsuite/MDAnalysisTests/core/test_accumulate.py b/testsuite/MDAnalysisTests/core/test_accumulate.py
index aadbeee6454..52398e08b59 100644
--- a/testsuite/MDAnalysisTests/core/test_accumulate.py
+++ b/testsuite/MDAnalysisTests/core/test_accumulate.py
@@ -21,7 +21,7 @@
 # J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
 #
 import numpy as np
-from numpy.testing import assert_equal, assert_almost_equal
+from numpy.testing import assert_equal, assert_almost_equal, assert_allclose
 
 import MDAnalysis as mda
 from MDAnalysis.exceptions import DuplicateWarning, NoDataError
@@ -99,7 +99,6 @@ def test_accumulate_array_attribute_compounds(self, name, compound, level):
         ref = [np.ones((len(a), 2, 5)).sum(axis=0) for a in group.atoms.groupby(name).values()]
         assert_equal(group.accumulate(np.ones((len(group.atoms), 2, 5)), compound=compound), ref)
 
-
 class TestTotals(object):
     """Tests the functionality of *Group.total*() like total_mass
     and total_charge.
@@ -291,3 +290,39 @@ def test_quadrupole_moment_fragments(self, group):
         assert_almost_equal(quadrupoles,
                             np.array([0., 0.0011629, 0.1182701, 0.6891748
                                       ])) and len(quadrupoles) == n_compounds
+
+
+class TestCache:
+    @pytest.fixture()
+    def group(self):
+        return mda.Universe(PSF, DCD).atoms
+
+    def test_cache(self, group):
+        """Test that one cache per compound is created."""
+        group_nocache = group.copy()
+        group_cache = group.copy()
+
+        for compound in ['residues', 'fragments']:
+            actual = group_nocache.accumulate("masses", compound=compound)
+            desired = group_cache.accumulate("masses", compound=compound)
+
+            assert_allclose(actual, desired)
+            group_nocache._cache.pop(f"{compound}_masks")
+
+    @pytest.mark.parametrize("compound",
+                             ['residues', 'fragments'])
+    def test_cache_updating(self, group, compound):
+        """Test caching of compound_masks for updating atomgroups."""
+        kwargs = {"attribute": "masses", "compound": compound}
+
+        group_nocache = group.select_atoms("prop z < 1.0", updating=True)
+        group_cache = group.select_atoms("prop z < 1.0", updating=True)
+
+        assert_allclose(group_nocache.accumulate(**kwargs),
+                        group_cache.accumulate(**kwargs))
+
+        # Clear cache and forward to next frame
+        group_nocache._cache.pop(f"{compound}_masks")
+        group.universe.trajectory.next()
+        assert_allclose(group_nocache.accumulate(**kwargs),
+                        group_cache.accumulate(**kwargs))