PseudoPotentialData: Base hash only on file content and element

By default, the hash for data nodes is computed from a set of attributes that includes the version of `aiida-core` as well as the plugin package that provides the data plugin, if applicable. This means that the hash of a data node changes merely if the version of `aiida-core` or the plugin package changes, even if its content is identical. This choice was made in `aiida-core` to err on the safe side. However, for pseudopotentials, this is unnecessary and actually reduces the effectiveness of caching. If a calculation is run with pseudos that has already run before, but the version of `aiida-pseudo` is different with which the pseudo was installed, the calculation will not be cached but run again. The `PseudoPotentialData` now customizes the caching class, ensuring the hash is calculated just based on the element and the file content. This should uniquely define a pseudopotential and so should be safe enough to base the hash upon.
aiidateam · Dec 22, 2023 · 48635ee · 48635ee
1 parent 8db0a55
commit 48635ee
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 0 deletions.
diff --git a/src/aiida_pseudo/data/pseudo/pseudo.py b/src/aiida_pseudo/data/pseudo/pseudo.py
@@ -1,4 +1,6 @@
 """Base class for data types representing pseudo potentials."""
+from __future__ import annotations
+
 import io
 import pathlib
 import typing
@@ -7,16 +9,27 @@
 from aiida.common.constants import elements
 from aiida.common.exceptions import StoringNotAllowed
 from aiida.common.files import md5_from_filelike
+from aiida.orm.nodes.caching import NodeCaching
 
 __all__ = ('PseudoPotentialData',)
 
 
+class PseudoPotentialDataCaching(NodeCaching):
+    """Class to define caching behavior of ``PseudoPotentialData`` nodes."""
+
+    def _get_objects_to_hash(self) -> list:
+        """Return a list of objects which should be included in the node hash."""
+        return [self._node.element, self._node.md5]
+
+
 class PseudoPotentialData(plugins.DataFactory('core.singlefile')):
     """Base class for data types representing pseudo potentials."""
 
     _key_element = 'element'
     _key_md5 = 'md5'
 
+    _CLS_NODE_CACHING = PseudoPotentialDataCaching
+
     @classmethod
     def get_or_create(
         cls, source: typing.Union[str, pathlib.Path, typing.BinaryIO], filename: typing.Optional[str] = None

diff --git a/tests/data/pseudo/test_pseudo.py b/tests/data/pseudo/test_pseudo.py
@@ -209,3 +209,28 @@ def test_get_or_create(get_pseudo_potential_data):
     assert isinstance(different_class, PseudoPotentialData)
     assert not different_class.is_stored
     assert different_class.uuid != original.uuid
+
+
+@pytest.mark.parametrize(
+    'stream, filename, element, are_equal',
+    (
+        (b'content', 'filename.pseudo', 'Ar', True),
+        (b'content', 'filename.different', 'Ar', True),  # The filename should not affect the hash
+        (b'contenta', 'filename.pseudo', 'Ar', False),  # Different content should mean a different hash
+        (b'content', 'filename.pseudo', 'Kr', False),  # A different element should mean a different hash
+    )
+)
+def test_hash(stream, filename, element, are_equal):
+    """Test the behavior of the hash of ``PseudoPotentialData`` nodes.
+
+    The hash should only be based on the contents of the file and the element.
+    """
+    left = PseudoPotentialData(io.BytesIO(b'content'), filename='filename.pseudo')
+    left.element = 'Ar'
+    left.store()
+
+    right = PseudoPotentialData(io.BytesIO(stream), filename=filename)
+    right.element = element
+    right.store()
+
+    assert (left.base.caching.get_hash() == right.base.caching.get_hash()) is are_equal