From 48635ee4119e0e09f0515da05a7a350c1fdf6c71 Mon Sep 17 00:00:00 2001
From: Jusong Yu <jusong.yeu@gmail.com>
Date: Fri, 8 Dec 2023 13:42:24 +0100
Subject: [PATCH] `PseudoPotentialData`: Base hash only on file content and
 element

By default, the hash for data nodes is computed from a set of attributes
that includes the version of `aiida-core` as well as the plugin package
that provides the data plugin, if applicable. This means that the hash
of a data node changes merely if the version of `aiida-core` or the
plugin package changes, even if its content is identical.

This choice was made in `aiida-core` to err on the safe side. However,
for pseudopotentials, this is unnecessary and actually reduces the
effectiveness of caching. If a calculation is run with pseudos that has
already run before, but the version of `aiida-pseudo` is different with
which the pseudo was installed, the calculation will not be cached but
run again.

The `PseudoPotentialData` now customizes the caching class, ensuring the
hash is calculated just based on the element and the file content. This
should uniquely define a pseudopotential and so should be safe enough to
base the hash upon.
---
 src/aiida_pseudo/data/pseudo/pseudo.py | 13 +++++++++++++
 tests/data/pseudo/test_pseudo.py       | 25 +++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/src/aiida_pseudo/data/pseudo/pseudo.py b/src/aiida_pseudo/data/pseudo/pseudo.py
index 7719f1c..4c37f0d 100644
--- a/src/aiida_pseudo/data/pseudo/pseudo.py
+++ b/src/aiida_pseudo/data/pseudo/pseudo.py
@@ -1,4 +1,6 @@
 """Base class for data types representing pseudo potentials."""
+from __future__ import annotations
+
 import io
 import pathlib
 import typing
@@ -7,16 +9,27 @@
 from aiida.common.constants import elements
 from aiida.common.exceptions import StoringNotAllowed
 from aiida.common.files import md5_from_filelike
+from aiida.orm.nodes.caching import NodeCaching
 
 __all__ = ('PseudoPotentialData',)
 
 
+class PseudoPotentialDataCaching(NodeCaching):
+    """Class to define caching behavior of ``PseudoPotentialData`` nodes."""
+
+    def _get_objects_to_hash(self) -> list:
+        """Return a list of objects which should be included in the node hash."""
+        return [self._node.element, self._node.md5]
+
+
 class PseudoPotentialData(plugins.DataFactory('core.singlefile')):
     """Base class for data types representing pseudo potentials."""
 
     _key_element = 'element'
     _key_md5 = 'md5'
 
+    _CLS_NODE_CACHING = PseudoPotentialDataCaching
+
     @classmethod
     def get_or_create(
         cls, source: typing.Union[str, pathlib.Path, typing.BinaryIO], filename: typing.Optional[str] = None
diff --git a/tests/data/pseudo/test_pseudo.py b/tests/data/pseudo/test_pseudo.py
index db0e992..5884ef3 100644
--- a/tests/data/pseudo/test_pseudo.py
+++ b/tests/data/pseudo/test_pseudo.py
@@ -209,3 +209,28 @@ def test_get_or_create(get_pseudo_potential_data):
     assert isinstance(different_class, PseudoPotentialData)
     assert not different_class.is_stored
     assert different_class.uuid != original.uuid
+
+
+@pytest.mark.parametrize(
+    'stream, filename, element, are_equal',
+    (
+        (b'content', 'filename.pseudo', 'Ar', True),
+        (b'content', 'filename.different', 'Ar', True),  # The filename should not affect the hash
+        (b'contenta', 'filename.pseudo', 'Ar', False),  # Different content should mean a different hash
+        (b'content', 'filename.pseudo', 'Kr', False),  # A different element should mean a different hash
+    )
+)
+def test_hash(stream, filename, element, are_equal):
+    """Test the behavior of the hash of ``PseudoPotentialData`` nodes.
+
+    The hash should only be based on the contents of the file and the element.
+    """
+    left = PseudoPotentialData(io.BytesIO(b'content'), filename='filename.pseudo')
+    left.element = 'Ar'
+    left.store()
+
+    right = PseudoPotentialData(io.BytesIO(stream), filename=filename)
+    right.element = element
+    right.store()
+
+    assert (left.base.caching.get_hash() == right.base.caching.get_hash()) is are_equal