Skip to content

Commit

Permalink
PseudoPotentialData: Base hash only on file content and element
Browse files Browse the repository at this point in the history
By default, the hash for data nodes is computed from a set of attributes
that includes the version of `aiida-core` as well as the plugin package
that provides the data plugin, if applicable. This means that the hash
of a data node changes merely if the version of `aiida-core` or the
plugin package changes, even if its content is identical.

This choice was made in `aiida-core` to err on the safe side. However,
for pseudopotentials, this is unnecessary and actually reduces the
effectiveness of caching. If a calculation is run with pseudos that has
already run before, but the version of `aiida-pseudo` is different with
which the pseudo was installed, the calculation will not be cached but
run again.

The `PseudoPotentialData` now customizes the caching class, ensuring the
hash is calculated just based on the element and the file content. This
should uniquely define a pseudopotential and so should be safe enough to
base the hash upon.
  • Loading branch information
unkcpz authored and sphuber committed Dec 22, 2023
1 parent 8db0a55 commit 48635ee
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 0 deletions.
13 changes: 13 additions & 0 deletions src/aiida_pseudo/data/pseudo/pseudo.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""Base class for data types representing pseudo potentials."""
from __future__ import annotations

import io
import pathlib
import typing
Expand All @@ -7,16 +9,27 @@
from aiida.common.constants import elements
from aiida.common.exceptions import StoringNotAllowed
from aiida.common.files import md5_from_filelike
from aiida.orm.nodes.caching import NodeCaching

__all__ = ('PseudoPotentialData',)


class PseudoPotentialDataCaching(NodeCaching):
"""Class to define caching behavior of ``PseudoPotentialData`` nodes."""

def _get_objects_to_hash(self) -> list:
"""Return a list of objects which should be included in the node hash."""
return [self._node.element, self._node.md5]


class PseudoPotentialData(plugins.DataFactory('core.singlefile')):
"""Base class for data types representing pseudo potentials."""

_key_element = 'element'
_key_md5 = 'md5'

_CLS_NODE_CACHING = PseudoPotentialDataCaching

@classmethod
def get_or_create(
cls, source: typing.Union[str, pathlib.Path, typing.BinaryIO], filename: typing.Optional[str] = None
Expand Down
25 changes: 25 additions & 0 deletions tests/data/pseudo/test_pseudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,28 @@ def test_get_or_create(get_pseudo_potential_data):
assert isinstance(different_class, PseudoPotentialData)
assert not different_class.is_stored
assert different_class.uuid != original.uuid


@pytest.mark.parametrize(
'stream, filename, element, are_equal',
(
(b'content', 'filename.pseudo', 'Ar', True),
(b'content', 'filename.different', 'Ar', True), # The filename should not affect the hash
(b'contenta', 'filename.pseudo', 'Ar', False), # Different content should mean a different hash
(b'content', 'filename.pseudo', 'Kr', False), # A different element should mean a different hash
)
)
def test_hash(stream, filename, element, are_equal):
"""Test the behavior of the hash of ``PseudoPotentialData`` nodes.
The hash should only be based on the contents of the file and the element.
"""
left = PseudoPotentialData(io.BytesIO(b'content'), filename='filename.pseudo')
left.element = 'Ar'
left.store()

right = PseudoPotentialData(io.BytesIO(stream), filename=filename)
right.element = element
right.store()

assert (left.base.caching.get_hash() == right.base.caching.get_hash()) is are_equal

0 comments on commit 48635ee

Please sign in to comment.