From 874db31fb171577bfc0c962708786f9774ea6b1b Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 13 Sep 2024 10:58:25 -0700 Subject: [PATCH] Fix handling of read h5py string datasets (#1189) --- CHANGELOG.md | 6 ++ src/hdmf/build/objectmapper.py | 5 +- src/hdmf/utils.py | 2 +- tests/unit/build_tests/test_io_map.py | 131 +++++++++++++++++++++++++- 4 files changed, 141 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb51dc538..c7919c81b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # HDMF Changelog +## HDMF 3.14.5 (September 6, 2024) + +### Bug fixes +- Fixed bug in writing of string arrays to an HDF5 file that were read from an HDF5 file that was introduced in 3.14.4. @rly @stephprince + [#1189](https://github.com/hdmf-dev/hdmf/pull/1189) + ## HDMF 3.14.4 (September 4, 2024) ### Enhancements diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 3e8d835f1..83df1b427 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -602,7 +602,10 @@ def __get_data_type(cls, spec): def __convert_string(self, value, spec): """Convert string types to the specified dtype.""" def __apply_string_type(value, string_type): - if isinstance(value, (list, tuple, np.ndarray, DataIO)): + # NOTE: if a user passes a h5py.Dataset that is not wrapped with a hdmf.utils.StrDataset, + # then this conversion may not be correct. Users should unpack their string h5py.Datasets + # into a numpy array (or wrap them in StrDataset) before passing them to a container object. + if hasattr(value, '__iter__') and not isinstance(value, (str, bytes)): return [__apply_string_type(item, string_type) for item in value] else: return string_type(value) diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index 5e0b61539..50db79c40 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -1140,7 +1140,7 @@ def update(self, other): @docval_macro('array_data') class StrDataset(h5py.Dataset): - """Wrapper to decode strings on reading the dataset""" + """Wrapper to decode strings on reading the dataset. Use only for h5py 3+.""" def __init__(self, dset, encoding, errors='strict'): self.dset = dset if encoding is None: diff --git a/tests/unit/build_tests/test_io_map.py b/tests/unit/build_tests/test_io_map.py index e095ef318..730530a5a 100644 --- a/tests/unit/build_tests/test_io_map.py +++ b/tests/unit/build_tests/test_io_map.py @@ -1,4 +1,4 @@ -from hdmf.utils import docval, getargs +from hdmf.utils import StrDataset, docval, getargs from hdmf import Container, Data from hdmf.backends.hdf5 import H5DataIO from hdmf.build import (GroupBuilder, DatasetBuilder, ObjectMapper, BuildManager, TypeMap, LinkBuilder, @@ -7,12 +7,15 @@ from hdmf.spec import (GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog, RefSpec, LinkSpec) from hdmf.testing import TestCase +import h5py from abc import ABCMeta, abstractmethod import unittest import numpy as np from tests.unit.helpers.utils import CORE_NAMESPACE, create_test_type_map +H5PY_3 = h5py.__version__.startswith('3') + class Bar(Container): @@ -460,6 +463,132 @@ def test_build_3d_ndarray(self): np.testing.assert_array_equal(builder.get('data').data, str_array_3d) np.testing.assert_array_equal(builder.get('attr_array'), str_array_3d) + @unittest.skipIf(not H5PY_3, "Use StrDataset only for h5py 3+") + def test_build_1d_h5py_3_dataset(self): + bar_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='text', + name='data', + shape=(None, ), + attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')], + ) + ], + attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text', + shape=(None, ))], + ) + type_map = self.customSetUp(bar_spec) + type_map.register_map(Bar, BarMapper) + # create in-memory hdf5 file that is discarded after closing + with h5py.File("test.h5", "w", driver="core", backing_store=False) as f: + str_array_1d = np.array( + ['aa', 'bb', 'cc', 'dd'], + dtype=h5py.special_dtype(vlen=str) + ) + # wrap the dataset in a StrDataset to mimic how HDF5IO would read this dataset with h5py 3+ + dataset = StrDataset(f.create_dataset('data', data=str_array_1d), None) + bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset) + builder = type_map.build(bar_inst) + np.testing.assert_array_equal(builder.get('data').data, dataset[:]) + np.testing.assert_array_equal(builder.get('attr_array'), dataset[:]) + + @unittest.skipIf(not H5PY_3, "Use StrDataset only for h5py 3+") + def test_build_3d_h5py_3_dataset(self): + bar_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='text', + name='data', + shape=(None, None, None), + attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')], + ) + ], + attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text', + shape=(None, None, None))], + ) + type_map = self.customSetUp(bar_spec) + type_map.register_map(Bar, BarMapper) + # create in-memory hdf5 file that is discarded after closing + with h5py.File("test.h5", "w", driver="core", backing_store=False) as f: + str_array_3d = np.array( + [[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]], + dtype=h5py.special_dtype(vlen=str) + ) + # wrap the dataset in a StrDataset to mimic how HDF5IO would read this dataset with h5py 3+ + dataset = StrDataset(f.create_dataset('data', data=str_array_3d), None) + bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset) + builder = type_map.build(bar_inst) + np.testing.assert_array_equal(builder.get('data').data, dataset[:]) + np.testing.assert_array_equal(builder.get('attr_array'), dataset[:]) + + @unittest.skipIf(H5PY_3, "Create dataset differently for h5py < 3") + def test_build_1d_h5py_2_dataset(self): + bar_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='text', + name='data', + shape=(None, ), + attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')], + ) + ], + attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text', + shape=(None, ))], + ) + type_map = self.customSetUp(bar_spec) + type_map.register_map(Bar, BarMapper) + # create in-memory hdf5 file that is discarded after closing + with h5py.File("test.h5", "w", driver="core", backing_store=False) as f: + str_array_1d = np.array( + ['aa', 'bb', 'cc', 'dd'], + dtype=h5py.special_dtype(vlen=str) + ) + dataset = f.create_dataset('data', data=str_array_1d) + bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset) + builder = type_map.build(bar_inst) + np.testing.assert_array_equal(builder.get('data').data, dataset[:]) + np.testing.assert_array_equal(builder.get('attr_array'), dataset[:]) + + @unittest.skipIf(H5PY_3, "Create dataset differently for h5py < 3") + def test_build_3d_h5py_2_dataset(self): + bar_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='text', + name='data', + shape=(None, None, None), + attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')], + ) + ], + attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text', + shape=(None, None, None))], + ) + type_map = self.customSetUp(bar_spec) + type_map.register_map(Bar, BarMapper) + # create in-memory hdf5 file that is discarded after closing + with h5py.File("test.h5", "w", driver="core", backing_store=False) as f: + str_array_3d = np.array( + [[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]], + dtype=h5py.special_dtype(vlen=str) + ) + dataset = f.create_dataset('data', data=str_array_3d) + bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset) + builder = type_map.build(bar_inst) + np.testing.assert_array_equal(builder.get('data').data, dataset[:]) + np.testing.assert_array_equal(builder.get('attr_array'), dataset[:]) + def test_build_dataio(self): bar_spec = GroupSpec('A test group specification with a data type', data_type_def='Bar',