From 2b167aedc8a8f58afd75d3d0c750f6d620dc663d Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:48:42 -0700 Subject: [PATCH] Add support to write multidimensional string arrays (#1173) * add condition for multidim string arrays * add tests for multidim string array build * update condition when defining hdf5 dataset shape * add test to write multidim string array * update CHANGELOG.md * fix text decoding in test * add recursive string type for arrays of arbitrary dim * add test for compound data type with strings * add tests for multidim str attributes * fix line lengths * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update compound dtype test --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly --- CHANGELOG.md | 1 + src/hdmf/backends/hdf5/h5tools.py | 2 +- src/hdmf/build/objectmapper.py | 10 +- tests/unit/build_tests/test_classgenerator.py | 7 +- tests/unit/build_tests/test_io_map.py | 119 +++++++++++++++++- tests/unit/test_io_hdf5_h5tools.py | 27 +++- 6 files changed, 153 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1af6aab8..549eccc7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157) - Adjusted stacklevel of warnings to point to user code when possible. @rly [#1166](https://github.com/hdmf-dev/hdmf/pull/1166) - Improved "already exists" error message when adding a container to a `MultiContainerInterface`. @rly [#1165](https://github.com/hdmf-dev/hdmf/pull/1165) +- Added support to write multidimensional string arrays. @stephprince [#1173](https://github.com/hdmf-dev/hdmf/pull/1173) ## HDMF 3.14.3 (July 29, 2024) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index da07a6a5c..ffdc4eab6 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -1469,7 +1469,7 @@ def __list_fill__(cls, parent, name, data, options=None): data_shape = io_settings.pop('shape') elif hasattr(data, 'shape'): data_shape = data.shape - elif isinstance(dtype, np.dtype): + elif isinstance(dtype, np.dtype) and len(dtype) > 1: # check if compound dtype data_shape = (len(data),) else: data_shape = get_data_shape(data) diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index b5815ee2c..d6e1de15a 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -598,11 +598,17 @@ def __get_data_type(cls, spec): def __convert_string(self, value, spec): """Convert string types to the specified dtype.""" + def __apply_string_type(value, string_type): + if isinstance(value, (list, tuple, np.ndarray, DataIO)): + return [__apply_string_type(item, string_type) for item in value] + else: + return string_type(value) + ret = value if isinstance(spec, AttributeSpec): if 'text' in spec.dtype: if spec.shape is not None or spec.dims is not None: - ret = list(map(str, value)) + ret = __apply_string_type(value, str) else: ret = str(value) elif isinstance(spec, DatasetSpec): @@ -618,7 +624,7 @@ def string_type(x): return x.isoformat() # method works for both date and datetime if string_type is not None: if spec.shape is not None or spec.dims is not None: - ret = list(map(string_type, value)) + ret = __apply_string_type(value, string_type) else: ret = string_type(value) # copy over any I/O parameters if they were specified diff --git a/tests/unit/build_tests/test_classgenerator.py b/tests/unit/build_tests/test_classgenerator.py index 52fdc4839..3c9fda283 100644 --- a/tests/unit/build_tests/test_classgenerator.py +++ b/tests/unit/build_tests/test_classgenerator.py @@ -180,10 +180,11 @@ def test_dynamic_container_creation(self): baz_spec = GroupSpec('A test extension with no Container class', data_type_def='Baz', data_type_inc=self.bar_spec, attributes=[AttributeSpec('attr3', 'a float attribute', 'float'), - AttributeSpec('attr4', 'another float attribute', 'float')]) + AttributeSpec('attr4', 'another float attribute', 'float'), + AttributeSpec('attr_array', 'an array attribute', 'text', shape=(None,)),]) self.spec_catalog.register_spec(baz_spec, 'extension.yaml') cls = self.type_map.get_dt_container_cls('Baz', CORE_NAMESPACE) - expected_args = {'name', 'data', 'attr1', 'attr2', 'attr3', 'attr4', 'skip_post_init'} + expected_args = {'name', 'data', 'attr1', 'attr2', 'attr3', 'attr4', 'attr_array', 'skip_post_init'} received_args = set() for x in get_docval(cls.__init__): @@ -211,7 +212,7 @@ def test_dynamic_container_creation_defaults(self): AttributeSpec('attr4', 'another float attribute', 'float')]) self.spec_catalog.register_spec(baz_spec, 'extension.yaml') cls = self.type_map.get_dt_container_cls('Baz', CORE_NAMESPACE) - expected_args = {'name', 'data', 'attr1', 'attr2', 'attr3', 'attr4', 'foo', 'skip_post_init'} + expected_args = {'name', 'data', 'attr1', 'attr2', 'attr3', 'attr4', 'attr_array', 'foo', 'skip_post_init'} received_args = set(map(lambda x: x['name'], get_docval(cls.__init__))) self.assertSetEqual(expected_args, received_args) self.assertEqual(cls.__name__, 'Baz') diff --git a/tests/unit/build_tests/test_io_map.py b/tests/unit/build_tests/test_io_map.py index 63f397682..e095ef318 100644 --- a/tests/unit/build_tests/test_io_map.py +++ b/tests/unit/build_tests/test_io_map.py @@ -9,6 +9,7 @@ from hdmf.testing import TestCase from abc import ABCMeta, abstractmethod import unittest +import numpy as np from tests.unit.helpers.utils import CORE_NAMESPACE, create_test_type_map @@ -20,24 +21,27 @@ class Bar(Container): {'name': 'attr1', 'type': str, 'doc': 'an attribute'}, {'name': 'attr2', 'type': int, 'doc': 'another attribute'}, {'name': 'attr3', 'type': float, 'doc': 'a third attribute', 'default': 3.14}, + {'name': 'attr_array', 'type': 'array_data', 'doc': 'another attribute', 'default': (1, 2, 3)}, {'name': 'foo', 'type': 'Foo', 'doc': 'a group', 'default': None}) def __init__(self, **kwargs): - name, data, attr1, attr2, attr3, foo = getargs('name', 'data', 'attr1', 'attr2', 'attr3', 'foo', kwargs) + name, data, attr1, attr2, attr3, attr_array, foo = getargs('name', 'data', 'attr1', 'attr2', 'attr3', + 'attr_array', 'foo', kwargs) super().__init__(name=name) self.__data = data self.__attr1 = attr1 self.__attr2 = attr2 self.__attr3 = attr3 + self.__attr_array = attr_array self.__foo = foo if self.__foo is not None and self.__foo.parent is None: self.__foo.parent = self def __eq__(self, other): - attrs = ('name', 'data', 'attr1', 'attr2', 'attr3', 'foo') + attrs = ('name', 'data', 'attr1', 'attr2', 'attr3', 'attr_array', 'foo') return all(getattr(self, a) == getattr(other, a) for a in attrs) def __str__(self): - attrs = ('name', 'data', 'attr1', 'attr2', 'attr3', 'foo') + attrs = ('name', 'data', 'attr1', 'attr2', 'attr3', 'attr_array', 'foo') return ','.join('%s=%s' % (a, getattr(self, a)) for a in attrs) @property @@ -60,6 +64,10 @@ def attr2(self): def attr3(self): return self.__attr3 + @property + def attr_array(self): + return self.__attr_array + @property def foo(self): return self.__foo @@ -333,12 +341,15 @@ def test_build_1d(self): datasets=[DatasetSpec('an example dataset', 'text', name='data', shape=(None,), attributes=[AttributeSpec( 'attr2', 'an example integer attribute', 'int')])], - attributes=[AttributeSpec('attr1', 'an example string attribute', 'text')]) + attributes=[AttributeSpec('attr1', 'an example string attribute', 'text'), + AttributeSpec('attr_array', 'an example array attribute', 'text', + shape=(None,))]) type_map = self.customSetUp(bar_spec) type_map.register_map(Bar, BarMapper) - bar_inst = Bar('my_bar', ['a', 'b', 'c', 'd'], 'value1', 10) + bar_inst = Bar('my_bar', ['a', 'b', 'c', 'd'], 'value1', 10, attr_array=['a', 'b', 'c', 'd']) builder = type_map.build(bar_inst) - self.assertEqual(builder.get('data').data, ['a', 'b', 'c', 'd']) + np.testing.assert_array_equal(builder.get('data').data, np.array(['a', 'b', 'c', 'd'])) + np.testing.assert_array_equal(builder.get('attr_array'), np.array(['a', 'b', 'c', 'd'])) def test_build_scalar(self): bar_spec = GroupSpec('A test group specification with a data type', @@ -353,6 +364,102 @@ def test_build_scalar(self): builder = type_map.build(bar_inst) self.assertEqual(builder.get('data').data, "['a', 'b', 'c', 'd']") + def test_build_2d_lol(self): + bar_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='text', + name='data', + shape=(None, None), + attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')], + ) + ], + attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text', + shape=(None, None))], + ) + type_map = self.customSetUp(bar_spec) + type_map.register_map(Bar, BarMapper) + str_lol_2d = [['aa', 'bb'], ['cc', 'dd']] + bar_inst = Bar('my_bar', str_lol_2d, 'value1', 10, attr_array=str_lol_2d) + builder = type_map.build(bar_inst) + self.assertEqual(builder.get('data').data, str_lol_2d) + self.assertEqual(builder.get('attr_array'), str_lol_2d) + + def test_build_2d_ndarray(self): + bar_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='text', + name='data', + shape=(None, None), + attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')], + ) + ], + attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text', + shape=(None, None))], + ) + type_map = self.customSetUp(bar_spec) + type_map.register_map(Bar, BarMapper) + str_array_2d = np.array([['aa', 'bb'], ['cc', 'dd']]) + bar_inst = Bar('my_bar', str_array_2d, 'value1', 10, attr_array=str_array_2d) + builder = type_map.build(bar_inst) + np.testing.assert_array_equal(builder.get('data').data, str_array_2d) + np.testing.assert_array_equal(builder.get('attr_array'), str_array_2d) + + def test_build_3d_lol(self): + bar_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='text', + name='data', + shape=(None, None, None), + attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')], + ) + ], + attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text', + shape=(None, None, None))], + ) + type_map = self.customSetUp(bar_spec) + type_map.register_map(Bar, BarMapper) + str_lol_3d = [[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]] + bar_inst = Bar('my_bar', str_lol_3d, 'value1', 10, attr_array=str_lol_3d) + builder = type_map.build(bar_inst) + self.assertEqual(builder.get('data').data, str_lol_3d) + self.assertEqual(builder.get('attr_array'), str_lol_3d) + + def test_build_3d_ndarray(self): + bar_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='text', + name='data', + shape=(None, None, None), + attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')], + ) + ], + attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text', + shape=(None, None, None))], + ) + type_map = self.customSetUp(bar_spec) + type_map.register_map(Bar, BarMapper) + str_array_3d = np.array([[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]]) + bar_inst = Bar('my_bar', str_array_3d, 'value1', 10, attr_array=str_array_3d) + builder = type_map.build(bar_inst) + np.testing.assert_array_equal(builder.get('data').data, str_array_3d) + np.testing.assert_array_equal(builder.get('attr_array'), str_array_3d) + def test_build_dataio(self): bar_spec = GroupSpec('A test group specification with a data type', data_type_def='Bar', diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 5a4fd5a32..b004a6c54 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -24,7 +24,7 @@ from hdmf.data_utils import DataChunkIterator, GenericDataChunkIterator, InvalidDataIOError from hdmf.spec.catalog import SpecCatalog from hdmf.spec.namespace import NamespaceCatalog, SpecNamespace -from hdmf.spec.spec import GroupSpec +from hdmf.spec.spec import GroupSpec, DtypeSpec from hdmf.testing import TestCase, remove_test_file from hdmf.common.resources import HERD from hdmf.term_set import TermSet, TermSetWrapper @@ -164,6 +164,31 @@ def test_write_dataset_list(self): dset = self.f['test_dataset'] self.assertTrue(np.all(dset[:] == a)) + def test_write_dataset_lol_strings(self): + a = [['aa', 'bb'], ['cc', 'dd']] + self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, attributes={})) + dset = self.f['test_dataset'] + decoded_dset = [[item.decode('utf-8') if isinstance(item, bytes) else item for item in sublist] + for sublist in dset[:]] + self.assertTrue(decoded_dset == a) + + def test_write_dataset_list_compound_datatype(self): + a = np.array([(1, 2, 0.5), (3, 4, 0.5)], dtype=[('x', 'int'), ('y', 'int'), ('z', 'float')]) + dset_builder = DatasetBuilder( + name='test_dataset', + data=a.tolist(), + attributes={}, + dtype=[ + DtypeSpec('x', doc='x', dtype='int'), + DtypeSpec('y', doc='y', dtype='int'), + DtypeSpec('z', doc='z', dtype='float'), + ], + ) + self.io.write_dataset(self.f, dset_builder) + dset = self.f['test_dataset'] + for field in a.dtype.names: + self.assertTrue(np.all(dset[field][:] == a[field])) + def test_write_dataset_list_compress_gzip(self): a = H5DataIO(np.arange(30).reshape(5, 2, 3), compression='gzip',