Skip to content

Commit

Permalink
Add support to write multidimensional string arrays (#1173)
Browse files Browse the repository at this point in the history
* add condition for multidim string arrays

* add tests for multidim string array build

* update condition when defining hdf5 dataset shape

* add test to write multidim string array

* update CHANGELOG.md

* fix text decoding in test

* add recursive string type for arrays of arbitrary dim

* add test for compound data type with strings

* add tests for multidim str attributes

* fix line lengths

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update compound dtype test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ryan Ly <[email protected]>
  • Loading branch information
3 people authored Aug 21, 2024
1 parent d50db92 commit 2b167ae
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157)
- Adjusted stacklevel of warnings to point to user code when possible. @rly [#1166](https://github.com/hdmf-dev/hdmf/pull/1166)
- Improved "already exists" error message when adding a container to a `MultiContainerInterface`. @rly [#1165](https://github.com/hdmf-dev/hdmf/pull/1165)
- Added support to write multidimensional string arrays. @stephprince [#1173](https://github.com/hdmf-dev/hdmf/pull/1173)

## HDMF 3.14.3 (July 29, 2024)

Expand Down
2 changes: 1 addition & 1 deletion src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1469,7 +1469,7 @@ def __list_fill__(cls, parent, name, data, options=None):
data_shape = io_settings.pop('shape')
elif hasattr(data, 'shape'):
data_shape = data.shape
elif isinstance(dtype, np.dtype):
elif isinstance(dtype, np.dtype) and len(dtype) > 1: # check if compound dtype
data_shape = (len(data),)
else:
data_shape = get_data_shape(data)
Expand Down
10 changes: 8 additions & 2 deletions src/hdmf/build/objectmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,11 +598,17 @@ def __get_data_type(cls, spec):

def __convert_string(self, value, spec):
"""Convert string types to the specified dtype."""
def __apply_string_type(value, string_type):
    """Recursively apply ``string_type`` to every leaf of a nested container.

    Lists, tuples, numpy arrays and DataIO objects are iterated and rebuilt as
    (nested) Python lists; any other value is treated as a leaf and passed
    directly to ``string_type``.
    """
    if not isinstance(value, (list, tuple, np.ndarray, DataIO)):
        return string_type(value)
    return [__apply_string_type(element, string_type) for element in value]

ret = value
if isinstance(spec, AttributeSpec):
if 'text' in spec.dtype:
if spec.shape is not None or spec.dims is not None:
ret = list(map(str, value))
ret = __apply_string_type(value, str)
else:
ret = str(value)
elif isinstance(spec, DatasetSpec):
Expand All @@ -618,7 +624,7 @@ def string_type(x):
return x.isoformat() # method works for both date and datetime
if string_type is not None:
if spec.shape is not None or spec.dims is not None:
ret = list(map(string_type, value))
ret = __apply_string_type(value, string_type)
else:
ret = string_type(value)
# copy over any I/O parameters if they were specified
Expand Down
7 changes: 4 additions & 3 deletions tests/unit/build_tests/test_classgenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,11 @@ def test_dynamic_container_creation(self):
baz_spec = GroupSpec('A test extension with no Container class',
data_type_def='Baz', data_type_inc=self.bar_spec,
attributes=[AttributeSpec('attr3', 'a float attribute', 'float'),
AttributeSpec('attr4', 'another float attribute', 'float')])
AttributeSpec('attr4', 'another float attribute', 'float'),
AttributeSpec('attr_array', 'an array attribute', 'text', shape=(None,)),])
self.spec_catalog.register_spec(baz_spec, 'extension.yaml')
cls = self.type_map.get_dt_container_cls('Baz', CORE_NAMESPACE)
expected_args = {'name', 'data', 'attr1', 'attr2', 'attr3', 'attr4', 'skip_post_init'}
expected_args = {'name', 'data', 'attr1', 'attr2', 'attr3', 'attr4', 'attr_array', 'skip_post_init'}
received_args = set()

for x in get_docval(cls.__init__):
Expand Down Expand Up @@ -211,7 +212,7 @@ def test_dynamic_container_creation_defaults(self):
AttributeSpec('attr4', 'another float attribute', 'float')])
self.spec_catalog.register_spec(baz_spec, 'extension.yaml')
cls = self.type_map.get_dt_container_cls('Baz', CORE_NAMESPACE)
expected_args = {'name', 'data', 'attr1', 'attr2', 'attr3', 'attr4', 'foo', 'skip_post_init'}
expected_args = {'name', 'data', 'attr1', 'attr2', 'attr3', 'attr4', 'attr_array', 'foo', 'skip_post_init'}
received_args = set(map(lambda x: x['name'], get_docval(cls.__init__)))
self.assertSetEqual(expected_args, received_args)
self.assertEqual(cls.__name__, 'Baz')
Expand Down
119 changes: 113 additions & 6 deletions tests/unit/build_tests/test_io_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from hdmf.testing import TestCase
from abc import ABCMeta, abstractmethod
import unittest
import numpy as np

from tests.unit.helpers.utils import CORE_NAMESPACE, create_test_type_map

Expand All @@ -20,24 +21,27 @@ class Bar(Container):
{'name': 'attr1', 'type': str, 'doc': 'an attribute'},
{'name': 'attr2', 'type': int, 'doc': 'another attribute'},
{'name': 'attr3', 'type': float, 'doc': 'a third attribute', 'default': 3.14},
{'name': 'attr_array', 'type': 'array_data', 'doc': 'another attribute', 'default': (1, 2, 3)},
{'name': 'foo', 'type': 'Foo', 'doc': 'a group', 'default': None})
def __init__(self, **kwargs):
    """Store the constructor arguments on the instance.

    The values for 'name', 'data', 'attr1', 'attr2', 'attr3', 'attr_array'
    and 'foo' are pulled out of ``kwargs`` with ``getargs``. ``name`` is
    forwarded to the parent constructor; the rest are kept in private
    attributes exposed through the read-only properties of this class.
    """
    # Single unpack that includes 'attr_array'; the earlier six-name unpack
    # was immediately overwritten and has been dropped.
    name, data, attr1, attr2, attr3, attr_array, foo = getargs('name', 'data', 'attr1', 'attr2', 'attr3',
                                                               'attr_array', 'foo', kwargs)
    super().__init__(name=name)
    self.__data = data
    self.__attr1 = attr1
    self.__attr2 = attr2
    self.__attr3 = attr3
    self.__attr_array = attr_array
    self.__foo = foo
    # Only claim the child as ours when it does not already have a parent.
    if self.__foo is not None and self.__foo.parent is None:
        self.__foo.parent = self

def __eq__(self, other):
    """Return True when every compared attribute is equal on ``self`` and ``other``.

    The compared attributes are 'name', 'data', 'attr1', 'attr2', 'attr3',
    'attr_array' and 'foo'; each pair is compared with ``==``.
    """
    # The shorter tuple without 'attr_array' was a dead assignment and is removed.
    attrs = ('name', 'data', 'attr1', 'attr2', 'attr3', 'attr_array', 'foo')
    return all(getattr(self, a) == getattr(other, a) for a in attrs)

def __str__(self):
    """Return a comma-separated ``attribute=value`` listing of this object.

    The listed attributes are 'name', 'data', 'attr1', 'attr2', 'attr3',
    'attr_array' and 'foo', in that order.
    """
    # The shorter tuple without 'attr_array' was a dead assignment and is removed.
    attrs = ('name', 'data', 'attr1', 'attr2', 'attr3', 'attr_array', 'foo')
    return ','.join('%s=%s' % (a, getattr(self, a)) for a in attrs)

@property
Expand All @@ -60,6 +64,10 @@ def attr2(self):
def attr3(self):
return self.__attr3

# Read-only accessor for the private attribute that __init__ assigns from its
# 'attr_array' argument.
@property
def attr_array(self):
return self.__attr_array

# Read-only accessor for the private attribute that __init__ assigns from its
# 'foo' argument (may be None).
@property
def foo(self):
return self.__foo
Expand Down Expand Up @@ -333,12 +341,15 @@ def test_build_1d(self):
datasets=[DatasetSpec('an example dataset', 'text', name='data', shape=(None,),
attributes=[AttributeSpec(
'attr2', 'an example integer attribute', 'int')])],
attributes=[AttributeSpec('attr1', 'an example string attribute', 'text')])
attributes=[AttributeSpec('attr1', 'an example string attribute', 'text'),
AttributeSpec('attr_array', 'an example array attribute', 'text',
shape=(None,))])
type_map = self.customSetUp(bar_spec)
type_map.register_map(Bar, BarMapper)
bar_inst = Bar('my_bar', ['a', 'b', 'c', 'd'], 'value1', 10)
bar_inst = Bar('my_bar', ['a', 'b', 'c', 'd'], 'value1', 10, attr_array=['a', 'b', 'c', 'd'])
builder = type_map.build(bar_inst)
self.assertEqual(builder.get('data').data, ['a', 'b', 'c', 'd'])
np.testing.assert_array_equal(builder.get('data').data, np.array(['a', 'b', 'c', 'd']))
np.testing.assert_array_equal(builder.get('attr_array'), np.array(['a', 'b', 'c', 'd']))

def test_build_scalar(self):
bar_spec = GroupSpec('A test group specification with a data type',
Expand All @@ -353,6 +364,102 @@ def test_build_scalar(self):
builder = type_map.build(bar_inst)
self.assertEqual(builder.get('data').data, "['a', 'b', 'c', 'd']")

def test_build_2d_lol(self):
    """Build a Bar whose text dataset and text attribute are 2D lists of lists.

    The built dataset data and the value returned for 'attr_array' must both
    equal the nested list that was passed in.
    """
    attr2_spec = AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')
    data_spec = DatasetSpec(
        doc='an example dataset',
        dtype='text',
        name='data',
        shape=(None, None),
        attributes=[attr2_spec],
    )
    attr_array_spec = AttributeSpec(
        name='attr_array',
        doc='an example array attribute',
        dtype='text',
        shape=(None, None),
    )
    group_spec = GroupSpec(
        doc='A test group specification with a data type',
        data_type_def='Bar',
        datasets=[data_spec],
        attributes=[attr_array_spec],
    )
    type_map = self.customSetUp(group_spec)
    type_map.register_map(Bar, BarMapper)

    strings = [['aa', 'bb'], ['cc', 'dd']]
    container = Bar('my_bar', strings, 'value1', 10, attr_array=strings)
    built = type_map.build(container)

    self.assertEqual(built.get('data').data, strings)
    self.assertEqual(built.get('attr_array'), strings)

def test_build_2d_ndarray(self):
    """Build a Bar whose text dataset and text attribute are 2D numpy string arrays.

    The built dataset data and the value returned for 'attr_array' are
    compared element-wise against the input array.
    """
    attr2_spec = AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')
    data_spec = DatasetSpec(
        doc='an example dataset',
        dtype='text',
        name='data',
        shape=(None, None),
        attributes=[attr2_spec],
    )
    attr_array_spec = AttributeSpec(
        name='attr_array',
        doc='an example array attribute',
        dtype='text',
        shape=(None, None),
    )
    group_spec = GroupSpec(
        doc='A test group specification with a data type',
        data_type_def='Bar',
        datasets=[data_spec],
        attributes=[attr_array_spec],
    )
    type_map = self.customSetUp(group_spec)
    type_map.register_map(Bar, BarMapper)

    strings = np.array([['aa', 'bb'], ['cc', 'dd']])
    container = Bar('my_bar', strings, 'value1', 10, attr_array=strings)
    built = type_map.build(container)

    np.testing.assert_array_equal(built.get('data').data, strings)
    np.testing.assert_array_equal(built.get('attr_array'), strings)

def test_build_3d_lol(self):
    """Build a Bar whose text dataset and text attribute are 3D nested lists.

    The built dataset data and the value returned for 'attr_array' must both
    equal the nested list that was passed in.
    """
    attr2_spec = AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')
    data_spec = DatasetSpec(
        doc='an example dataset',
        dtype='text',
        name='data',
        shape=(None, None, None),
        attributes=[attr2_spec],
    )
    attr_array_spec = AttributeSpec(
        name='attr_array',
        doc='an example array attribute',
        dtype='text',
        shape=(None, None, None),
    )
    group_spec = GroupSpec(
        doc='A test group specification with a data type',
        data_type_def='Bar',
        datasets=[data_spec],
        attributes=[attr_array_spec],
    )
    type_map = self.customSetUp(group_spec)
    type_map.register_map(Bar, BarMapper)

    strings = [[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]]
    container = Bar('my_bar', strings, 'value1', 10, attr_array=strings)
    built = type_map.build(container)

    self.assertEqual(built.get('data').data, strings)
    self.assertEqual(built.get('attr_array'), strings)

def test_build_3d_ndarray(self):
    """Build a Bar whose text dataset and text attribute are 3D numpy string arrays.

    The built dataset data and the value returned for 'attr_array' are
    compared element-wise against the input array.
    """
    attr2_spec = AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')
    data_spec = DatasetSpec(
        doc='an example dataset',
        dtype='text',
        name='data',
        shape=(None, None, None),
        attributes=[attr2_spec],
    )
    attr_array_spec = AttributeSpec(
        name='attr_array',
        doc='an example array attribute',
        dtype='text',
        shape=(None, None, None),
    )
    group_spec = GroupSpec(
        doc='A test group specification with a data type',
        data_type_def='Bar',
        datasets=[data_spec],
        attributes=[attr_array_spec],
    )
    type_map = self.customSetUp(group_spec)
    type_map.register_map(Bar, BarMapper)

    strings = np.array([[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]])
    container = Bar('my_bar', strings, 'value1', 10, attr_array=strings)
    built = type_map.build(container)

    np.testing.assert_array_equal(built.get('data').data, strings)
    np.testing.assert_array_equal(built.get('attr_array'), strings)

def test_build_dataio(self):
bar_spec = GroupSpec('A test group specification with a data type',
data_type_def='Bar',
Expand Down
27 changes: 26 additions & 1 deletion tests/unit/test_io_hdf5_h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from hdmf.data_utils import DataChunkIterator, GenericDataChunkIterator, InvalidDataIOError
from hdmf.spec.catalog import SpecCatalog
from hdmf.spec.namespace import NamespaceCatalog, SpecNamespace
from hdmf.spec.spec import GroupSpec
from hdmf.spec.spec import GroupSpec, DtypeSpec
from hdmf.testing import TestCase, remove_test_file
from hdmf.common.resources import HERD
from hdmf.term_set import TermSet, TermSetWrapper
Expand Down Expand Up @@ -164,6 +164,31 @@ def test_write_dataset_list(self):
dset = self.f['test_dataset']
self.assertTrue(np.all(dset[:] == a))

def test_write_dataset_lol_strings(self):
    """Write a 2D list of lists of strings as a dataset and check the values read back."""
    a = [['aa', 'bb'], ['cc', 'dd']]
    self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, attributes={}))
    dset = self.f['test_dataset']
    # Items read back may be bytes; decode those so they compare equal to the str input.
    decoded_dset = [[item.decode('utf-8') if isinstance(item, bytes) else item for item in sublist]
                    for sublist in dset[:]]
    # assertEqual shows the differing elements on failure; assertTrue(x == y) only reports "False is not true".
    self.assertEqual(decoded_dset, a)

def test_write_dataset_list_compound_datatype(self):
    """Write a list of tuples with a three-field compound dtype and check each field.

    The data is passed to the builder as a plain list (``tolist()``) together
    with DtypeSpec entries for 'x', 'y' and 'z'; every field of the written
    dataset is then compared with the matching field of the source array.
    """
    expected = np.array([(1, 2, 0.5), (3, 4, 0.5)], dtype=[('x', 'int'), ('y', 'int'), ('z', 'float')])
    rows = expected.tolist()
    field_specs = [
        DtypeSpec('x', doc='x', dtype='int'),
        DtypeSpec('y', doc='y', dtype='int'),
        DtypeSpec('z', doc='z', dtype='float'),
    ]
    builder = DatasetBuilder(name='test_dataset', data=rows, attributes={}, dtype=field_specs)
    self.io.write_dataset(self.f, builder)

    written = self.f['test_dataset']
    for field_name in expected.dtype.names:
        matches = written[field_name][:] == expected[field_name]
        self.assertTrue(np.all(matches))

def test_write_dataset_list_compress_gzip(self):
a = H5DataIO(np.arange(30).reshape(5, 2, 3),
compression='gzip',
Expand Down

0 comments on commit 2b167ae

Please sign in to comment.