diff --git a/CHANGELOG.md b/CHANGELOG.md
index c36c903c..8769af4f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,14 @@
 # HDMF-ZARR Changelog
 
-## 0.7.0 (May 2, 2024)
+## 0.9.0 (Upcoming)
+### Enhancements
+* Added support for appending to a dataset of references. @mavaylon1 [#203](https://github.com/hdmf-dev/hdmf-zarr/pull/203)
+* `NWBZarrIO` now uses `load_namespaces=True` by default. @mavaylon1 [#204](https://github.com/hdmf-dev/hdmf-zarr/pull/204)
+* Added a test for opening a file with consolidated metadata from DANDI. @mavaylon1 [#206](https://github.com/hdmf-dev/hdmf-zarr/pull/206)
+
+## 0.8.0 (June 4, 2024)
 ### Bug Fixes
+* Fixed a bug when opening a file with `mode=r+`: the file now opens without using the consolidated metadata. @mavaylon1 [#182](https://github.com/hdmf-dev/hdmf-zarr/issues/182)
 * Fixed bug on how we access scalar arrays. Added warning filter for Zarr deprecation of NestedDirectoryStore. Fixed bug on how we write a dataset of references. @mavaylon1 [#195](https://github.com/hdmf-dev/hdmf-zarr/pull/195)
 
 ## 0.7.0 (May 2, 2024)
diff --git a/docs/source/integrating_data_stores.rst b/docs/source/integrating_data_stores.rst
index d7573e73..f5a8c481 100644
--- a/docs/source/integrating_data_stores.rst
+++ b/docs/source/integrating_data_stores.rst
@@ -36,7 +36,7 @@ Updating ZarrIO
    :py:meth:`~hdmf_zarr.backend.ZarrIO.get_builder_exists_on_disk`
    method may need to be updated to ensure references are opened correctly on read for files
    stored with your new store. The
-   :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` function may also need to be updated, in
+   :py:meth:`~hdmf_zarr.backend.ZarrIO._create_ref` function may also need to be updated, in
    particular in case the links to your store also modify the storage schema for links
    (e.g., if you need to store additional metadata in order to resolve links to your store).
 
diff --git a/docs/source/storage.rst b/docs/source/storage.rst
index f906f159..864fe420 100644
--- a/docs/source/storage.rst
+++ b/docs/source/storage.rst
@@ -217,7 +217,7 @@ For example:
 
 In :py:class:`~hdmf_zarr.backend.ZarrIO`, links are written by the
 :py:meth:`~hdmf_zarr.backend.ZarrIO.__write_link__` function, which also uses the helper functions
-i) :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` to construct py:meth:`~hdmf_zarr.utils.ZarrRefernce`
+i) :py:meth:`~hdmf_zarr.backend.ZarrIO._create_ref` to construct :py:class:`~hdmf_zarr.utils.ZarrReference`
 and ii) :py:meth:`~hdmf_zarr.backend.ZarrIO.__add_link__` to add a link to the Zarr file.
 :py:meth:`~hdmf_zarr.backend.ZarrIO.__read_links` then parses links and also uses the
 :py:meth:`~hdmf_zarr.backend.ZarrIO.__resolve_ref` helper function to resolve the paths stored in links.
@@ -245,7 +245,7 @@ by their location (i.e., index) in the dataset. As such, object references only
 the relative path to the target Zarr file, and the ``path`` identifying the object within the source
 Zarr file. The individual object references are defined in the
 :py:class:`~hdmf_zarr.backend.ZarrIO` as py:class:`~hdmf_zarr.utils.ZarrReference` object created via
-the :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` helper function.
+the :py:meth:`~hdmf_zarr.backend.ZarrIO._create_ref` helper function.
 By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.pickles.Pickle`` codec to encode
 object references defined as py:class:`~hdmf_zarr.utils.ZarrReference` dicts in datasets.
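
Note: as described in the hunk above, each object reference is stored as a small dict. A minimal sketch of constructing one directly (the key names follow my reading of the `hdmf_zarr.utils.ZarrReference` constructor; the values are illustrative placeholders, not taken from this changeset):

    from hdmf_zarr.utils import ZarrReference

    # A reference records the file containing the target, the target's path
    # within that file, and the object IDs used to validate the link on read.
    ref = ZarrReference(
        source=".",                        # Zarr file containing the target
        path="/bazs/baz0",                 # path of the target within that file
        object_id="<id-of-target>",        # illustrative placeholder
        source_object_id="<id-of-root>",   # illustrative placeholder
    )
    print(dict(ref))  # serialized into datasets via the numcodecs Pickle codec
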
@@ -297,7 +297,7 @@ store the definition of the ``region`` that is being referenced, e.g., a slice o
 
 To implement region references will require updating:
 1) py:class:`~hdmf_zarr.utils.ZarrReference` to add a ``region`` key to support storing the selection for the region,
-2) :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` to support passing in the region definition to
+2) :py:meth:`~hdmf_zarr.backend.ZarrIO._create_ref` to support passing in the region definition to
    be added to the py:class:`~hdmf_zarr.utils.ZarrReference`,
 3) :py:meth:`~hdmf_zarr.backend.ZarrIO.write_dataset` already partially implements the required logic
    for creating region references by checking for :py:class:`hdmf.build.RegionBuilder` inputs
diff --git a/pyproject.toml b/pyproject.toml
index 56c63b7e..7c029a6e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,8 +29,8 @@ classifiers = [
   "Topic :: Scientific/Engineering :: Medical Science Apps."
 ]
 dependencies = [
-    'hdmf>=3.9.0',
-    'zarr>=2.11.0',
+    'hdmf>=3.14.2',
+    'zarr>=2.11.0, <3.0',  # pin below 3.0 until HDMF-zarr supports zarr 3.0
     'numpy>=1.24, <2.0',  # pin below 2.0 until HDMF supports numpy 2.0
     'numcodecs>=0.9.1',
     'pynwb>=2.5.0',
@@ -109,7 +109,7 @@ force-exclude = '''
 '''
 
 [tool.ruff]
-select = ["E", "F", "T100", "T201", "T203"]
+lint.select = ["E", "F", "T100", "T201", "T203"]
 exclude = [
     ".git",
     ".tox",
@@ -128,5 +128,5 @@ line-length = 120
 "src/*/__init__.py" = ["F401"]
 "test_gallery.py" = ["T201"]
 
-[tool.ruff.mccabe]
+[tool.ruff.lint.mccabe]
 max-complexity = 17
diff --git a/requirements-min.txt b/requirements-min.txt
index b5a13e58..02f97f1d 100644
--- a/requirements-min.txt
+++ b/requirements-min.txt
@@ -1,4 +1,4 @@
-hdmf==3.12.0
+hdmf==3.14.2
 zarr==2.11.0
 numcodecs==0.9.1
 pynwb==2.5.0
diff --git a/requirements-opt.txt b/requirements-opt.txt
index 13d8f5e2..771f5d04 100644
--- a/requirements-opt.txt
+++ b/requirements-opt.txt
@@ -1,3 +1,3 @@
-tqdm==4.66.1
-fsspec==2023.12.2
-s3fs==2023.12.2
+tqdm==4.66.4
+fsspec==2024.6.0
+s3fs==2024.6.0
diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index 2b8af14c..953ae286 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -160,7 +160,10 @@ def open(self):
         if self.__file is None:
             # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'.
             # As a result, when in other modes, the file will not use consolidated metadata.
-            if self.__mode not in ['r', 'r+']:
+            if self.__mode != 'r':
+                # When we consolidate metadata, Zarr uses a ConsolidatedMetadataStore, which
+                # does not allow setting items and whose docstring describes it as read-only.
+                # As a result, we cannot use mode 'r+' with consolidated metadata either.
                 # r- is only an internal mode in ZarrIO to force the use of regular open. For Zarr we need to
                 # use the regular mode r when r- is specified
                 mode_to_use = self.__mode if self.__mode != 'r-' else 'r'
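
The `open()` change above routes only `mode='r'` through consolidated metadata. A minimal sketch of the underlying zarr 2.x behavior that motivates this (store path is illustrative):

    import zarr

    store = zarr.DirectoryStore("example.zarr")
    root = zarr.group(store=store, overwrite=True)
    root.create_dataset("data", data=[1, 2, 3])
    zarr.consolidate_metadata(store)  # writes the .zmetadata key

    consolidated = zarr.open_consolidated(store, mode="r")
    print(type(consolidated.store))   # zarr.storage.ConsolidatedMetadataStore
    # ConsolidatedMetadataStore rejects item writes, so any writable mode
    # (including 'r+') has to fall back to the regular zarr.open instead.
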
@@ -194,13 +197,17 @@ def is_remote(self):
             {'name': 'path',
              'type': (str, *SUPPORTED_ZARR_STORES),
              'doc': 'the path to the Zarr file or a supported Zarr store'},
-            {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None})
-    def load_namespaces(cls, namespace_catalog, path, namespaces=None):
+            {'name': 'storage_options', 'type': dict,
+             'doc': 'Zarr storage options to read remote folders',
+             'default': None},
+            {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None}
+            )
+    def load_namespaces(cls, namespace_catalog, path, storage_options, namespaces=None):
         '''
         Load cached namespaces from a file.
         '''
         # TODO: how to use storage_options here?
-        f = zarr.open(path, mode='r')
+        f = zarr.open(path, mode='r', storage_options=storage_options)
         if SPEC_LOC_ATTR not in f.attrs:
             msg = "No cached namespaces found in %s" % path
             warnings.warn(msg)
@@ -588,13 +595,13 @@ def write_attributes(self, **kwargs):
                 # TODO: Region References are not yet supported
                 # if isinstance(value, RegionBuilder):
                 #     type_str = 'region'
-                #     refs = self.__get_ref(value.builder)
+                #     refs = self._create_ref(value.builder)
                 if isinstance(value, (ReferenceBuilder, Container, Builder)):
                     type_str = 'object'
                     if isinstance(value, Builder):
-                        refs = self.__get_ref(value, export_source)
+                        refs = self._create_ref(value, export_source)
                     else:
-                        refs = self.__get_ref(value.builder, export_source)
+                        refs = self._create_ref(value.builder, export_source)
                     tmp = {'zarr_dtype': type_str, 'value': refs}
                     obj.attrs[key] = tmp
             # Case 3: Scalar attributes
@@ -736,7 +743,7 @@ def resolve_ref(self, zarr_ref):
         # Return the create path
         return target_name, target_zarr_obj
 
-    def __get_ref(self, ref_object, export_source=None):
+    def _create_ref(self, ref_object, export_source=None):
         """
         Create a ZarrReference object that points to the given container
@@ -755,6 +762,7 @@
             builder = ref_object.builder
         else:
             builder = self.manager.build(ref_object)
+        path = self.__get_path(builder)
         # TODO Add to get region for region references.
         # Also add {'name': 'region', 'type': (slice, list, tuple),
@@ -833,7 +841,7 @@ def write_link(self, **kwargs):
         name = builder.name
         target_builder = builder.builder
         # Get the reference
-        zarr_ref = self.__get_ref(target_builder)
+        zarr_ref = self._create_ref(target_builder)
         # EXPORT WITH LINKS: Fix link source
         # if the target and source are both the same, then we need to ALWAYS use ourselves as a source
         # When exporting from one source to another, the LinkBuilders.source are not updated, i.e,. the
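
The `__get_ref` → `_create_ref` rename above is not cosmetic: double-underscore attributes are name-mangled per class, so code outside `ZarrIO` (such as the `append()` method added to the dataset wrapper later in this diff) could not call the old name on a `ZarrIO` instance. A self-contained sketch of the pitfall, with hypothetical class names:

    class Backend:
        def __get_ref(self):    # mangled to _Backend__get_ref
            return "ref"

        def _create_ref(self):  # single underscore: no mangling
            return "ref"

    class DatasetWrapper:
        def __init__(self, io):
            self.io = io

        def append_ref(self):
            # self.io.__get_ref() would raise AttributeError here: inside this
            # class it is rewritten to self.io._DatasetWrapper__get_ref.
            return self.io._create_ref()

    assert DatasetWrapper(Backend()).append_ref() == "ref"
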
@@ -1004,7 +1012,7 @@ def write_dataset(self, **kwargs):  # noqa: C901
         elif isinstance(data, HDMFDataset):
             # If we have a dataset of containers we need to make the references to the containers
             if len(data) > 0 and isinstance(data[0], Container):
-                ref_data = [self.__get_ref(data[i], export_source=export_source) for i in range(len(data))]
+                ref_data = [self._create_ref(data[i], export_source=export_source) for i in range(len(data))]
                 shape = (len(data), )
                 type_str = 'object'
                 dset = parent.require_dataset(name,
@@ -1037,7 +1045,7 @@ def write_dataset(self, **kwargs):  # noqa: C901
             for i, dts in enumerate(options['dtype']):
                 if self.__is_ref(dts['dtype']):
                     refs.append(i)
-                    ref_tmp = self.__get_ref(data[0][i], export_source=export_source)
+                    ref_tmp = self._create_ref(data[0][i], export_source=export_source)
                     if isinstance(ref_tmp, ZarrReference):
                         dts_str = 'object'
                     else:
@@ -1057,7 +1065,7 @@ def write_dataset(self, **kwargs):  # noqa: C901
                 for j, item in enumerate(data):
                     new_item = list(item)
                     for i in refs:
-                        new_item[i] = self.__get_ref(item[i], export_source=export_source)
+                        new_item[i] = self._create_ref(item[i], export_source=export_source)
                     new_items.append(tuple(new_item))
 
             # Create dtype for storage, replacing values to match hdmf's hdf5 behavior
@@ -1102,20 +1110,20 @@ def write_dataset(self, **kwargs):  # noqa: C901
             # if isinstance(data, RegionBuilder):
             #     shape = (1,)
             #     type_str = 'region'
-            #     refs = self.__get_ref(data.builder, data.region)
+            #     refs = self._create_ref(data.builder, data.region)
             if isinstance(data, ReferenceBuilder):
                 shape = (1,)
                 type_str = 'object'
-                refs = self.__get_ref(data.builder, export_source=export_source)
+                refs = self._create_ref(data.builder, export_source=export_source)
             # TODO: Region References are not yet supported
             # elif options['dtype'] == 'region':
             #     shape = (len(data), )
             #     type_str = 'region'
-            #     refs = [self.__get_ref(item.builder, item.region) for item in data]
+            #     refs = [self._create_ref(item.builder, item.region) for item in data]
             else:
                 shape = (len(data), )
                 type_str = 'object'
-                refs = [self.__get_ref(item, export_source=export_source) for item in data]
+                refs = [self._create_ref(item, export_source=export_source) for item in data]
 
             dset = parent.require_dataset(name,
                                           shape=shape,
@@ -1305,7 +1313,7 @@ def __list_fill__(self, parent, name, data, options=None):  # noqa: C901
             dset.attrs['zarr_dtype'] = type_str
 
         # Write the data to file
-        if dtype == object:
+        if dtype == object:  # noqa: E721
             for c in np.ndindex(data_shape):
                 o = data
                 for i in c:
@@ -1339,7 +1347,7 @@ def __scalar_fill__(self, parent, name, data, options=None):
         except Exception as exc:
             msg = 'cannot add %s to %s - could not determine type' % (name, parent.name)
             raise Exception(msg) from exc
-        if dtype == object:
+        if dtype == object:  # noqa: E721
             io_settings['object_codec'] = self.__codec_cls()
         dset = parent.require_dataset(name, shape=(1, ), dtype=dtype, **io_settings)
@@ -1483,6 +1491,7 @@ def __read_dataset(self, zarr_obj, name):
         # Read scalar dataset
         if dtype == 'scalar':
             data = zarr_obj[()]
+
         if isinstance(dtype, list):
             # Check compound dataset where one of the subsets contains references
             has_reference = False
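
The two `# noqa: E721` markers above silence ruff's "do not compare types" rule on purpose: `dtype` can be a NumPy dtype, which compares equal to the builtin `object` under `==` but fails the identity check E721 would suggest. A quick sketch:

    import numpy as np

    dtype = np.dtype(object)
    print(dtype == object)  # True  -- a NumPy object dtype equals the builtin type
    print(dtype is object)  # False -- `is` would miss it and skip the object-codec path
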
diff --git a/src/hdmf_zarr/nwb.py b/src/hdmf_zarr/nwb.py
index ed3d831c..582f1a63 100644
--- a/src/hdmf_zarr/nwb.py
+++ b/src/hdmf_zarr/nwb.py
@@ -22,7 +22,7 @@ class NWBZarrIO(ZarrIO):
     @docval(*get_docval(ZarrIO.__init__),
             {'name': 'load_namespaces', 'type': bool,
              'doc': 'whether or not to load cached namespaces from given path - not applicable in write mode',
-             'default': False},
+             'default': True},
             {'name': 'extensions', 'type': (str, TypeMap, list),
              'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps',
              'default': None})
@@ -30,17 +30,14 @@ def __init__(self, **kwargs):
         path, mode, manager, extensions, load_namespaces, synchronizer, storage_options = \
             popargs('path', 'mode', 'manager', 'extensions', 'load_namespaces',
                     'synchronizer', 'storage_options', kwargs)
 
-        if load_namespaces:
-            if manager is not None:
-                warn("loading namespaces from file - ignoring 'manager'")
-            if extensions is not None:
-                warn("loading namespaces from file - ignoring 'extensions' argument")
-            # namespaces are not loaded when creating an NWBZarrIO object in write mode
-            if 'w' in mode or mode == 'x':
-                raise ValueError("cannot load namespaces from file when writing to it")
+        io_modes_that_create_file = ['w', 'w-', 'x']
+        if mode in io_modes_that_create_file or manager is not None or extensions is not None:
+            load_namespaces = False
+
+        if load_namespaces:
             tm = get_type_map()
-            super(NWBZarrIO, self).load_namespaces(tm, path)
+            super(NWBZarrIO, self).load_namespaces(tm, path, storage_options)
             manager = BuildManager(tm)
         else:
             if manager is not None and extensions is not None:
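
Net effect of the `NWBZarrIO` changes: cached namespaces now load by default on read, loading is skipped for file-creating modes or when a `manager`/`extensions` argument is supplied, and `storage_options` is forwarded when the namespaces are read from a remote store. A sketch of the resulting call (URL borrowed from the tests below):

    from hdmf_zarr import NWBZarrIO

    url = "https://dandiarchive.s3.amazonaws.com/zarr/ccefbc9f-30e7-4a4c-b044-5b59d300040b/"
    # No extensions=... needed: cached namespaces are loaded automatically.
    with NWBZarrIO(url, mode="r") as io:
        nwbfile = io.read()
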
diff --git a/src/hdmf_zarr/zarr_utils.py b/src/hdmf_zarr/zarr_utils.py
index b9717c09..d1e4b117 100644
--- a/src/hdmf_zarr/zarr_utils.py
+++ b/src/hdmf_zarr/zarr_utils.py
@@ -10,6 +10,7 @@
 from zarr import Array as ZarrArray
 
 from hdmf.build import DatasetBuilder
+from hdmf.data_utils import append_data
 from hdmf.array import Array
 from hdmf.query import HDMFDataset, ReferenceResolver, ContainerResolver, BuilderResolver
 from hdmf.utils import docval, popargs, get_docval
@@ -73,6 +74,29 @@ def __iter__(self):
     def __next__(self):
         return self._get_ref(super().__next__())
 
+    def append(self, arg):
+        # Build the root parent first. (Doing so correctly sets the parent of the child
+        # builder, which is needed to create the reference.)
+        # Note: If arg is a nested child such that objB is the parent of arg and objA is the
+        # parent of objB (and objA is not the root), then objA must already have been added
+        # to the root as a child. Otherwise, the loop below will use objA as the root. That
+        # might not raise an error (the path could even be correct), but it can produce an
+        # incorrect path for the reference; objA must not be an orphaned container.
+        child = arg
+        while True:
+            if child.parent is not None:
+                parent = child.parent
+                child = parent
+            else:
+                parent = child
+                break
+        self.io.manager.build(parent)
+        builder = self.io.manager.build(arg)
+
+        # Create a ZarrReference and append it to the underlying dataset
+        ref = self.io._create_ref(builder)
+        append_data(self.dataset, ref)
+
 
 class BuilderResolverMixin(BuilderResolver):  # refactor to backend/utils.py
     """
diff --git a/tests/unit/test_fsspec_streaming.py b/tests/unit/test_fsspec_streaming.py
index 6fe71e0a..cbb0bf06 100644
--- a/tests/unit/test_fsspec_streaming.py
+++ b/tests/unit/test_fsspec_streaming.py
@@ -2,6 +2,8 @@
 from hdmf_zarr import NWBZarrIO
 from .utils import check_s3fs_ffspec_installed
 
+import zarr
+
 HAVE_FSSPEC = check_s3fs_ffspec_installed()
 
@@ -24,3 +26,16 @@ def test_fsspec_streaming(self):
         self.assertEqual(len(nwbfile.electrode_groups), 2)
         self.assertEqual(len(nwbfile.electrodes), 1152)
         self.assertEqual(nwbfile.institution, "AIND")
+
+    @unittest.skipIf(not HAVE_FSSPEC, "fsspec not installed")
+    def test_s3_open_with_consolidated_(self):
+        """
+        Test opening a Zarr file on S3 that has consolidated metadata.
+        """
+        s3_path = "https://dandiarchive.s3.amazonaws.com/zarr/ccefbc9f-30e7-4a4c-b044-5b59d300040b/"
+        with NWBZarrIO(s3_path, mode='r') as read_io:
+            read_io.open()
+            self.assertIsInstance(read_io.file.store, zarr.storage.ConsolidatedMetadataStore)
+        with NWBZarrIO(s3_path, mode='r-') as read_io:
+            read_io.open()
+            self.assertIsInstance(read_io.file.store, zarr.storage.FSStore)
diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py
index 081706a8..174effb2 100644
--- a/tests/unit/test_zarrio.py
+++ b/tests/unit/test_zarrio.py
@@ -16,9 +16,12 @@
 from zarr.storage import (DirectoryStore,
                           TempStore,
                           NestedDirectoryStore)
+from tests.unit.utils import (Baz, BazData, BazBucket, get_baz_buildmanager)
 import zarr
 from hdmf_zarr.backend import ZarrIO
 import os
+import shutil
+import warnings
 
 CUR_DIR = os.path.dirname(os.path.realpath(__file__))
@@ -182,3 +185,50 @@ def test_force_open_without_consolidated_fails(self):
         except ValueError as e:
             self.fail("ZarrIO.__open_file_consolidated raised an unexpected ValueError: {}".format(e))
 
+
+class TestDatasetofReferences(ZarrStoreTestCase):
+    def setUp(self):
+        self.store_path = "test_io.zarr"
+        self.store = DirectoryStore(self.store_path)
+
+    def tearDown(self):
+        """
+        Remove all files and folders defined by self.store_path
+        """
+        paths = self.store_path if isinstance(self.store_path, list) else [self.store_path, ]
+        for path in paths:
+            if os.path.exists(path):
+                if os.path.isdir(path):
+                    shutil.rmtree(path)
+                elif os.path.isfile(path):
+                    os.remove(path)
+                else:
+                    warnings.warn("Could not remove: %s" % path)
+
+    def test_append_references(self):
+        # Set up a file container with a dataset of references
+        num_bazs = 10
+        bazs = []  # elements that the dataset of references points to
+        for i in range(num_bazs):
+            bazs.append(Baz(name='baz%d' % i))
+        baz_data = BazData(name='baz_data', data=bazs)
+        container = BazBucket(bazs=bazs, baz_data=baz_data)
+        manager = get_baz_buildmanager()
+
+        with ZarrIO(self.store, manager=manager, mode='w') as writer:
+            writer.write(container=container)
+
+        with ZarrIO(self.store, manager=manager, mode='a') as append_io:
+            read_container = append_io.read()
+            new_baz = Baz(name='new')
+            read_container.add_baz(new_baz)
+
+            DoR = read_container.baz_data.data  # the dataset of references
+            DoR.append(new_baz)
+
+            append_io.write(read_container)
+
+        with ZarrIO(self.store, manager=manager, mode='r') as append_io:
+            read_container = append_io.read()
+            self.assertEqual(len(read_container.baz_data.data), 11)
+            self.assertIs(read_container.baz_data.data[10], read_container.bazs["new"])
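
A possible follow-up check for the test above, reading the raw Zarr array to see what `append()` wrote: the dataset simply grows by one pickled reference dict. The in-file path of `baz_data` and the dict keys are assumptions based on the storage layout described in docs/source/storage.rst:

    import zarr

    root = zarr.open("test_io.zarr", mode="r")
    refs = root["baz_data"][:]   # Pickle-decoded ZarrReference dicts
    print(len(refs))             # 11 after the append
    print(refs[10]["path"])      # path pointing at the Baz named 'new'
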