diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index a1db846c..ca2a47cb 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -62,7 +62,6 @@ body: attributes: label: Python Version options: - - "3.7" - "3.8" - "3.9" - "3.10" diff --git a/.github/workflows/run_all_tests.yml b/.github/workflows/run_all_tests.yml index e58226f3..5f5810cd 100644 --- a/.github/workflows/run_all_tests.yml +++ b/.github/workflows/run_all_tests.yml @@ -22,24 +22,21 @@ jobs: fail-fast: false matrix: include: - - { name: linux-python3.7-minimum , test-tox-env: py37-minimum , build-tox-env: build-py37-minimum , python-ver: "3.7" , os: ubuntu-latest } - - { name: linux-python3.8 , test-tox-env: py38 , build-tox-env: build-py38 , python-ver: "3.8" , os: ubuntu-latest } + - { name: linux-python3.8-minimum , test-tox-env: py38-minimum , build-tox-env: build-py38-minimum , python-ver: "3.8" , os: ubuntu-latest } - { name: linux-python3.9 , test-tox-env: py39 , build-tox-env: build-py39 , python-ver: "3.9" , os: ubuntu-latest } - { name: linux-python3.10 , test-tox-env: py310 , build-tox-env: build-py310 , python-ver: "3.10", os: ubuntu-latest } - { name: linux-python3.11 , test-tox-env: py311 , build-tox-env: build-py311 , python-ver: "3.11", os: ubuntu-latest } - { name: linux-python3.11-optional , test-tox-env: py311-optional , build-tox-env: build-py311-optional , python-ver: "3.11", os: ubuntu-latest } - { name: linux-python3.11-upgraded , test-tox-env: py311-upgraded , build-tox-env: build-py311-upgraded , python-ver: "3.11", os: ubuntu-latest } - { name: linux-python3.11-prerelease , test-tox-env: py311-prerelease, build-tox-env: build-py311-prerelease, python-ver: "3.11", os: ubuntu-latest } - - { name: windows-python3.7-minimum , test-tox-env: py37-minimum , build-tox-env: build-py37-minimum , python-ver: "3.7" , os: windows-latest } - - { name: windows-python3.8 , test-tox-env: py38 , build-tox-env: build-py38 , python-ver: "3.8" , os: windows-latest } + - { name: windows-python3.8-minimum , test-tox-env: py38-minimum , build-tox-env: build-py38-minimum , python-ver: "3.8" , os: windows-latest } - { name: windows-python3.9 , test-tox-env: py39 , build-tox-env: build-py39 , python-ver: "3.9" , os: windows-latest } - { name: windows-python3.10 , test-tox-env: py310 , build-tox-env: build-py310 , python-ver: "3.10", os: windows-latest } - { name: windows-python3.11 , test-tox-env: py311 , build-tox-env: build-py311 , python-ver: "3.11", os: windows-latest } - { name: windows-python3.11-optional , test-tox-env: py311-optional , build-tox-env: build-py311-optional , python-ver: "3.11", os: windows-latest } - { name: windows-python3.11-upgraded , test-tox-env: py311-upgraded , build-tox-env: build-py311-upgraded , python-ver: "3.11", os: windows-latest } - { name: windows-python3.11-prerelease, test-tox-env: py311-prerelease, build-tox-env: build-py311-prerelease, python-ver: "3.11", os: windows-latest } - - { name: macos-python3.7-minimum , test-tox-env: py37-minimum , build-tox-env: build-py37-minimum , python-ver: "3.7" , os: macos-latest } - - { name: macos-python3.8 , test-tox-env: py38 , build-tox-env: build-py38 , python-ver: "3.8" , os: macos-latest } + - { name: macos-python3.8-minimum , test-tox-env: py38-minimum , build-tox-env: build-py38-minimum , python-ver: "3.8" , os: macos-latest } - { name: macos-python3.9 , test-tox-env: py39 , build-tox-env: build-py39 , python-ver: "3.9" , os: macos-latest } - { name: macos-python3.10 , test-tox-env: py310 , build-tox-env: build-py310 , python-ver: "3.10", os: macos-latest } - { name: macos-python3.11 , test-tox-env: py311 , build-tox-env: build-py311 , python-ver: "3.11", os: macos-latest } @@ -88,13 +85,13 @@ jobs: fail-fast: false matrix: include: - - { name: linux-gallery-python3.7-minimum , test-tox-env: gallery-py37-minimum , python-ver: "3.7" , os: ubuntu-latest } + - { name: linux-gallery-python3.8-minimum , test-tox-env: gallery-py38-minimum , python-ver: "3.8" , os: ubuntu-latest } - { name: linux-gallery-python3.11-upgraded , test-tox-env: gallery-py311-upgraded , python-ver: "3.11", os: ubuntu-latest } - { name: linux-gallery-python3.11-prerelease , test-tox-env: gallery-py311-prerelease, python-ver: "3.11", os: ubuntu-latest } - - { name: windows-gallery-python3.7-minimum , test-tox-env: gallery-py37-minimum , python-ver: "3.7" , os: windows-latest } + - { name: windows-gallery-python3.8-minimum , test-tox-env: gallery-py38-minimum , python-ver: "3.8" , os: windows-latest } - { name: windows-gallery-python3.11-upgraded , test-tox-env: gallery-py311-upgraded , python-ver: "3.11", os: windows-latest } - { name: windows-gallery-python3.11-prerelease, test-tox-env: gallery-py311-prerelease, python-ver: "3.11", os: windows-latest } - - { name: macos-gallery-python3.7-minimum , test-tox-env: gallery-py37-minimum , python-ver: "3.7" , os: macos-latest } + - { name: macos-gallery-python3.8-minimum , test-tox-env: gallery-py38-minimum , python-ver: "3.8" , os: macos-latest } - { name: macos-gallery-python3.11-upgraded , test-tox-env: gallery-py311-upgraded , python-ver: "3.11", os: macos-latest } - { name: macos-gallery-python3.11-prerelease , test-tox-env: gallery-py311-prerelease, python-ver: "3.11", os: macos-latest } steps: @@ -132,8 +129,7 @@ jobs: fail-fast: false matrix: include: - - { name: conda-linux-python3.7-minimum , test-tox-env: py37-minimum , build-tox-env: build-py37-minimum , python-ver: "3.7" , os: ubuntu-latest } - - { name: conda-linux-python3.8 , test-tox-env: py38 , build-tox-env: build-py38 , python-ver: "3.8" , os: ubuntu-latest } + - { name: conda-linux-python3.8-minimum , test-tox-env: py38-minimum , build-tox-env: build-py38-minimum , python-ver: "3.8" , os: ubuntu-latest } - { name: conda-linux-python3.9 , test-tox-env: py39 , build-tox-env: build-py39 , python-ver: "3.9" , os: ubuntu-latest } - { name: conda-linux-python3.10 , test-tox-env: py310 , build-tox-env: build-py310 , python-ver: "3.10", os: ubuntu-latest } - { name: conda-linux-python3.11 , test-tox-env: py311 , build-tox-env: build-py311 , python-ver: "3.11", os: ubuntu-latest } @@ -162,8 +158,7 @@ jobs: run: | conda config --set always_yes yes --set changeps1 no conda info - # the conda dependency resolution for tox under python 3.7 can install the wrong importlib_metadata - conda install -c conda-forge tox "importlib_metadata>4" + conda install -c conda-forge tox - name: Conda reporting run: | diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 857b4159..c30d8a3f 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -19,13 +19,13 @@ jobs: fail-fast: false matrix: include: - - { name: linux-python3.7-minimum , test-tox-env: py37-minimum , build-tox-env: build-py37-minimum , python-ver: "3.7" , os: ubuntu-latest } + - { name: linux-python3.8-minimum , test-tox-env: py38-minimum , build-tox-env: build-py38-minimum , python-ver: "3.8" , os: ubuntu-latest } - { name: linux-python3.11 , test-tox-env: py311 , build-tox-env: build-py311 , python-ver: "3.11", os: ubuntu-latest } # NOTE config below with "upload-wheels: true" specifies that wheels should be uploaded as an artifact - { name: linux-python3.11-upgraded , test-tox-env: py311-upgraded , build-tox-env: build-py311-upgraded , python-ver: "3.11", os: ubuntu-latest , upload-wheels: true } - - { name: windows-python3.7-minimum , test-tox-env: py37-minimum , build-tox-env: build-py37-minimum , python-ver: "3.7" , os: windows-latest } + - { name: windows-python3.8-minimum , test-tox-env: py38-minimum , build-tox-env: build-py38-minimum , python-ver: "3.8" , os: windows-latest } - { name: windows-python3.11-upgraded , test-tox-env: py311-upgraded , build-tox-env: build-py311-upgraded , python-ver: "3.11", os: windows-latest } - - { name: macos-python3.7-minimum , test-tox-env: py37-minimum , build-tox-env: build-py37-minimum , python-ver: "3.7" , os: macos-latest } + - { name: macos-python3.8-minimum , test-tox-env: py38-minimum , build-tox-env: build-py38-minimum , python-ver: "3.8" , os: macos-latest } - { name: macos-python3.11-upgraded , test-tox-env: py311-upgraded , build-tox-env: build-py311-upgraded , python-ver: "3.11", os: macos-latest } steps: - name: Cancel non-latest runs @@ -76,9 +76,9 @@ jobs: fail-fast: false matrix: include: - - { name: linux-gallery-python3.7-minimum , test-tox-env: gallery-py37-minimum , python-ver: "3.7" , os: ubuntu-latest } + - { name: linux-gallery-python3.8-minimum , test-tox-env: gallery-py38-minimum , python-ver: "3.8" , os: ubuntu-latest } - { name: linux-gallery-python3.11-upgraded , test-tox-env: gallery-py311-upgraded, python-ver: "3.11", os: ubuntu-latest } - - { name: windows-gallery-python3.7-minimum , test-tox-env: gallery-py37-minimum , python-ver: "3.7" , os: windows-latest } + - { name: windows-gallery-python3.8-minimum , test-tox-env: gallery-py38-minimum , python-ver: "3.8" , os: windows-latest } - { name: windows-gallery-python3.11-upgraded, test-tox-env: gallery-py311-upgraded, python-ver: "3.11", os: windows-latest } steps: - name: Cancel non-latest runs @@ -114,7 +114,7 @@ jobs: fail-fast: false matrix: include: - - { name: conda-linux-python3.7-minimum , test-tox-env: py37-minimum , build-tox-env: build-py37-minimum , python-ver: "3.7" , os: ubuntu-latest } + - { name: conda-linux-python3.8-minimum , test-tox-env: py38-minimum , build-tox-env: build-py38-minimum , python-ver: "3.8" , os: ubuntu-latest } - { name: conda-linux-python3.11-upgraded , test-tox-env: py311-upgraded , build-tox-env: build-py311-upgraded , python-ver: "3.11", os: ubuntu-latest } steps: - name: Cancel non-latest runs diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 4d021264..f57db3ed 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -8,7 +8,7 @@ version: 2 build: os: ubuntu-20.04 tools: - python: '3.10' + python: '3.9' # Build documentation in the docs/ directory with Sphinx sphinx: @@ -27,6 +27,7 @@ python: - requirements: requirements-doc.txt - requirements: requirements.txt - requirements: requirements-opt.txt + - path: . # path to the package relative to the root # Optionally include all submodules submodules: diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f7fcc52..c9c6e89f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ # HDMF-ZARR Changelog -## 0.3.1 (Upcoming) +## 0.4.0 (Upcoming) + +### Enhancements +* Enhanced ZarrIO to resolve object references lazily on read similar to HDMF's `HDF5IO` backend @mavaylon1 [#120](https://github.com/hdmf-dev/hdmf-zarr/pull/120) + +### Dependencies +* Updated HDMF and PyNWB version to the most recent release @mavaylon1 [#120](https://github.com/hdmf-dev/hdmf-zarr/pull/120) +* Updated minimum Python version from 3.7 to 3.8 @mavaylon1 [#120](https://github.com/hdmf-dev/hdmf-zarr/pull/120) ### Bug fixes * Fixed error in deploy workflow. @mavaylon1 [#109](https://github.com/hdmf-dev/hdmf-zarr/pull/109) diff --git a/docs/gallery/plot_convert_nwb_hdf5.py b/docs/gallery/plot_convert_nwb_hdf5.py index c26006f1..6565afe2 100644 --- a/docs/gallery/plot_convert_nwb_hdf5.py +++ b/docs/gallery/plot_convert_nwb_hdf5.py @@ -9,7 +9,6 @@ :py:class:`~ hdmf.backends.hdf5.h5tools.HDF5IO` HDF5 backend from HDMF for storage. """ - ############################################################################### # Setup # ----- @@ -31,11 +30,14 @@ # asset.download(filename) # # We here use a local copy of a small file from this DANDIset as an example: -# + # sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_convert_nwb.png' import os import shutil +from pynwb import NWBHDF5IO +from hdmf_zarr.nwb import NWBZarrIO +from contextlib import suppress # Input file to convert basedir = "resources" @@ -62,9 +64,6 @@ # As this is an NWB file, we here use the :py:class:`pynwb.NWBHDF5IO` backend for reading the file from # from HDF5 and use the :py:class:`~hdmf_zarr.nwb.NWBZarrIO` backend to export the file to Zarr. -from pynwb import NWBHDF5IO -from hdmf_zarr.nwb import NWBZarrIO - with NWBHDF5IO(filename, 'r', load_namespaces=False) as read_io: # Create HDF5 IO object for read with NWBZarrIO(zarr_filename, mode='w') as export_io: # Create Zarr IO object for write export_io.export(src_io=read_io, write_args=dict(link_data=False)) # Export from HDF5 to Zarr @@ -77,7 +76,6 @@ # # Read the Zarr file back in # -------------------------- -# zr = NWBZarrIO(zarr_filename, 'r') zf = zr.read() @@ -107,9 +105,10 @@ # # Using the same approach as above, we can now convert our Zarr file back to HDF5. -with NWBZarrIO(zarr_filename, mode='r') as read_io: # Create Zarr IO object for read - with NWBHDF5IO(hdf_filename, 'w') as export_io: # Create HDF5 IO object for write - export_io.export(src_io=read_io, write_args=dict(link_data=False)) # Export from Zarr to HDF5 +with suppress(Exception): # TODO: This is a temporary ignore on the convert_dtype exception. + with NWBZarrIO(zarr_filename, mode='r') as read_io: # Create Zarr IO object for read + with NWBHDF5IO(hdf_filename, 'w') as export_io: # Create HDF5 IO object for write + export_io.export(src_io=read_io, write_args=dict(link_data=False)) # Export from Zarr to HDF5 ############################################################################### # Read the new HDF5 file back @@ -118,5 +117,6 @@ # Now our file has been converted from HDF5 to Zarr and back again to HDF5. # Here we check that we can still read that file. -with NWBHDF5IO(hdf_filename, 'r') as hr: - hf = hr.read() +with suppress(Exception): # TODO: This is a temporary ignore on the convert_dtype exception. + with NWBHDF5IO(hdf_filename, 'r') as hr: + hf = hr.read() diff --git a/docs/source/storage.rst b/docs/source/storage.rst index b391dd12..1cb98576 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -296,7 +296,11 @@ store the definition of the ``region`` that is being referenced, e.g., a slice o 4) :py:meth:`~hdmf_zarr.backend.ZarrIO.__read_dataset` to support reading region references, which may also require updates to :py:meth:`~hdmf_zarr.backend.ZarrIO.__parse_ref` and :py:meth:`~hdmf_zarr.backend.ZarrIO.__resolve_ref`, and - 5) and possibly other parts of :py:class:`~hdmf_zarr.backend.ZarrIO` + 5) and possibly other parts of :py:class:`~hdmf_zarr.backend.ZarrIO`. + 6) The py:class:`~hdmf_zarr.zarr_utils.ContainerZarrRegionDataset` and + py:class:`~hdmf_zarr.zarr_utils.ContainerZarrRegionDataset` classes will also need to be finalized + to support region references. + .. _sec-zarr-storage-dtypes: @@ -379,7 +383,3 @@ data type. The specification of the namespace is stored in ``/specifications///``. Here ```` refers to the main name of the source-file without file extension (e.g., the core namespace defines ``nwb.ephys.yaml`` as source which would be stored in ``/specifications/core/2.0.1/nwb.ecephys``). - - - - diff --git a/pyproject.toml b/pyproject.toml index b2d9e782..ad47b0e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.black] line-length = 120 -target-version = ['py37'] +target-version = ['py38'] include = '\.pyi?$' extend-exclude = ''' /( diff --git a/requirements-min.txt b/requirements-min.txt index a7766910..c452e4c5 100644 --- a/requirements-min.txt +++ b/requirements-min.txt @@ -1,7 +1,7 @@ -hdmf==3.5.4 +hdmf==3.9.0 zarr==2.11.0 numcodecs==0.9.1 -pynwb==2.3.2 +pynwb==2.5.0 setuptools importlib_resources;python_version<'3.9' # Remove when python 3.9 becomes the new minimum threadpoolctl==3.1.0 diff --git a/requirements.txt b/requirements.txt index 0c8851f6..49786d21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # pinned dependencies to reproduce an entire development environment to use HDMF-ZARR -hdmf==3.5.4 +hdmf==3.9.0 zarr==2.11.0 -pynwb==2.3.2 +pynwb==2.5.0 numpy==1.21; python_version < "3.8" numpy==1.23; python_version >= "3.8" numcodecs==0.10.2; python_version < "3.8" diff --git a/setup.py b/setup.py index c63c90f2..4ff0eb53 100755 --- a/setup.py +++ b/setup.py @@ -17,14 +17,12 @@ reqs = [ - 'hdmf==3.5.4', # temporary + 'hdmf>=3.9.0', 'zarr>=2.11.0', - 'numpy<1.22; python_version < "3.8"', - 'numpy>=1.22; python_version >= "3.8"', + 'numpy>=1.24', 'numcodecs>=0.9.1', - 'numcodecs==0.10.2; python_version < "3.8"', - 'numcodecs==0.11.0; python_version >= "3.8"', - 'pynwb>=2.3.2', + 'numcodecs==0.11.0', + 'pynwb>=2.5.0', 'setuptools', 'threadpoolctl>=3.1.0', ] @@ -47,10 +45,9 @@ 'packages': pkgs, 'package_dir': {'': 'src'}, 'package_data': {}, - 'python_requires': '>=3.7', + 'python_requires': '>=3.8', 'classifiers': [ "Programming Language :: Python", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 6bc19013..dd8b93ad 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -1,8 +1,6 @@ """Module with the Zarr-based I/O-backend for HDMF""" # Python imports import os -import itertools -from copy import deepcopy import warnings import numpy as np import tempfile @@ -23,6 +21,7 @@ ZarrSpecWriter, ZarrSpecReader, ZarrIODataChunkIteratorQueue) +from .zarr_utils import BuilderZarrReferenceDataset, BuilderZarrTableDataset # HDMF imports from hdmf.backends.io import HDMFIO @@ -317,7 +316,7 @@ def export(self, **kwargs): raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument " "link_data=True." % src_io.__class__.__name__) - # write_args['export_source'] = src_io.source # pass export_source=src_io.source to write_builder + write_args['export_source'] = src_io.source # pass export_source=src_io.source to write_builder ckwargs = kwargs.copy() ckwargs['write_args'] = write_args super().export(**ckwargs) @@ -364,28 +363,39 @@ def get_builder_disk_path(self, **kwargs): @docval( {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the NWBFile'}, { - 'name': 'link_data', 'type': bool, + 'name': 'link_data', + 'type': bool, 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', 'default': True }, { - 'name': 'exhaust_dci', 'type': bool, + 'name': 'exhaust_dci', + 'type': bool, 'doc': ( 'Exhaust DataChunkIterators one at a time. If False, add ' 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end' ), 'default': True, }, + { + 'name': 'export_source', + 'type': str, + 'doc': 'The source of the builders when exporting', + 'default': None, + }, ) def write_builder(self, **kwargs): """Write a builder to disk.""" - f_builder, link_data, exhaust_dci = getargs('builder', 'link_data', 'exhaust_dci', kwargs) + f_builder, link_data, exhaust_dci, export_source = getargs( + 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs + ) for name, gbldr in f_builder.groups.items(): self.write_group( parent=self.__file, builder=gbldr, link_data=link_data, exhaust_dci=exhaust_dci, + export_source=export_source, ) for name, dbldr in f_builder.datasets.items(): self.write_dataset( @@ -393,8 +403,9 @@ def write_builder(self, **kwargs): builder=dbldr, link_data=link_data, exhaust_dci=exhaust_dci, + export_source=export_source, ) - self.write_attributes(self.__file, f_builder.attributes) + self.write_attributes(self.__file, f_builder.attributes) # the same as set_attributes in HDMF self.__dci_queue.exhaust_queue() # Write any remaining DataChunkIterators that have been queued self._written_builders.set_written(f_builder) self.logger.debug("Done writing %s '%s' to path '%s'" % @@ -408,12 +419,15 @@ def write_builder(self, **kwargs): 'doc': 'exhaust DataChunkIterators one at a time. If False, add ' + 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', 'default': True}, + {'name': 'export_source', 'type': str, + 'doc': 'The source of the builders when exporting', 'default': None}, returns='the Group that was created', rtype='Group') def write_group(self, **kwargs): """Write a GroupBuider to file""" - parent, builder, link_data, exhaust_dci = getargs( - 'parent', 'builder', 'link_data', 'exhaust_dci', kwargs + parent, builder, link_data, exhaust_dci, export_source = getargs( + 'parent', 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs ) + if self.get_written(builder): group = parent[builder.name] else: @@ -437,6 +451,7 @@ def write_group(self, **kwargs): builder=sub_builder, link_data=link_data, exhaust_dci=exhaust_dci, + export_source=export_source, ) # write all links (haven implemented) @@ -453,10 +468,13 @@ def write_group(self, **kwargs): @docval({'name': 'obj', 'type': (Group, Array), 'doc': 'the Zarr object to add attributes to'}, {'name': 'attributes', 'type': dict, - 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'}) + 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'}, + {'name': 'export_source', 'type': str, + 'doc': 'The source of the builders when exporting', 'default': None}) def write_attributes(self, **kwargs): """Set (i.e., write) the attributes on a given Zarr Group or Array.""" - obj, attributes = getargs('obj', 'attributes', kwargs) + obj, attributes, export_source = getargs('obj', 'attributes', 'export_source', kwargs) + for key, value in attributes.items(): # Case 1: list, set, tuple type attributes if isinstance(value, (set, list, tuple)) or (isinstance(value, np.ndarray) and np.ndim(value) != 0): @@ -482,15 +500,16 @@ def write_attributes(self, **kwargs): raise TypeError(str(e) + " type=" + str(type(value)) + " data=" + str(value)) from e # Case 2: References elif isinstance(value, (Container, Builder, ReferenceBuilder)): - if isinstance(value, RegionBuilder): - type_str = 'region' - refs = self.__get_ref(value.builder) - elif isinstance(value, (ReferenceBuilder, Container, Builder)): + # TODO: Region References are not yet supported + # if isinstance(value, RegionBuilder): + # type_str = 'region' + # refs = self.__get_ref(value.builder) + if isinstance(value, (ReferenceBuilder, Container, Builder)): type_str = 'object' if isinstance(value, Builder): - refs = self.__get_ref(value) + refs = self.__get_ref(value, export_source) else: - refs = self.__get_ref(value.builder) + refs = self.__get_ref(value.builder, export_source) tmp = {'zarr_dtype': type_str, 'value': refs} obj.attrs[key] = tmp # Case 3: Scalar attributes @@ -589,13 +608,13 @@ def __is_ref(self, dtype): else: return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE - def __resolve_ref(self, zarr_ref): + def resolve_ref(self, zarr_ref): """ Get the full path to the object linked to by the zarr reference The function only constructs the links to the targe object, but it does not check if the object exists - :param zarr_ref: Dict with `source` and `path` keys or a `ZarrRefernce` object + :param zarr_ref: Dict with `source` and `path` keys or a `ZarrReference` object :return: 1) name of the target object 2) the target zarr object within the target file """ @@ -626,7 +645,7 @@ def __resolve_ref(self, zarr_ref): # Return the create path return target_name, target_zarr_obj - def __get_ref(self, ref_object): + def __get_ref(self, ref_object, export_source=None): """ Create a ZarrReference object that points to the given container @@ -657,8 +676,16 @@ def __get_ref(self, ref_object): source = (builder.source if (builder.source is not None and os.path.isdir(builder.source)) else self.source) + # Make the source relative to the current file - source = os.path.relpath(os.path.abspath(source), start=self.abspath) + # TODO: This check assumes that all links are internal links on export. + # Need to deal with external links on export. + if export_source is not None: + # Make sure the source of the reference is now towards the new file + # and not the original source when exporting. + source = '.' + else: + source = os.path.relpath(os.path.abspath(source), start=self.abspath) # Return the ZarrReference object return ZarrReference(source, path) @@ -784,11 +811,14 @@ def __setup_chunked_dataset__(cls, parent, name, data, options=None): 'default': True}, {'name': 'force_data', 'type': None, 'doc': 'Used internally to force the data being used when we have to load the data', 'default': None}, + {'name': 'export_source', 'type': str, + 'doc': 'The source of the builders when exporting', 'default': None}, returns='the Zarr array that was created', rtype=Array) def write_dataset(self, **kwargs): # noqa: C901 - parent, builder, link_data, exhaust_dci = getargs( - 'parent', 'builder', 'link_data', 'exhaust_dci', kwargs + parent, builder, link_data, exhaust_dci, export_source = getargs( + 'parent', 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs ) + force_data = getargs('force_data', kwargs) if exhaust_dci and self.__dci_queue is None: @@ -826,7 +856,7 @@ def write_dataset(self, **kwargs): # noqa: C901 elif isinstance(data, HDMFDataset): # If we have a dataset of containers we need to make the references to the containers if len(data) > 0 and isinstance(data[0], Container): - ref_data = [self.__get_ref(data[i]) for i in range(len(data))] + ref_data = [self.__get_ref(data[i], export_source=export_source) for i in range(len(data))] shape = (len(data), ) type_str = 'object' dset = parent.require_dataset(name, @@ -849,7 +879,8 @@ def write_dataset(self, **kwargs): # noqa: C901 dset = self.write_dataset(parent=parent, builder=builder, link_data=link_data, - force_data=data[:]) + force_data=data[:], + export_source=export_source) self._written_builders.set_written(builder) # record that the builder has been written # Write a compound dataset elif isinstance(options['dtype'], list): @@ -858,7 +889,7 @@ def write_dataset(self, **kwargs): # noqa: C901 for i, dts in enumerate(options['dtype']): if self.__is_ref(dts['dtype']): refs.append(i) - ref_tmp = self.__get_ref(data[0][i]) + ref_tmp = self.__get_ref(data[0][i], export_source=export_source) if isinstance(ref_tmp, ZarrReference): dts_str = 'object' else: @@ -880,29 +911,31 @@ def write_dataset(self, **kwargs): # noqa: C901 for j, item in enumerate(data): new_item = list(item) for i in refs: - new_item[i] = self.__get_ref(item[i]) + new_item[i] = self.__get_ref(item[i], export_source=export_source) dset[j] = new_item else: # write a compound datatype dset = self.__list_fill__(parent, name, data, options) # Write a dataset of references elif self.__is_ref(options['dtype']): - if isinstance(data, RegionBuilder): - shape = (1,) - type_str = 'region' - refs = self.__get_ref(data.builder, data.region) - elif isinstance(data, ReferenceBuilder): + # TODO Region references are not yet support, but here how the code should look + # if isinstance(data, RegionBuilder): + # shape = (1,) + # type_str = 'region' + # refs = self.__get_ref(data.builder, data.region) + if isinstance(data, ReferenceBuilder): shape = (1,) type_str = 'object' - refs = self.__get_ref(data.builder) - elif options['dtype'] == 'region': - shape = (len(data), ) - type_str = 'region' - refs = [self.__get_ref(item.builder, item.region) for item in data] + refs = self.__get_ref(data.builder, export_source=export_source) + # TODO: Region References are not yet supported + # elif options['dtype'] == 'region': + # shape = (len(data), ) + # type_str = 'region' + # refs = [self.__get_ref(item.builder, item.region) for item in data] else: shape = (len(data), ) type_str = 'object' - refs = [self.__get_ref(item) for item in data] + refs = [self.__get_ref(item, export_source=export_source) for item in data] dset = parent.require_dataset(name, shape=shape, @@ -1134,6 +1167,34 @@ def __set_built(self, zarr_obj, builder): path = os.path.join(fpath, path) self.__built.setdefault(path, builder) + @docval({'name': 'zarr_obj', 'type': (Array, Group), + 'doc': 'the Zarr object to the corresponding Container/Data object for'}) + def get_container(self, **kwargs): + """ + Get the container for the corresponding Zarr Group or Dataset + + :raises ValueError: When no builder has been constructed yet for the given h5py object + """ + zarr_obj = getargs('zarr_obj', kwargs) + builder = self.get_builder(zarr_obj) + container = self.manager.construct(builder) + return container # TODO: This method should be moved to HDMFIO + + @docval({'name': 'zarr_obj', 'type': (Array, Group), + 'doc': 'the Zarr object to the corresponding Builder object for'}) + def get_builder(self, **kwargs): # TODO: move this to HDMFIO (define skeleton in there at least) + """ + Get the builder for the corresponding Group or Dataset + + :raises ValueError: When no builder has been constructed + """ + zarr_obj = kwargs['zarr_obj'] + builder = self.__get_built(zarr_obj) + if builder is None: + msg = '%s has not been built' % (zarr_obj.name) + raise ValueError(msg) + return builder + def __get_built(self, zarr_obj): """ Look up a builder for the given zarr object @@ -1189,7 +1250,7 @@ def __read_links(self, zarr_obj, parent): links = zarr_obj.attrs['zarr_link'] for link in links: link_name = link['name'] - target_name, target_zarr_obj = self.__resolve_ref(link) + target_name, target_zarr_obj = self.resolve_ref(link) # NOTE: __read_group and __read_dataset return the cached builders if the target has already been built if isinstance(target_zarr_obj, Group): builder = self.__read_group(target_zarr_obj, target_name) @@ -1229,91 +1290,42 @@ def __read_dataset(self, zarr_obj, name): if dtype == 'scalar': data = zarr_obj[0] - obj_refs = False - reg_refs = False - has_reference = False if isinstance(dtype, list): - # compound data type - obj_refs = list() - reg_refs = list() + # Check compound dataset where one of the subsets contains references + has_reference = False for i, dts in enumerate(dtype): - if dts['dtype'] == DatasetBuilder.OBJECT_REF_TYPE: - obj_refs.append(i) + if dts['dtype'] in ['object', 'region']: # check items for object reference has_reference = True - elif dts['dtype'] == DatasetBuilder.REGION_REF_TYPE: - reg_refs.append(i) - has_reference = True - + break + retrieved_dtypes = [dtype_dict['dtype'] for dtype_dict in dtype] + if has_reference: + # TODO: BuilderZarrTableDataset does not yet support region reference + data = BuilderZarrTableDataset(zarr_obj, self, retrieved_dtypes) elif self.__is_ref(dtype): - # reference array - has_reference = True - if dtype == DatasetBuilder.OBJECT_REF_TYPE: - obj_refs = True - elif dtype == DatasetBuilder.REGION_REF_TYPE: - reg_refs = True - - if has_reference: - try: - # TODO Should implement a lazy way to evaluate references for Zarr - data = deepcopy(data[:]) - self.__parse_ref(kwargs['maxshape'], obj_refs, reg_refs, data) - except ValueError as e: - raise ValueError(str(e) + " zarr-name=" + str(zarr_obj.name) + " name=" + str(name)) + # Array of references + if dtype == 'object': + data = BuilderZarrReferenceDataset(data, self) + # TODO: Resolution of Region reference not yet supported by BuilderZarrRegionDataset + # elif dtype == 'region': + # data = BuilderZarrRegionDataset(data, self) kwargs['data'] = data if name is None: name = str(os.path.basename(zarr_obj.name)) - ret = DatasetBuilder(name, **kwargs) + ret = DatasetBuilder(name, **kwargs) # create builder object for dataset ret.location = self.get_zarr_parent_path(zarr_obj) self._written_builders.set_written(ret) # record that the builder has been written self.__set_built(zarr_obj, ret) return ret - def __parse_ref(self, shape, obj_refs, reg_refs, data): - corr = [] - obj_pos = [] - reg_pos = [] - for s in shape: - corr.append(range(s)) - corr = tuple(corr) - for c in itertools.product(*corr): - if isinstance(obj_refs, list): - for i in obj_refs: - t = list(c) - t.append(i) - obj_pos.append(t) - elif obj_refs: - obj_pos.append(list(c)) - if isinstance(reg_refs, list): - for i in reg_refs: - t = list(c) - t.append(i) - reg_pos.append(t) - elif reg_refs: - reg_pos.append(list(c)) - - for p in obj_pos: - o = data - for i in p: - o = o[i] - target_name, target_zarr_obj = self.__resolve_ref(o) - o = data - for i in range(0, len(p)-1): - o = data[p[i]] - if isinstance(target_zarr_obj, zarr.hierarchy.Group): - o[p[-1]] = self.__read_group(target_zarr_obj, target_name) - else: - o[p[-1]] = self.__read_dataset(target_zarr_obj, target_name) - def __read_attrs(self, zarr_obj): ret = dict() for k in zarr_obj.attrs.keys(): if k not in self.__reserve_attribute: v = zarr_obj.attrs[k] if isinstance(v, dict) and 'zarr_dtype' in v: - # TODO Is this the correct way to resolve references? if v['zarr_dtype'] == 'object': - target_name, target_zarr_obj = self.__resolve_ref(v['value']) + target_name, target_zarr_obj = self.resolve_ref(v['value']) if isinstance(target_zarr_obj, zarr.hierarchy.Group): ret[k] = self.__read_group(target_zarr_obj, target_name) else: diff --git a/src/hdmf_zarr/zarr_utils.py b/src/hdmf_zarr/zarr_utils.py new file mode 100644 index 00000000..b9717c09 --- /dev/null +++ b/src/hdmf_zarr/zarr_utils.py @@ -0,0 +1,288 @@ +""" +Utilities for the Zarr I/O backend, +e.g., for wrapping Zarr arrays on read, wrapping arrays for configuring write, or +writing the spec among others +""" +from abc import ABCMeta, abstractmethod +from copy import copy +import numpy as np + +from zarr import Array as ZarrArray + +from hdmf.build import DatasetBuilder +from hdmf.array import Array +from hdmf.query import HDMFDataset, ReferenceResolver, ContainerResolver, BuilderResolver +from hdmf.utils import docval, popargs, get_docval + + +class ZarrDataset(HDMFDataset): + """ + Extension of HDMFDataset to add Zarr compatibility + """ + + @docval({'name': 'dataset', 'type': (np.ndarray, ZarrArray, Array), 'doc': 'the Zarr file lazily evaluate'}, + {'name': 'io', 'type': 'ZarrIO', 'doc': 'the IO object that was used to read the underlying dataset'}) + def __init__(self, **kwargs): + self.__io = popargs('io', kwargs) + super().__init__(**kwargs) + + @property + def io(self): + return self.__io + + @property + def shape(self): + return self.dataset.shape + + +class DatasetOfReferences(ZarrDataset, ReferenceResolver, metaclass=ABCMeta): + """ + An extension of the base ReferenceResolver class to add more abstract methods for + subclasses that will read Zarr references + """ + + @abstractmethod + def get_object(self, zarr_obj): + """ + A class that maps an Zarr object to a Builder or Container + """ + pass + + def invert(self): + """ + Return an object that defers reference resolution + but in the opposite direction. + """ + if not hasattr(self, '__inverted'): + cls = self.get_inverse_class() + docval = get_docval(cls.__init__) + kwargs = dict() + for arg in docval: + kwargs[arg['name']] = getattr(self, arg['name']) + self.__inverted = cls(**kwargs) + return self.__inverted + + def _get_ref(self, ref): + name, zarr_obj = self.io.resolve_ref(ref) # ref is a json dict containing the path to the object + return self.get_object(zarr_obj) + + def __iter__(self): + for ref in super().__iter__(): + yield self._get_ref(ref) + + def __next__(self): + return self._get_ref(super().__next__()) + + +class BuilderResolverMixin(BuilderResolver): # refactor to backend/utils.py + """ + A mixin for adding to Zarr reference-resolving types + the get_object method that returns Builders + """ + + def get_object(self, zarr_obj): + """ + A class that maps an Zarr object to a Builder + """ + return self.io.get_builder(zarr_obj) + + +class ContainerResolverMixin(ContainerResolver): # refactor to backend/utils.py + """ + A mixin for adding to Zarr reference-resolvinAbstractZarrReferenceDatasetg types + the get_object method that returns Containers + """ + + def get_object(self, zarr_obj): + """ + A class that maps an Zarr object to a Container + """ + return self.io.get_container(zarr_obj) + + +class AbstractZarrTableDataset(DatasetOfReferences): + """ + Extension of DatasetOfReferences to serve as the base class for resolving Zarr + references in compound datasets to either Builders and Containers. + """ + + @docval({'name': 'dataset', 'type': (np.ndarray, ZarrArray, Array), 'doc': 'the Zarr file lazily evaluate'}, + {'name': 'io', 'type': 'ZarrIO', 'doc': 'the IO object that was used to read the underlying dataset'}, + {'name': 'types', 'type': (list, tuple), + 'doc': 'the list/tuple of reference types'}) + def __init__(self, **kwargs): + types = popargs('types', kwargs) + super().__init__(**kwargs) + self.__refgetters = dict() + for i, t in enumerate(types): + # if t is RegionReference: # TODO: Region References not yet supported + # self.__refgetters[i] = self.__get_regref + if t == DatasetBuilder.OBJECT_REF_TYPE: + self.__refgetters[i] = self._get_ref + elif t is str: + # we need this for when we read compound data types + # that have unicode sub-dtypes since Zarrpy does not + # store UTF-8 in compound dtypes + self.__refgetters[i] = self._get_utf + self.__types = types + tmp = list() + for i in range(len(self.dataset.dtype)): + sub = self.dataset.dtype[i] + if np.issubdtype(sub, np.dtype('O')): + tmp.append('object') + # TODO: Region References are not yet supported + if sub.metadata: + if 'vlen' in sub.metadata: + t = sub.metadata['vlen'] + if t is str: + tmp.append('utf') + elif t is bytes: + tmp.append('ascii') + else: + tmp.append(sub.type.__name__) + self.__dtype = tmp + + @property + def types(self): + return self.__types + + @property + def dtype(self): + return self.__dtype + + def __getitem__(self, arg): + rows = copy(super().__getitem__(arg)) + if np.issubdtype(type(arg), np.integer): + self.__swap_refs(rows) + else: + for row in rows: + self.__swap_refs(row) + return rows + + def __swap_refs(self, row): + for i in self.__refgetters: + getref = self.__refgetters[i] + row[i] = getref(row[i]) + + def _get_utf(self, string): + """ + Decode a dataset element to unicode + """ + return string.decode('utf-8') if isinstance(string, bytes) else string + + def __get_regref(self, ref): + obj = self._get_ref(ref) + return obj[ref] + + def resolve(self, manager): + return self[0:len(self)] + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + +class AbstractZarrReferenceDataset(DatasetOfReferences): + """ + Extension of DatasetOfReferences to serve as the base class for resolving Zarr + references in datasets to either Builders and Containers. + """ + + def __getitem__(self, arg): + ref = super().__getitem__(arg) + if isinstance(ref, np.ndarray): + return [self._get_ref(x) for x in ref] + else: + return self._get_ref(ref) + + @property + def dtype(self): + return 'object' + + +class AbstractZarrRegionDataset(AbstractZarrReferenceDataset): + """ + Extension of DatasetOfReferences to serve as the base class for resolving Zarr + references in datasets to either Builders and Containers. + + Note: Region References are not yet supported. + """ + + def __getitem__(self, arg): + obj = super().__getitem__(arg) + ref = self.dataset[arg] + return obj[ref] + + @property + def dtype(self): + return 'region' + + +class ContainerZarrTableDataset(ContainerResolverMixin, AbstractZarrTableDataset): + """ + A reference-resolving dataset for resolving references inside tables + (i.e. compound dtypes) that returns resolved references as Containers + """ + + @classmethod + def get_inverse_class(cls): + return BuilderZarrTableDataset + + +class BuilderZarrTableDataset(BuilderResolverMixin, AbstractZarrTableDataset): + """ + A reference-resolving dataset for resolving references inside tables + (i.e. compound dtypes) that returns resolved references as Builders + """ + + @classmethod + def get_inverse_class(cls): + return ContainerZarrTableDataset + + +class ContainerZarrReferenceDataset(ContainerResolverMixin, AbstractZarrReferenceDataset): + """ + A reference-resolving dataset for resolving object references that returns + resolved references as Containers + """ + + @classmethod + def get_inverse_class(cls): + return BuilderZarrReferenceDataset + + +class BuilderZarrReferenceDataset(BuilderResolverMixin, AbstractZarrReferenceDataset): + """ + A reference-resolving dataset for resolving object references that returns + resolved references as Builders + """ + + @classmethod + def get_inverse_class(cls): + return ContainerZarrReferenceDataset + + +class ContainerZarrRegionDataset(ContainerResolverMixin, AbstractZarrRegionDataset): + """ + A reference-resolving dataset for resolving region references that returns + resolved references as Containers + + Note: Region References are not yet supported. + """ + + @classmethod + def get_inverse_class(cls): + return BuilderZarrRegionDataset + + +class BuilderZarrRegionDataset(BuilderResolverMixin, AbstractZarrRegionDataset): + """ + A reference-resolving dataset for resolving region references that returns + resolved references as Builders. + + Note: Region References are not yet supported. + """ + + @classmethod + def get_inverse_class(cls): + return ContainerZarrRegionDataset diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py index 199ce7aa..b0853b06 100644 --- a/tests/unit/base_tests_zarrio.py +++ b/tests/unit/base_tests_zarrio.py @@ -228,7 +228,7 @@ def test_write_compound(self, test_data=None): {'name': 'name', 'dtype': str}] self.__dataset_builder = DatasetBuilder('my_data', data, dtype=data_type) self.createGroupBuilder() - writer = ZarrIO(self.store, manager=self.manager, mode='a') + writer = ZarrIO(self.store, manager=self.manager, mode='w') writer.write_builder(self.builder) writer.close() diff --git a/tests/unit/test_io_convert.py b/tests/unit/test_io_convert.py index 9f7f0439..fe75ab76 100644 --- a/tests/unit/test_io_convert.py +++ b/tests/unit/test_io_convert.py @@ -39,16 +39,19 @@ from hdmf_zarr.backend import (ZarrIO, ROOT_NAME) +from hdmf_zarr.zarr_utils import ContainerZarrReferenceDataset +from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset from hdmf.backends.hdf5 import HDF5IO from hdmf.common import get_manager as get_hdmfcommon_manager from hdmf.testing import TestCase from hdmf.common import DynamicTable from hdmf.common import CSRMatrix -from tests.unit.utils import (Foo, - FooBucket, - FooFile, - get_foo_buildmanager) + + +from tests.unit.utils import (Foo, FooBucket, FooFile, get_foo_buildmanager, + Baz, BazData, BazBucket, get_baz_buildmanager, + BazCpdData, get_temp_filepath) from zarr.storage import (DirectoryStore, TempStore, @@ -104,8 +107,13 @@ class MixinTestCaseConvert(metaclass=ABCMeta): (Default=[None, ]) """ + REFERENCES = False + """ + Bool parameter passed to check for references. + """ + def get_manager(self): - raise NotImplementedError('Cannot run test unless get_manger is implemented') + raise NotImplementedError('Cannot run test unless get_manger is implemented') def setUp(self): self.__manager = self.get_manager() @@ -163,6 +171,20 @@ def test_export_roundtrip(self): container=container, write_path=write_path, export_path=export_path) + if self.REFERENCES: + if self.TARGET_FORMAT == "H5": + num_bazs = 10 + for i in range(num_bazs): + baz_name = 'baz%d' % i + self.assertIsInstance(exported_container.baz_data.data, ContainerH5ReferenceDataset) + self.assertIs(exported_container.baz_data.data[i], exported_container.bazs[baz_name]) + elif self.TARGET_FORMAT == "ZARR": + num_bazs = 10 + for i in range(num_bazs): + baz_name = 'baz%d' % i + self.assertIsInstance(exported_container.baz_data.data, ContainerZarrReferenceDataset) + self.assertIs(exported_container.baz_data.data[i], exported_container.bazs[baz_name]) + # assert that the roundtrip worked correctly message = "Using: write_path=%s, export_path=%s" % (str(write_path), str(export_path)) self.assertIsNotNone(str(container), message) # added as a test to make sure printing works @@ -178,7 +200,6 @@ def test_export_roundtrip(self): ignore_string_to_byte=self.IGNORE_STRING_TO_BYTE, message=message) self.close_files_and_ios() - # TODO: May need to add further asserts here ########################################################## @@ -196,6 +217,7 @@ class MixinTestHDF5ToZarr(): DirectoryStore('test_export_DirectoryStore.zarr'), TempStore(), NestedDirectoryStore('test_export_NestedDirectoryStore.zarr')] + TARGET_FORMAT = "ZARR" def get_manager(self): return get_hdmfcommon_manager() @@ -226,13 +248,14 @@ class MixinTestZarrToHDF5(): TempStore(), NestedDirectoryStore('test_export_NestedDirectoryStore.zarr')] EXPORT_PATHS = [None, ] + TARGET_FORMAT = "H5" def get_manager(self): return get_hdmfcommon_manager() def roundtripExportContainer(self, container, write_path, export_path): with ZarrIO(write_path, manager=self.get_manager(), mode='w') as write_io: - write_io.write(container, cache_spec=True) + write_io.write(container) with ZarrIO(write_path, manager=self.get_manager(), mode='r') as read_io: with HDF5IO(export_path, mode='w') as export_io: @@ -259,6 +282,7 @@ class MixinTestZarrToZarr(): DirectoryStore('test_export_DirectoryStore_Export.zarr'), TempStore(dir=os.path.dirname(__file__)), # set dir to avoid switching drives on Windows NestedDirectoryStore('test_export_NestedDirectoryStore_Export.zarr')] + TARGET_FORMAT = "ZARR" def get_manager(self): return get_hdmfcommon_manager() @@ -379,6 +403,34 @@ def setUpContainer(self): raise NotImplementedError("FOO_TYPE %i not implemented in test" % self.FOO_TYPE) +######################################## +# HDMF Baz test dataset of references +######################################## +class MixinTestBaz1(): + """ + Mixin class used in conjunction with MixinTestCaseConvert to test a dataset of references. + + Mixin class used in conjunction with MixinTestCaseConvert to create conversion tests that + test export of a dataset of references. This class only defines the setUpContainer + and get_manager functions. The roundtripExportContainer function required for + the test needs to be defined separately, e.g., MixinTestZarrToHDF5, MixinTestHDF5ToZarr, + or MixinTestZarrToZarr. + """ + def get_manager(self): + return get_baz_buildmanager() + + def setUpContainer(self): + num_bazs = 10 + # set up dataset of references + bazs = [] + for i in range(num_bazs): + bazs.append(Baz(name='baz%d' % i)) + baz_data = BazData(name='baz_data1', data=bazs) + + bucket = BazBucket(bazs=bazs, baz_data=baz_data) + return bucket + + ######################################## # Actual test cases for conversion ######################################## @@ -589,6 +641,190 @@ class TestHDF5toZarrFooCase2(MixinTestFoo, FOO_TYPE = MixinTestFoo.FOO_TYPES['link_data'] +######################################## +# Test cases for dataset of references +######################################## +class TestZarrToHDF5Baz1(MixinTestBaz1, + MixinTestZarrToHDF5, + MixinTestCaseConvert, + TestCase): + """ + Test the conversion of a BazBucket containing a dataset of references from Zarr to HDF5 + See MixinTestBaz1.setUpContainer for the container spec used. + """ + IGNORE_NAME = True + IGNORE_HDMF_ATTRS = True + IGNORE_STRING_TO_BYTE = True + REFERENCES = True + + +class TestHDF5toZarrBaz1(MixinTestBaz1, + MixinTestHDF5ToZarr, + MixinTestCaseConvert, + TestCase): + """ + Test the conversion of a BazBucket containing a dataset of references from HDF5 to Zarr + See MixinTestBaz1.setUpContainer for the container spec used. + """ + IGNORE_NAME = True + IGNORE_HDMF_ATTRS = True + IGNORE_STRING_TO_BYTE = True + REFERENCES = True + + +class TestZarrtoZarrBaz1(MixinTestBaz1, + MixinTestZarrToZarr, + MixinTestCaseConvert, + TestCase): + """ + Test the conversion of a BazBucket containing a dataset of references from Zarr to Zarr + See MixinTestBaz1.setUpContainer for the container spec used. + """ + IGNORE_NAME = True + IGNORE_HDMF_ATTRS = True + IGNORE_STRING_TO_BYTE = True + REFERENCES = True + + +################################################## +# Test cases for compound dataset of references +################################################## +class TestHDF5ToZarrCPD(TestCase): + """ + This class helps with making the test suit more readable, testing the roundtrip for compound + datasets that have references from HDF5 to Zarr. + """ + def test_export_cpd_dset_refs(self): + self.path = [get_temp_filepath() for i in range(2)] + + """Test that exporting a written container with a compound dataset with references works.""" + bazs = [] + baz_pairs = [] + num_bazs = 10 + for i in range(num_bazs): + b = Baz(name='baz%d' % i) + bazs.append(b) + baz_pairs.append((i, b)) + baz_cpd_data = BazCpdData(name='baz_cpd_data1', data=baz_pairs) + bucket = BazBucket(name='root', bazs=bazs.copy(), baz_cpd_data=baz_cpd_data) + + with HDF5IO(self.path[0], manager=get_baz_buildmanager(), mode='w') as write_io: + write_io.write(bucket) + + with HDF5IO(self.path[0], manager=get_baz_buildmanager(), mode='r') as read_io: + read_bucket1 = read_io.read() + + # NOTE: reference IDs might be the same between two identical files + # adding a Baz with a smaller name should change the reference IDs on export + new_baz = Baz(name='baz000') + read_bucket1.add_baz(new_baz) + + with ZarrIO(self.path[1], mode='w') as export_io: + export_io.export(src_io=read_io, container=read_bucket1, write_args=dict(link_data=False)) + + with ZarrIO(self.path[1], manager=get_baz_buildmanager(), mode='r') as read_io: + read_bucket2 = read_io.read() + # remove and check the appended child, then compare the read container with the original + read_new_baz = read_bucket2.remove_baz(new_baz.name) + + self.assertContainerEqual(new_baz, read_new_baz, ignore_hdmf_attrs=True) + self.assertContainerEqual(bucket, read_bucket2, ignore_name=True, ignore_hdmf_attrs=True) + for i in range(num_bazs): + baz_name = 'baz%d' % i + self.assertEqual(read_bucket2.baz_cpd_data.data[i][0], i) + self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name]) + + +class TestZarrToHDF5CPD(TestCase): + """ + This class helps with making the test suit more readable, testing the roundtrip for compound + datasets that have references from Zarr to HDF5. + """ + def test_export_cpd_dset_refs(self): + self.path = [get_temp_filepath() for i in range(2)] + """Test that exporting a written container with a compound dataset with references works.""" + bazs = [] + baz_pairs = [] + num_bazs = 10 + for i in range(num_bazs): + b = Baz(name='baz%d' % i) + bazs.append(b) + baz_pairs.append((i, b)) + baz_cpd_data = BazCpdData(name='baz_cpd_data1', data=baz_pairs) + bucket = BazBucket(name='root', bazs=bazs.copy(), baz_cpd_data=baz_cpd_data) + + with ZarrIO(self.path[0], manager=get_baz_buildmanager(), mode='w') as write_io: + write_io.write(bucket) + + with ZarrIO(self.path[0], manager=get_baz_buildmanager(), mode='r') as read_io: + read_bucket1 = read_io.read() + + # NOTE: reference IDs might be the same between two identical files + # adding a Baz with a smaller name should change the reference IDs on export + new_baz = Baz(name='baz000') + read_bucket1.add_baz(new_baz) + + with HDF5IO(self.path[1], mode='w') as export_io: + export_io.export(src_io=read_io, container=read_bucket1, write_args=dict(link_data=False)) + + with HDF5IO(self.path[1], manager=get_baz_buildmanager(), mode='r') as read_io: + read_bucket2 = read_io.read() + + # remove and check the appended child, then compare the read container with the original + read_new_baz = read_bucket2.remove_baz(new_baz.name) + self.assertContainerEqual(new_baz, read_new_baz, ignore_hdmf_attrs=True) + self.assertContainerEqual(bucket, read_bucket2, ignore_name=True, ignore_hdmf_attrs=True) + for i in range(num_bazs): + baz_name = 'baz%d' % i + self.assertEqual(read_bucket2.baz_cpd_data.data[i][0], i) + self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name]) + + +class TestZarrToZarrCPD(TestCase): + """ + This class helps with making the test suit more readable, testing the roundtrip for compound + datasets that have references from Zarr to Zarr. + """ + def test_export_cpd_dset_refs(self): + self.path = [get_temp_filepath() for i in range(2)] + + """Test that exporting a written container with a compound dataset with references works.""" + bazs = [] + baz_pairs = [] + num_bazs = 10 + for i in range(num_bazs): + b = Baz(name='baz%d' % i) + bazs.append(b) + baz_pairs.append((i, b)) + baz_cpd_data = BazCpdData(name='baz_cpd_data1', data=baz_pairs) + bucket = BazBucket(name='root', bazs=bazs.copy(), baz_cpd_data=baz_cpd_data) + + with ZarrIO(self.path[0], manager=get_baz_buildmanager(), mode='w') as write_io: + write_io.write(bucket) + with ZarrIO(self.path[0], manager=get_baz_buildmanager(), mode='r') as read_io: + read_bucket1 = read_io.read() + read_bucket1.baz_cpd_data.data[0][0] + # NOTE: reference IDs might be the same between two identical files + # adding a Baz with a smaller name should change the reference IDs on export + new_baz = Baz(name='baz000') + read_bucket1.add_baz(new_baz) + + with ZarrIO(self.path[1], mode='w') as export_io: + export_io.export(src_io=read_io, container=read_bucket1, write_args=dict(link_data=False)) + + with ZarrIO(self.path[1], manager=get_baz_buildmanager(), mode='r') as read_io: + read_bucket2 = read_io.read() + # remove and check the appended child, then compare the read container with the original + read_new_baz = read_bucket2.remove_baz(new_baz.name) + self.assertContainerEqual(new_baz, read_new_baz, ignore_hdmf_attrs=True) + + self.assertContainerEqual(bucket, read_bucket2, ignore_name=True, ignore_hdmf_attrs=True) + for i in range(num_bazs): + baz_name = 'baz%d' % i + self.assertEqual(read_bucket2.baz_cpd_data.data[i][0], i) + self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name]) + + # TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source """ class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase): diff --git a/tests/unit/utils.py b/tests/unit/utils.py index 64ccc4af..67f2e8e0 100644 --- a/tests/unit/utils.py +++ b/tests/unit/utils.py @@ -9,6 +9,7 @@ SpecNamespace, NamespaceBuilder) from hdmf.spec.spec import (ZERO_OR_MANY, ONE_OR_MANY, ZERO_OR_ONE) from hdmf.utils import (docval, getargs, get_docval) +from hdmf_zarr.backend import ROOT_NAME CORE_NAMESPACE = 'test_core' @@ -113,8 +114,6 @@ class FooFile(Container): and should be reset to 'root' when use is finished to avoid potential cross-talk between tests. """ - ROOT_NAME = 'root' # For HDF5 and Zarr this is the root. It should be set before use if different for the backend. - @docval({'name': 'buckets', 'type': list, 'doc': 'the FooBuckets in this file', 'default': list()}, {'name': 'foo_link', 'type': Foo, 'doc': 'an optional linked Foo', 'default': None}, {'name': 'foofile_data', 'type': 'array_data', 'doc': 'an optional dataset', 'default': None}, @@ -123,7 +122,7 @@ class FooFile(Container): def __init__(self, **kwargs): buckets, foo_link, foofile_data, foo_ref_attr = getargs('buckets', 'foo_link', 'foofile_data', 'foo_ref_attr', kwargs) - super().__init__(name=self.ROOT_NAME) # name is not used - FooFile should be the root container + super().__init__(name=ROOT_NAME) # name is not used - FooFile should be the root container self.__buckets = {b.name: b for b in buckets} # note: collections of groups are unordered in HDF5 for f in buckets: f.parent = self @@ -306,8 +305,7 @@ class BazCpdData(Data): class BazBucket(Container): - - @docval({'name': 'name', 'type': str, 'doc': 'the name of this bucket'}, + @docval({'name': 'name', 'type': str, 'doc': 'the name of this bucket', 'default': ROOT_NAME}, {'name': 'bazs', 'type': list, 'doc': 'the Baz objects in this bucket'}, {'name': 'baz_data', 'type': BazData, 'doc': 'dataset of Baz references', 'default': None}, {'name': 'baz_cpd_data', 'type': BazCpdData, 'doc': 'dataset of Baz references', 'default': None}) diff --git a/tox.ini b/tox.ini index 9e98f41b..720a97f5 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py37, py38, py39, py310, py311 +envlist = py38, py39, py310, py311 requires = pip >= 22.0 [testenv] @@ -61,9 +61,9 @@ deps = -rrequirements-opt.txt commands = {[testenv]commands} -# Test with python 3.7; pinned dev reqs; minimum run reqs -[testenv:py37-minimum] -basepython = python3.7 +# Test with python 3.8; pinned dev reqs; minimum run reqs +[testenv:py38-minimum] +basepython = python3.8 deps = -rrequirements-dev.txt -rrequirements-min.txt @@ -75,10 +75,6 @@ commands = python -m pip install --upgrade build python -m build -[testenv:build-py37] -basepython = python3.7 -commands = {[testenv:build]commands} - [testenv:build-py38] basepython = python3.8 commands = {[testenv:build]commands} @@ -120,8 +116,8 @@ deps = -rrequirements-opt.txt commands = {[testenv:build]commands} -[testenv:build-py37-minimum] -basepython = python3.7 +[testenv:build-py38-minimum] +basepython = python3.8 deps = -rrequirements-dev.txt -rrequirements-min.txt @@ -187,9 +183,9 @@ deps = -rrequirements-opt.txt commands = {[testenv:gallery]commands} -# Test with python 3.7; pinned dev and doc reqs; minimum run reqs -[testenv:gallery-py37-minimum] -basepython = python3.7 +# Test with python 3.8; pinned dev and doc reqs; minimum run reqs +[testenv:gallery-py38-minimum] +basepython = python3.8 deps = -rrequirements-dev.txt -rrequirements-min.txt