Preserve h5py.Dataset filter settings on export #153

Merged · 17 commits · Jan 12, 2024
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
# HDMF-ZARR Changelog

## 0.6.0 (Upcoming)

### Enhancements
* Enhanced `ZarrIO` and `ZarrDataIO` to infer I/O settings (e.g., chunking and compression) from HDF5 datasets so that storage settings are preserved on export where possible @oruebel [#153](https://github.com/hdmf-dev/hdmf-zarr/pull/153)

## 0.5.0 (December 8, 2023)

### Enhancements
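For context, a minimal sketch of the export path this enhancement targets, assuming a source HDF5 file previously written with hdmf's `HDF5IO`; the file names and the choice of the hdmf-common build manager are illustrative:

```python
from hdmf.backends.hdf5 import HDF5IO
from hdmf_zarr.backend import ZarrIO
from hdmf.common import get_manager

# Read an existing HDF5 file and export it to Zarr. With this change,
# chunking and (where a numcodecs equivalent exists) compression/filter
# settings of the HDF5 datasets are carried over to the Zarr arrays.
with HDF5IO("data.h5", manager=get_manager(), mode="r") as hdf_read_io:
    with ZarrIO("data.zarr", mode="w") as export_io:
        export_io.export(src_io=hdf_read_io, write_args={"link_data": False})
```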
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -8,3 +8,4 @@ pytest==7.1.2
pytest-cov==3.0.0
python-dateutil==2.8.2
tox==3.25.1
hdf5plugin==4.3.0 # hdf5plugin is used to test conversion of plugin filters
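As a sketch of what this dev dependency enables in the tests: writing an HDF5 dataset with a third-party plugin filter that the new conversion code can then map to a numcodecs filter. File and dataset names here are hypothetical:

```python
import h5py
import hdf5plugin  # registers plugin filters such as Zstd and Blosc with h5py
import numpy as np

with h5py.File("plugin_filters.h5", "w") as f:
    # Zstd is HDF5 filter id 32015, which hdf5_to_zarr_filters maps to numcodecs.Zstd
    f.create_dataset("data", data=np.arange(100), chunks=(10,), **hdf5plugin.Zstd(clevel=5))
```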
10 changes: 8 additions & 2 deletions src/hdmf_zarr/backend.py
@@ -344,8 +344,9 @@ def export(self, **kwargs):
)

if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True):
raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument "
"link_data=True." % src_io.__class__.__name__)
raise UnsupportedOperation(f"Cannot export from non-Zarr backend { src_io.__class__.__name__} " +
"to Zarr with write argument link_data=True. "
+ "Set write_args={'link_data': False}")

write_args['export_source'] = src_io.source # pass export_source=src_io.source to write_builder
ckwargs = kwargs.copy()
@@ -938,6 +939,11 @@ def write_dataset(self, **kwargs):  # noqa: C901
name = builder.name
data = builder.data if force_data is None else force_data
options = dict()
# Check if data is a h5py.Dataset to infer I/O settings if necessary
if ZarrDataIO.is_h5py_dataset(data):
# Wrap the h5py.Dataset in ZarrDataIO with chunking and compression settings inferred from the input data
data = ZarrDataIO.from_h5py_dataset(h5dataset=data)
# Separate data values and io_settings for write
if isinstance(data, ZarrDataIO):
options['io_settings'] = data.io_settings
link_data = data.link_data
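For reference, the same `io_settings` path can be reached by wrapping data in `ZarrDataIO` explicitly when writing; the new branch in `write_dataset` performs an equivalent wrapping automatically for `h5py.Dataset` inputs. The values below are illustrative:

```python
import numcodecs
from hdmf_zarr.utils import ZarrDataIO

# Explicit wrapping: write_dataset() picks up these settings via data.io_settings
wrapped = ZarrDataIO(
    data=list(range(100)),
    chunks=(10,),
    filters=[numcodecs.Shuffle(elementsize=8)],
)
```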
78 changes: 76 additions & 2 deletions src/hdmf_zarr/utils.py
@@ -461,13 +461,87 @@ def __init__(self, **kwargs):
self.__iosettings['filters'] = filters

@property
def link_data(self):
def link_data(self) -> bool:
"""Bool indicating should it be linked to or copied. NOTE: Only applies to zarr.Array type data"""
return self.__link_data

@property
def io_settings(self):
def io_settings(self) -> dict:
"""Dict with the io settings to use"""
return self.__iosettings

@staticmethod
def from_h5py_dataset(h5dataset, **kwargs):
"""
Factory method to create a ZarrDataIO instance from an h5py.Dataset.
The ZarrDataIO object wraps the h5py.Dataset and the io filter settings
are inferred from filters used in h5py such that the options in Zarr match
(if possible) the options used in HDF5.

:param h5dataset: h5py.Dataset object that should be wrapped
:type h5dataset: h5py.Dataset
:param kwargs: Other keyword arguments to pass to ZarrDataIO.__init__

:returns: ZarrDataIO object wrapping the dataset
"""
filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
fillval = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
if isinstance(fillval, bytes): # bytes are not JSON serializable so use string instead
fillval = fillval.decode("utf-8")
chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
re = ZarrDataIO(
data=h5dataset,
filters=filters,
fillvalue=fillval,
chunks=chunks,
**kwargs)
return re

@staticmethod
def hdf5_to_zarr_filters(h5dataset) -> list:
"""From the given h5py.Dataset infer the corresponding filters to use in Zarr"""
# Based on https://github.com/fsspec/kerchunk/blob/617d9ce06b9d02375ec0e5584541fcfa9e99014a/kerchunk/hdf.py#L181
filters = []
# Check for unsupported filters
if h5dataset.scaleoffset:
# TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
warn( f"{h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
if h5dataset.compression in ("szip", "lzf"):
warn(f"{h5dataset.name} HDF5 szip or lzf compression ignored in Zarr")
# Add the shuffle filter if possible
if h5dataset.shuffle and h5dataset.dtype.kind != "O":
# cannot use shuffle if we materialised objects
filters.append(numcodecs.Shuffle(elementsize=h5dataset.dtype.itemsize))
# iterate through all the filters and add them to the list
for filter_id, properties in h5dataset._filters.items():
filter_id_str = str(filter_id)
if filter_id_str == "32001":
blosc_compressors = ("blosclz", "lz4", "lz4hc", "snappy", "zlib", "zstd")
(_1, _2, bytes_per_num, total_bytes, clevel, shuffle, compressor) = properties
pars = dict(
blocksize=total_bytes,
clevel=clevel,
shuffle=shuffle,
cname=blosc_compressors[compressor])
filters.append(numcodecs.Blosc(**pars))
elif filter_id_str == "32015":
filters.append(numcodecs.Zstd(level=properties[0]))
elif filter_id_str == "gzip":
filters.append(numcodecs.Zlib(level=properties))
elif filter_id_str == "32004":
warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr")
elif filter_id_str == "32008":
warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
elif filter_id_str == "shuffle": # already handled above
pass
else:
warn(f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
return filters

@staticmethod
def is_h5py_dataset(obj):
"""Check if the object is an instance of h5py.Dataset without requiring import of h5py"""
return (obj.__class__.__module__, obj.__class__.__name__) == ('h5py._hl.dataset', 'Dataset')

class ZarrReference(dict):
"""
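A short sketch of how the new helpers behave, assuming an HDF5 file containing a gzip-compressed, shuffled dataset; the file and dataset names are hypothetical:

```python
import h5py
import numpy as np
from hdmf_zarr.utils import ZarrDataIO

with h5py.File("example.h5", "w") as f:
    f.create_dataset("data", data=np.arange(100), chunks=(10,),
                     shuffle=True, compression="gzip", compression_opts=2)

with h5py.File("example.h5", "r") as f:
    dset = f["data"]
    # Expected translation: [numcodecs.Shuffle(...), numcodecs.Zlib(level=2)]
    filters = ZarrDataIO.hdf5_to_zarr_filters(dset)
    # Wrap the dataset so chunking, fill value, and filters carry over on write/export
    wrapped = ZarrDataIO.from_h5py_dataset(dset)
    print(filters, wrapped.io_settings)
```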
3 changes: 2 additions & 1 deletion tests/unit/base_tests_zarrio.py
@@ -1579,7 +1579,8 @@ def close(self):

with OtherIO(manager=get_foo_buildmanager()) as read_io:
with ZarrIO(self.store[1], mode='w') as export_io:
msg = "Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True."
msg = ("Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True. "
"Set write_args={'link_data': False}")
with self.assertRaisesWith(UnsupportedOperation, msg):
export_io.export(src_io=read_io, container=foofile)

85 changes: 84 additions & 1 deletion tests/unit/test_io_convert.py
@@ -35,13 +35,14 @@
import os
import shutil
import numpy as np
import numcodecs
from abc import ABCMeta, abstractmethod

from hdmf_zarr.backend import (ZarrIO,
ROOT_NAME)
from hdmf_zarr.zarr_utils import ContainerZarrReferenceDataset

from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset
from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset, H5DataIO
from hdmf.backends.hdf5 import HDF5IO
from hdmf.common import get_manager as get_hdmfcommon_manager
from hdmf.testing import TestCase
@@ -822,6 +823,88 @@ def test_export_cpd_dset_refs(self):
self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name])


class TestHDF5toZarrWithFilters(TestCase):
"""
Test conversion from HDF5 to Zarr while preserving HDF5 filter settings
"""
def setUp(self):
self.hdf_filename = get_temp_filepath()
self.zarr_filename = get_temp_filepath()
self.out_container = None
self.read_container = None

def tearDown(self):
# release the containers read from the files
del self.out_container
del self.read_container
# clean up any opened files
for fn in [self.hdf_filename, self.zarr_filename]:
if fn is not None and os.path.exists(fn):
if os.path.isdir(fn):
shutil.rmtree(fn)
else:
os.remove(fn)

def __roundtrip_data(self, data):
"""Sets the variables self.out_container, self.read_container"""
# Create example foofile with the provided data (which may be wrapped in H5DataIO)
foo1 = Foo('foo1', data, "I am foo1", 17, 3.14)
foobucket = FooBucket('bucket1', [foo1,])
foofile = FooFile(buckets=[foobucket])
self.out_container = foofile

# write the example HDF5 file (any filter settings come from H5DataIO wrapping of the data)
with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='w') as write_io:
write_io.write(foofile, cache_spec=True)
# Export the HDF5 file to Zarr
with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='r') as hdf_read_io:
with ZarrIO(self.zarr_filename, mode='w') as export_io:
export_io.export(src_io=hdf_read_io, write_args={'link_data': False})
# read and compare the containers
with ZarrIO(self.zarr_filename, mode='r', manager=get_foo_buildmanager()) as zarr_read_io:
self.read_container = zarr_read_io.read()

def __get_data_array(self, foo_container):
"""For a container created by __roundtrip_data return the data array"""
return foo_container.buckets['bucket1'].foos['foo1'].my_data

def test_nofilters(self):
"""basic test that export without any options specified is working as expected"""
data = list(range(5))
self.__roundtrip_data(data=data)
self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)

def test_chunking(self):
"""Test that chunking is being preserved"""
outdata = H5DataIO(data=list(range(100)), chunks=(10,))
self.__roundtrip_data(data=outdata)
self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
read_array = self.__get_data_array(self.read_container)
self.assertTupleEqual((10,), read_array.chunks)

def test_shuffle(self):
"""Test that shuffle filter is being preserved"""
outdata = H5DataIO(data=list(range(100)), chunks=(10,), shuffle=True)
self.__roundtrip_data(data=outdata)
self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
read_array = self.__get_data_array(self.read_container)
self.assertEqual(len(read_array.filters), 1)
self.assertIsInstance(read_array.filters[0], numcodecs.Shuffle)
self.assertTupleEqual((10,), read_array.chunks)

def test_gzip(self):
"""Test that gzip filter is being preserved"""
outdata = H5DataIO(data=list(range(100)), chunks=(10,), compression='gzip', compression_opts=2)
self.__roundtrip_data(data=outdata)
self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
read_array = self.__get_data_array(self.read_container)
self.assertEqual(len(read_array.filters), 1)
self.assertIsInstance(read_array.filters[0], numcodecs.Zlib)
self.assertEqual(read_array.filters[0].level, 2)
self.assertTupleEqual((10,), read_array.chunks)



# TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source
"""
class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase):
Expand Down