Preserve h5py.Dataset filter settings on export #153

Merged · 17 commits · Jan 12, 2024
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
# HDMF-ZARR Changelog

## 0.6.0 (Upcoming)

### Enhancements
* Enhanced `ZarrIO` and `ZarrDataIO` to infer I/O settings (e.g., chunking and compression) from HDF5 datasets so that storage settings are preserved on export where possible @oruebel [#153](https://github.com/hdmf-dev/hdmf-zarr/pull/153)

## 0.5.0 (December 8, 2023)

### Enhancements
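For context, a minimal sketch of the export path this enhancement targets, assuming a source HDF5 file previously written with hdmf's `HDF5IO`; the file names and the choice of the hdmf-common build manager are illustrative:

```python
from hdmf.backends.hdf5 import HDF5IO
from hdmf_zarr.backend import ZarrIO
from hdmf.common import get_manager

# Read an existing HDF5 file and export it to Zarr. With this change,
# chunking and (where a numcodecs equivalent exists) compression/filter
# settings of the HDF5 datasets are carried over to the Zarr arrays.
with HDF5IO("data.h5", manager=get_manager(), mode="r") as hdf_read_io:
    with ZarrIO("data.zarr", mode="w") as export_io:
        export_io.export(src_io=hdf_read_io, write_args={"link_data": False})
```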
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -8,3 +8,4 @@ pytest==7.1.2
pytest-cov==3.0.0
python-dateutil==2.8.2
tox==3.25.1
hdf5plugin==4.3.0 # hdf5plugin is used to test conversion of plugin filters
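As a sketch of what this dev dependency enables in the tests: writing an HDF5 dataset with a third-party plugin filter that the new conversion code can then map to a numcodecs filter. File and dataset names here are hypothetical:

```python
import h5py
import hdf5plugin  # registers plugin filters such as Zstd and Blosc with h5py
import numpy as np

with h5py.File("plugin_filters.h5", "w") as f:
    # Zstd is HDF5 filter id 32015, which hdf5_to_zarr_filters maps to numcodecs.Zstd
    f.create_dataset("data", data=np.arange(100), chunks=(10,), **hdf5plugin.Zstd(clevel=5))
```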
10 changes: 8 additions & 2 deletions src/hdmf_zarr/backend.py
@@ -344,8 +344,9 @@ def export(self, **kwargs):
)

if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True):
raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument "
"link_data=True." % src_io.__class__.__name__)
raise UnsupportedOperation(f"Cannot export from non-Zarr backend { src_io.__class__.__name__} " +
"to Zarr with write argument link_data=True. "
+ "Set write_args={'link_data': False}")

write_args['export_source'] = src_io.source # pass export_source=src_io.source to write_builder
ckwargs = kwargs.copy()
@@ -938,6 +939,11 @@ def write_dataset(self, **kwargs):  # noqa: C901
name = builder.name
data = builder.data if force_data is None else force_data
options = dict()
# Check if data is a h5py.Dataset to infer I/O settings if necessary
if ZarrDataIO.is_h5py_dataset(data):
# Wrap the h5py.Dataset in ZarrDataIO with chunking and compression settings inferred from the input data
data = ZarrDataIO.from_h5py_dataset(h5dataset=data)
# Separate data values and io_settings for write
if isinstance(data, ZarrDataIO):
options['io_settings'] = data.io_settings
link_data = data.link_data
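For reference, the same `io_settings` path can be reached by wrapping data in `ZarrDataIO` explicitly when writing; the new branch in `write_dataset` performs an equivalent wrapping automatically for `h5py.Dataset` inputs. The values below are illustrative:

```python
import numcodecs
from hdmf_zarr.utils import ZarrDataIO

# Explicit wrapping: write_dataset() picks up these settings via data.io_settings
wrapped = ZarrDataIO(
    data=list(range(100)),
    chunks=(10,),
    filters=[numcodecs.Shuffle(elementsize=8)],
)
```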
78 changes: 76 additions & 2 deletions src/hdmf_zarr/utils.py
@@ -461,13 +461,87 @@ def __init__(self, **kwargs):
self.__iosettings['filters'] = filters

@property
def link_data(self):
def link_data(self) -> bool:
"""Bool indicating should it be linked to or copied. NOTE: Only applies to zarr.Array type data"""
return self.__link_data

@property
def io_settings(self):
def io_settings(self) -> dict:
"""Dict with the io settings to use"""
return self.__iosettings

@staticmethod
def from_h5py_dataset(h5dataset, **kwargs):
"""
Factory method to create a ZarrDataIO instance from an h5py.Dataset.
The ZarrDataIO object wraps the h5py.Dataset and the io filter settings
are inferred from filters used in h5py such that the options in Zarr match
(if possible) the options used in HDF5.

:param h5dataset: h5py.Dataset object that should be wrapped
:type h5dataset: h5py.Dataset
:param kwargs: Other keyword arguments to pass to ZarrDataIO.__init__

:returns: ZarrDataIO object wrapping the dataset
"""
filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
fillval = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
if isinstance(fillval, bytes): # bytes are not JSON serializable so use string instead
fillval = fillval.decode("utf-8")
chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
re = ZarrDataIO(
data=h5dataset,
filters=filters,
fillvalue=fillval,
chunks=chunks,
**kwargs)
return re

@staticmethod
def hdf5_to_zarr_filters(h5dataset) -> list:
"""From the given h5py.Dataset infer the corresponding filters to use in Zarr"""
# Based on https://github.com/fsspec/kerchunk/blob/617d9ce06b9d02375ec0e5584541fcfa9e99014a/kerchunk/hdf.py#L181
filters = []
# Check for unsupported filters
if h5dataset.scaleoffset:
# TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
warn( f"{h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
if h5dataset.compression in ("szip", "lzf"):
warn(f"{h5dataset.name} HDF5 szip or lzf compression ignored in Zarr")
# Add the shuffle filter if possible
if h5dataset.shuffle and h5dataset.dtype.kind != "O":
# cannot use shuffle if we materialised objects
filters.append(numcodecs.Shuffle(elementsize=h5dataset.dtype.itemsize))
# iterate through all the filters and add them to the list
for filter_id, properties in h5dataset._filters.items():
filter_id_str = str(filter_id)
if filter_id_str == "32001":
blosc_compressors = ("blosclz", "lz4", "lz4hc", "snappy", "zlib", "zstd")
(_1, _2, bytes_per_num, total_bytes, clevel, shuffle, compressor) = properties
pars = dict(
blocksize=total_bytes,
clevel=clevel,
shuffle=shuffle,
cname=blosc_compressors[compressor])
filters.append(numcodecs.Blosc(**pars))
elif filter_id_str == "32015":
filters.append(numcodecs.Zstd(level=properties[0]))
elif filter_id_str == "gzip":
filters.append(numcodecs.Zlib(level=properties))
elif filter_id_str == "32004":
warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr")
elif filter_id_str == "32008":
warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
elif filter_id_str == "shuffle": # already handled above
pass
else:
warn(f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
return filters

@staticmethod
def is_h5py_dataset(obj):
"""Check if the object is an instance of h5py.Dataset without requiring import of h5py"""
return (obj.__class__.__module__, obj.__class__.__name__) == ('h5py._hl.dataset', 'Dataset')

class ZarrReference(dict):
"""
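A short sketch of how the new helpers behave, assuming an HDF5 file containing a gzip-compressed, shuffled dataset; the file and dataset names are hypothetical:

```python
import h5py
import numpy as np
from hdmf_zarr.utils import ZarrDataIO

with h5py.File("example.h5", "w") as f:
    f.create_dataset("data", data=np.arange(100), chunks=(10,),
                     shuffle=True, compression="gzip", compression_opts=2)

with h5py.File("example.h5", "r") as f:
    dset = f["data"]
    # Expected translation: [numcodecs.Shuffle(...), numcodecs.Zlib(level=2)]
    filters = ZarrDataIO.hdf5_to_zarr_filters(dset)
    # Wrap the dataset so chunking, fill value, and filters carry over on write/export
    wrapped = ZarrDataIO.from_h5py_dataset(dset)
    print(filters, wrapped.io_settings)
```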
3 changes: 2 additions & 1 deletion tests/unit/base_tests_zarrio.py
@@ -1579,7 +1579,8 @@ def close(self):

with OtherIO(manager=get_foo_buildmanager()) as read_io:
with ZarrIO(self.store[1], mode='w') as export_io:
msg = "Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True."
msg = ("Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True. "
"Set write_args={'link_data': False}")
with self.assertRaisesWith(UnsupportedOperation, msg):
export_io.export(src_io=read_io, container=foofile)

85 changes: 84 additions & 1 deletion tests/unit/test_io_convert.py
@@ -35,13 +35,14 @@
import os
import shutil
import numpy as np
import numcodecs
from abc import ABCMeta, abstractmethod

from hdmf_zarr.backend import (ZarrIO,
ROOT_NAME)
from hdmf_zarr.zarr_utils import ContainerZarrReferenceDataset

from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset
from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset, H5DataIO
from hdmf.backends.hdf5 import HDF5IO
from hdmf.common import get_manager as get_hdmfcommon_manager
from hdmf.testing import TestCase
@@ -822,6 +823,88 @@ def test_export_cpd_dset_refs(self):
self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name])


class TestHDF5toZarrWithFilters(TestCase):
"""
Test conversion from HDF5 to Zarr while preserving HDF5 filter settings
"""
def setUp(self):
self.hdf_filename = get_temp_filepath()
self.zarr_filename = get_temp_filepath()
self.out_container = None
self.read_container = None

def tearDown(self):
# release the containers read from the files
del self.out_container
del self.read_container
# clean up any opened files
for fn in [self.hdf_filename, self.zarr_filename]:
if fn is not None and os.path.exists(fn):
if os.path.isdir(fn):
shutil.rmtree(fn)
else:
os.remove(fn)

def __roundtrip_data(self, data):
"""Sets the variables self.out_container, self.read_container"""
# Create example foofile with the provided data (which may be wrapped in H5DataIO)
foo1 = Foo('foo1', data, "I am foo1", 17, 3.14)
foobucket = FooBucket('bucket1', [foo1,])
foofile = FooFile(buckets=[foobucket])
self.out_container = foofile

# write the example HDF5 file (any filter settings come from H5DataIO wrapping of the data)
with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='w') as write_io:
write_io.write(foofile, cache_spec=True)
# Export the HDF5 file to Zarr
with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='r') as hdf_read_io:
with ZarrIO(self.zarr_filename, mode='w') as export_io:
export_io.export(src_io=hdf_read_io, write_args={'link_data': False})
# read and compare the containers
with ZarrIO(self.zarr_filename, mode='r', manager=get_foo_buildmanager()) as zarr_read_io:
self.read_container = zarr_read_io.read()

def __get_data_array(self, foo_container):
"""For a container created by __roundtrip_data return the data array"""
return foo_container.buckets['bucket1'].foos['foo1'].my_data

def test_nofilters(self):
"""basic test that export without any options specified is working as expected"""
data = list(range(5))
self.__roundtrip_data(data=data)
self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)

def test_chunking(self):
"""Test that chunking is being preserved"""
outdata = H5DataIO(data=list(range(100)), chunks=(10,))
self.__roundtrip_data(data=outdata)
self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
read_array = self.__get_data_array(self.read_container)
self.assertTupleEqual((10,), read_array.chunks)

def test_shuffle(self):
"""Test that shuffle filter is being preserved"""
outdata = H5DataIO(data=list(range(100)), chunks=(10,), shuffle=True)
self.__roundtrip_data(data=outdata)
self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
read_array = self.__get_data_array(self.read_container)
self.assertEqual(len(read_array.filters), 1)
self.assertIsInstance(read_array.filters[0], numcodecs.Shuffle)
self.assertTupleEqual((10,), read_array.chunks)

def test_gzip(self):
"""Test that gzip filter is being preserved"""
outdata = H5DataIO(data=list(range(100)), chunks=(10,), compression='gzip', compression_opts=2)
self.__roundtrip_data(data=outdata)
self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
read_array = self.__get_data_array(self.read_container)
self.assertEqual(len(read_array.filters), 1)
self.assertIsInstance(read_array.filters[0], numcodecs.Zlib)
self.assertEqual(read_array.filters[0].level, 2)
self.assertTupleEqual((10,), read_array.chunks)



# TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source
"""
class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase):
Expand Down