Repack Nwb Files #1003

Draft · pauladkisson wants to merge 37 commits into main from repack

Commits (37)
7304229
setup temp conversion script
pauladkisson Aug 12, 2024
4cc2a06
added from_existing_neurodata_object for hdf5
pauladkisson Aug 12, 2024
c33dfbf
added get_existing_dataset_io_configurations
pauladkisson Aug 12, 2024
80c1fba
added support for chunk_shape=None
pauladkisson Aug 13, 2024
7ee6fc6
added from_existing_nwbfile to HDF5BackendConfiguration
pauladkisson Aug 13, 2024
dacdeea
added get_existing_backend_configuration
pauladkisson Aug 13, 2024
dae04bf
added repack_nwbfile
pauladkisson Aug 13, 2024
4ac6e33
fixed bug with export options and hdmf.container.Container.set_data_io
pauladkisson Aug 14, 2024
ce267fb
refactored from_ methods
pauladkisson Aug 14, 2024
49f4262
template and changes optional
pauladkisson Aug 14, 2024
d93a5c5
added image series test
pauladkisson Aug 15, 2024
ab8b22f
Merge branch 'main' into repack
bendichter Aug 15, 2024
934bb3a
Merge branch 'main' into repack
pauladkisson Aug 15, 2024
1ad69ca
added initial test
pauladkisson Aug 15, 2024
04fb89c
updated signature to use file_path
pauladkisson Aug 16, 2024
6dab477
added test for trials table (fails)
pauladkisson Aug 16, 2024
e6d31a6
moved backend_configuration_changes to top of the fn
pauladkisson Aug 16, 2024
7252449
consolidated configure_and_export_nwbfile into configure_and_write_nw…
pauladkisson Aug 16, 2024
2ef5c44
parameterized for use_default_backend_configuration
pauladkisson Aug 16, 2024
80eb598
optional dci
pauladkisson Aug 19, 2024
433f8c9
added test for backend config changes
pauladkisson Aug 19, 2024
dd906ac
updated api to use boolean use_default flag instead of mode=existing
pauladkisson Aug 19, 2024
668cacc
added test for get_existing_backend_configuration
pauladkisson Aug 19, 2024
7796197
removed image_series test
pauladkisson Aug 19, 2024
b8a788c
added compressed trials table column
pauladkisson Aug 19, 2024
f631fb4
added test for get_existing_dataset_io.py
pauladkisson Aug 20, 2024
b089eb3
Merge branch 'main' into repack
pauladkisson Aug 20, 2024
c464764
added docstrings
pauladkisson Aug 20, 2024
1cf3629
used BACKEND_NWB_IO dict
pauladkisson Aug 20, 2024
481529f
added ZarrDatsetIOConfiguration.from_neurodata_object
pauladkisson Aug 20, 2024
1e6b119
Merge branch 'main' into repack
bendichter Aug 20, 2024
9f02b61
removed unnecessary indent
pauladkisson Aug 21, 2024
9ee146f
estimate buffer shape
pauladkisson Aug 21, 2024
ee7ec52
updated temp_test
pauladkisson Aug 21, 2024
a2145a1
added zarr to dataset_io tests
pauladkisson Aug 22, 2024
5785af0
added zarr to backend_configuration tests
pauladkisson Aug 22, 2024
b07c002
added zarr to repack_nwbfile tests
pauladkisson Aug 22, 2024
9 changes: 7 additions & 2 deletions src/neuroconv/tools/nwb_helpers/__init__.py
@@ -5,7 +5,9 @@

from ._backend_configuration import (
BACKEND_CONFIGURATIONS,
BACKEND_NWB_IO,
get_default_backend_configuration,
get_existing_backend_configuration,
)
from ._configuration_models import DATASET_IO_CONFIGURATIONS
from ._configuration_models._base_backend import BackendConfiguration
@@ -21,15 +23,15 @@
ZarrDatasetIOConfiguration,
)
from ._configure_backend import configure_backend
from ._dataset_configuration import get_default_dataset_io_configurations
from ._dataset_configuration import get_default_dataset_io_configurations, get_existing_dataset_io_configurations
from ._metadata_and_file_helpers import (
BACKEND_NWB_IO,
add_device_from_metadata,
configure_and_write_nwbfile,
get_default_nwbfile_metadata,
get_module,
make_nwbfile_from_metadata,
make_or_load_nwbfile,
repack_nwbfile,
)

__all__ = [
@@ -46,6 +48,8 @@
"ZarrDatasetIOConfiguration",
"get_default_backend_configuration",
"get_default_dataset_io_configurations",
"get_existing_backend_configuration",
"get_existing_dataset_io_configurations",
"configure_backend",
"get_default_dataset_io_configurations",
"get_default_backend_configuration",
@@ -55,4 +59,5 @@
"get_module",
"make_nwbfile_from_metadata",
"make_or_load_nwbfile",
"repack_nwbfile",
]
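
Taken together, these `__init__.py` changes make the repacking helpers part of the public `nwb_helpers` namespace. A minimal sketch of the imports the diff exposes (assuming this branch is installed; `repack_nwbfile`'s signature is not shown in this diff, so it is only imported here, not called):

```python
# Sketch: the names this PR adds to the public nwb_helpers namespace.
from neuroconv.tools.nwb_helpers import (
    BACKEND_NWB_IO,  # maps "hdf5"/"zarr" to their NWB IO classes
    get_existing_backend_configuration,  # new: read the configuration already on disk
    get_existing_dataset_io_configurations,  # new: per-dataset generator variant
    repack_nwbfile,  # new: rewrite a file under a chosen backend configuration
)
```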
26 changes: 25 additions & 1 deletion src/neuroconv/tools/nwb_helpers/_backend_configuration.py
@@ -2,12 +2,14 @@

from typing import Literal, Union

from pynwb import NWBFile
from hdmf_zarr import NWBZarrIO
from pynwb import NWBHDF5IO, NWBFile

from ._configuration_models._hdf5_backend import HDF5BackendConfiguration
from ._configuration_models._zarr_backend import ZarrBackendConfiguration

BACKEND_CONFIGURATIONS = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration)
BACKEND_NWB_IO = dict(hdf5=NWBHDF5IO, zarr=NWBZarrIO)


def get_default_backend_configuration(
@@ -17,3 +19,25 @@ def get_default_backend_configuration(

BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend]
return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile)


def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]:
"""Fill an existing backend configuration to serve as a starting point for further customization.

Parameters
----------
nwbfile : NWBFile
The NWBFile object to extract the backend configuration from. The nwbfile must have been read from an io object
to work properly.

Returns
-------
Union[HDF5BackendConfiguration, ZarrBackendConfiguration]
The backend configuration extracted from the nwbfile.
"""
read_io = nwbfile.read_io
for backend, io in BACKEND_NWB_IO.items():
if isinstance(read_io, io):
break
BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend]
return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile, use_default_dataset_io_configurations=False)
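
A minimal sketch of how this helper might be used (not part of the PR; the file path is a placeholder). As the docstring notes, the NWBFile must come from an open io object so that `nwbfile.read_io` can identify the backend:

```python
from pynwb import NWBHDF5IO

from neuroconv.tools.nwb_helpers import get_existing_backend_configuration

with NWBHDF5IO("existing_file.nwb", mode="r") as io:
    nwbfile = io.read()
    backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile)

    # Each entry describes how a dataset is currently chunked/compressed on disk.
    for location, dataset_configuration in backend_configuration.dataset_configurations.items():
        print(location, dataset_configuration.chunk_shape, dataset_configuration.compression_method)
```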
@@ -9,7 +9,10 @@

from ._base_dataset_io import DatasetIOConfiguration
from ._pydantic_pure_json_schema_generator import PureJSONSchemaGenerator
from .._dataset_configuration import get_default_dataset_io_configurations
from .._dataset_configuration import (
get_default_dataset_io_configurations,
get_existing_dataset_io_configurations,
)


class BackendConfiguration(BaseModel):
@@ -56,11 +59,31 @@ def model_json_schema(cls, **kwargs) -> Dict[str, Any]:
return super().model_json_schema(mode="validation", schema_generator=PureJSONSchemaGenerator, **kwargs)

@classmethod
def from_nwbfile(cls, nwbfile: NWBFile) -> Self:
default_dataset_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
def from_nwbfile(cls, nwbfile: NWBFile, use_default_dataset_io_configurations: bool = True) -> Self:
"""
Create a backend configuration from an NWBFile.

Parameters
----------
nwbfile : pynwb.NWBFile
The NWBFile object to extract the backend configuration from.
use_default_dataset_io_configurations : bool, optional
Whether to use default dataset configurations, by default True. If False, the existing dataset
configurations in the NWBFile will be used, which requires that the NWBFile was read from an io object.

Returns
-------
Self
The backend configuration extracted from the NWBFile.
"""

if use_default_dataset_io_configurations:
dataset_io_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
else:
dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
dataset_configurations = {
default_dataset_configuration.location_in_file: default_dataset_configuration
for default_dataset_configuration in default_dataset_configurations
for default_dataset_configuration in dataset_io_configurations
}

return cls(dataset_configurations=dataset_configurations)
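
A sketch contrasting the two paths of the refactored `from_nwbfile` (not part of the PR; the file path is a placeholder):

```python
from pynwb import NWBHDF5IO

from neuroconv.tools.nwb_helpers import HDF5BackendConfiguration

with NWBHDF5IO("existing_file.nwb", mode="r") as io:
    nwbfile = io.read()

    # Default path (pre-existing behavior): propose fresh chunking/compression for every dataset.
    proposed = HDF5BackendConfiguration.from_nwbfile(nwbfile=nwbfile)

    # New path: capture the chunking/compression the datasets already have on disk;
    # this requires an nwbfile that was read from an io object.
    existing = HDF5BackendConfiguration.from_nwbfile(
        nwbfile=nwbfile, use_default_dataset_io_configurations=False
    )
```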
@@ -147,7 +147,6 @@ def __str__(self) -> str:
"""
size_in_bytes = math.prod(self.full_shape) * self.dtype.itemsize
maximum_ram_usage_per_iteration_in_bytes = math.prod(self.buffer_shape) * self.dtype.itemsize
disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize

string = (
f"\n{self.location_in_file}"
@@ -159,10 +158,14 @@
f"\n buffer shape : {self.buffer_shape}"
f"\n expected RAM usage : {human_readable_size(maximum_ram_usage_per_iteration_in_bytes)}"
"\n"
f"\n chunk shape : {self.chunk_shape}"
f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}"
"\n"
)
if self.chunk_shape is not None:
disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize
string += (
f"\n chunk shape : {self.chunk_shape}"
f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}"
"\n"
)
if self.compression_method is not None:
string += f"\n compression method : {self.compression_method}"
if self.compression_options is not None:
@@ -182,9 +185,9 @@ def validate_all_shapes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
dataset_name == location_in_file.split("/")[-1]
), f"The `dataset_name` ({dataset_name}) does not match the end of the `location_in_file` ({location_in_file})!"

chunk_shape = values["chunk_shape"]
buffer_shape = values["buffer_shape"]
full_shape = values["full_shape"]
chunk_shape = values["chunk_shape"] if values["chunk_shape"] is not None else full_shape
buffer_shape = values["buffer_shape"] if values["buffer_shape"] is not None else full_shape

if len(chunk_shape) != len(buffer_shape):
raise ValueError(
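
The `chunk_shape=None` handling above amounts to the following fallback rule, restated as plain Python (a paraphrase of the validator, not code from the PR):

```python
def resolve_shapes(full_shape, chunk_shape=None, buffer_shape=None):
    """Mirror the validator: a missing chunk or buffer shape falls back to the full dataset shape."""
    chunk_shape = chunk_shape if chunk_shape is not None else full_shape
    buffer_shape = buffer_shape if buffer_shape is not None else full_shape
    if len(chunk_shape) != len(buffer_shape):
        raise ValueError("chunk_shape and buffer_shape must have the same number of axes!")
    return chunk_shape, buffer_shape

# A contiguous (unchunked) HDF5 dataset reports chunks=None and is treated as one full-shape chunk.
print(resolve_shapes(full_shape=(1000, 384)))  # -> ((1000, 384), (1000, 384))
```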
@@ -3,9 +3,13 @@
from typing import Any, Dict, Literal, Union

import h5py
import numpy as np
from hdmf import Container
from pydantic import Field, InstanceOf
from typing_extensions import Self

from ._base_dataset_io import DatasetIOConfiguration
from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile
from ...hdmf import SliceableDataChunkIterator
from ...importing import is_package_installed

_base_hdf5_filters = set(h5py.filters.decode)
@@ -78,3 +82,37 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts)

return dict(chunks=self.chunk_shape, **compression_bundle)

@classmethod
def from_neurodata_object(
cls,
neurodata_object: Container,
dataset_name: Literal["data", "timestamps"],
use_default_dataset_io_configuration: bool = True,
) -> Self:
if use_default_dataset_io_configuration:
return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name)

location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name)
full_shape = getattr(neurodata_object, dataset_name).shape
dtype = getattr(neurodata_object, dataset_name).dtype
chunk_shape = getattr(neurodata_object, dataset_name).chunks
buffer_chunk_shape = chunk_shape or full_shape
buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape(
buffer_gb=0.5, chunk_shape=buffer_chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype)
)
compression_method = getattr(neurodata_object, dataset_name).compression
compression_opts = getattr(neurodata_object, dataset_name).compression_opts
compression_options = dict(compression_opts=compression_opts)
return cls(
object_id=neurodata_object.object_id,
object_name=neurodata_object.name,
location_in_file=location_in_file,
dataset_name=dataset_name,
full_shape=full_shape,
dtype=dtype,
chunk_shape=chunk_shape,
buffer_shape=buffer_shape,
compression_method=compression_method,
compression_options=compression_options,
)
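
When `use_default_dataset_io_configuration=False`, the values above come straight from the wrapped `h5py.Dataset`. A standalone sketch of the equivalent attribute reads with plain h5py (file and dataset paths are hypothetical):

```python
import h5py
import numpy as np

with h5py.File("existing_file.nwb", mode="r") as file:
    dataset = file["acquisition/ElectricalSeries/data"]  # hypothetical dataset location

    full_shape = dataset.shape                    # -> full_shape
    dtype = np.dtype(dataset.dtype)               # -> dtype
    chunk_shape = dataset.chunks                  # -> chunk_shape (None if contiguous)
    compression_method = dataset.compression      # -> compression_method (e.g. "gzip")
    compression_opts = dataset.compression_opts   # wrapped into compression_options above
```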
@@ -1,12 +1,15 @@
"""Base Pydantic models for the ZarrDatasetConfiguration."""

from typing import Any, Dict, List, Literal, Union
from typing import Any, Dict, List, Literal, Self, Union

import numcodecs
import numpy as np
import zarr
from hdmf import Container
from pydantic import Field, InstanceOf, model_validator

from ._base_dataset_io import DatasetIOConfiguration
from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile
from ...hdmf import SliceableDataChunkIterator

_base_zarr_codecs = set(zarr.codec_registry.keys())
_lossy_zarr_codecs = set(("astype", "bitround", "quantize"))
@@ -130,3 +133,36 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
compressor = False

return dict(chunks=self.chunk_shape, filters=filters, compressor=compressor)

@classmethod
def from_neurodata_object(
cls,
neurodata_object: Container,
dataset_name: Literal["data", "timestamps"],
use_default_dataset_io_configuration: bool = True,
) -> Self:
if use_default_dataset_io_configuration:
return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name)

location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name)
full_shape = getattr(neurodata_object, dataset_name).shape
dtype = getattr(neurodata_object, dataset_name).dtype
chunk_shape = getattr(neurodata_object, dataset_name).chunks
buffer_chunk_shape = chunk_shape or full_shape
buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape(
buffer_gb=0.5, chunk_shape=buffer_chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype)
)
compression_method = getattr(neurodata_object, dataset_name).compressor
filter_methods = getattr(neurodata_object, dataset_name).filters
return cls(
object_id=neurodata_object.object_id,
object_name=neurodata_object.name,
location_in_file=location_in_file,
dataset_name=dataset_name,
full_shape=full_shape,
dtype=dtype,
chunk_shape=chunk_shape,
buffer_shape=buffer_shape,
compression_method=compression_method,
filter_methods=filter_methods,
)
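
The Zarr variant mirrors the HDF5 one but reads `.chunks`, `.compressor`, and `.filters` from the underlying `zarr.Array`. A standalone sketch with a hypothetical store and array path:

```python
import zarr

root = zarr.open("existing_file.nwb.zarr", mode="r")
array = root["acquisition/ElectricalSeries/data"]  # hypothetical array location

chunk_shape = array.chunks              # always a tuple for Zarr arrays
compression_method = array.compressor   # numcodecs codec instance, or None
filter_methods = array.filters          # list of numcodecs codecs, or None
```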
15 changes: 12 additions & 3 deletions src/neuroconv/tools/nwb_helpers/_configure_backend.py
@@ -4,6 +4,7 @@
from typing import Union

from hdmf.common import Data
from hdmf.data_utils import DataChunkIterator
from pynwb import NWBFile, TimeSeries

from ._configuration_models._hdf5_backend import HDF5BackendConfiguration
@@ -46,16 +47,24 @@ def configure_backend(

# Table columns
if isinstance(neurodata_object, Data):
neurodata_object.set_data_io(data_io_class=data_io_class, data_io_kwargs=data_io_kwargs)
neurodata_object.set_data_io(
data_io_class=data_io_class, data_io_kwargs=data_io_kwargs, data_chunk_iterator_class=DataChunkIterator
)
# TimeSeries data or timestamps
elif isinstance(neurodata_object, TimeSeries) and not is_dataset_linked:
neurodata_object.set_data_io(
dataset_name=dataset_name, data_io_class=data_io_class, data_io_kwargs=data_io_kwargs
dataset_name=dataset_name,
data_io_class=data_io_class,
data_io_kwargs=data_io_kwargs,
data_chunk_iterator_class=DataChunkIterator,
)
# Special ndx-events v0.2.0 types
elif is_ndx_events_installed and isinstance(neurodata_object, ndx_events.Events):
neurodata_object.set_data_io(
dataset_name=dataset_name, data_io_class=data_io_class, data_io_kwargs=data_io_kwargs
dataset_name=dataset_name,
data_io_class=data_io_class,
data_io_kwargs=data_io_kwargs,
data_chunk_iterator_class=DataChunkIterator,
)
# But temporarily skipping LabeledEvents
elif is_ndx_events_installed and isinstance(neurodata_object, ndx_events.LabeledEvents):
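
With a configuration in hand, `configure_backend` applies it in place before the file is rewritten; the `data_chunk_iterator_class=DataChunkIterator` additions above let datasets already backed by disk be rewrapped for iterative writing. A rough end-to-end sketch of the repack flow this enables (paths and the dataset location are placeholders; the PR's `repack_nwbfile` helper, whose diff is not shown on this page, wraps a similar sequence):

```python
from pynwb import NWBHDF5IO

from neuroconv.tools.nwb_helpers import (
    configure_backend,
    get_existing_backend_configuration,
)

with NWBHDF5IO("existing_file.nwb", mode="r") as read_io:
    nwbfile = read_io.read()

    # Start from the on-disk configuration and tweak a single dataset.
    backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile)
    dataset_configuration = backend_configuration.dataset_configurations["acquisition/ElectricalSeries/data"]
    dataset_configuration.compression_method = "gzip"

    # Wrap each dataset in the requested DataIO (via a DataChunkIterator for on-disk sources).
    configure_backend(nwbfile=nwbfile, backend_configuration=backend_configuration)

    # Export to a new file so the datasets are rewritten with the new settings.
    with NWBHDF5IO("repacked_file.nwb", mode="w") as export_io:
        export_io.export(src_io=read_io, nwbfile=nwbfile)
```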
79 changes: 79 additions & 0 deletions src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
@@ -172,3 +172,82 @@ def get_default_dataset_io_configurations(
)

yield dataset_io_configuration


def get_existing_dataset_io_configurations(
nwbfile: NWBFile,
backend: Literal["hdf5", "zarr"],
) -> Generator[DatasetIOConfiguration, None, None]:
"""
Generate DatasetIOConfiguration objects reflecting the existing on-disk configuration of each dataset in an NWBFile.

Parameters
----------
nwbfile : pynwb.NWBFile
An NWBFile object that has been read from an existing file with an existing backend configuration.
backend : "hdf5" or "zarr"
Which backend format type you would like to use in configuring each dataset's compression methods and options.

Yields
------
DatasetIOConfiguration
A configuration object for each dataset in the NWB file.
"""

DatasetIOConfigurationClass = DATASET_IO_CONFIGURATIONS[backend]

known_dataset_fields = ("data", "timestamps")
for neurodata_object in nwbfile.objects.values():
if isinstance(neurodata_object, DynamicTable):
dynamic_table = neurodata_object # For readability

for column in dynamic_table.columns:
candidate_dataset = column.data # VectorData object

# Skip over columns whose values are links, such as the 'group' of an ElectrodesTable
if any(isinstance(value, Container) for value in candidate_dataset):
continue # Skip

# Skip columns whose values are a reference type
if isinstance(column, TimeSeriesReferenceVectorData):
continue

# Skip datasets with any zero-length axes
dataset_name = "data"
candidate_dataset = getattr(column, dataset_name)
full_shape = get_data_shape(data=candidate_dataset)
if any(axis_length == 0 for axis_length in full_shape):
continue

dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object(
neurodata_object=column,
dataset_name=dataset_name,
use_default_dataset_io_configuration=False,
)

yield dataset_io_configuration
elif isinstance(neurodata_object, NWBContainer):
for known_dataset_field in known_dataset_fields:
# Skip optional fields that aren't present
if known_dataset_field not in neurodata_object.fields:
continue

candidate_dataset = getattr(neurodata_object, known_dataset_field)

# Skip the edge case of an in-memory ImageSeries with external mode; data is in fields and is an empty array
if isinstance(candidate_dataset, np.ndarray) and candidate_dataset.size == 0:
continue

# Skip datasets with any zero-length axes
candidate_dataset = getattr(neurodata_object, known_dataset_field)
full_shape = get_data_shape(data=candidate_dataset)
if any(axis_length == 0 for axis_length in full_shape):
continue

dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object(
neurodata_object=neurodata_object,
dataset_name=known_dataset_field,
use_default_dataset_io_configuration=False,
)

yield dataset_io_configuration
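
The generator can also be consumed directly for inspection (a sketch; file path as in the earlier examples):

```python
from pynwb import NWBHDF5IO

from neuroconv.tools.nwb_helpers import get_existing_dataset_io_configurations

with NWBHDF5IO("existing_file.nwb", mode="r") as io:
    nwbfile = io.read()

    # One configuration per dataset ("data"/"timestamps" fields and table columns), as currently stored.
    for dataset_io_configuration in get_existing_dataset_io_configurations(nwbfile=nwbfile, backend="hdf5"):
        print(dataset_io_configuration.location_in_file, dataset_io_configuration.chunk_shape)
```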