magland committed Apr 4, 2024
2 parents fc1421f + b2c2788 commit ed31ef3
Showing 40 changed files with 1,960 additions and 615 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
*.zarr.json
*.nwb

.coverage

2 changes: 1 addition & 1 deletion README.md
@@ -13,7 +13,7 @@ LINDI features include:
- A specification for representing arbitrary HDF5 files as Zarr stores. This handles scalar datasets, references, soft links, and compound data types for datasets.
- A Zarr wrapper for remote or local HDF5 files (LindiH5ZarrStore). Remote data chunks are represented as pointers into the remote file rather than being copied.
- A function for generating a reference file system .zarr.json file from a Zarr store. This is inspired by [kerchunk](https://github.com/fsspec/kerchunk).
- An h5py-like interface for accessing these Zarr stores that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/).
- An h5py-like interface for accessing these Zarr stores that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/). Both read and write operations are supported.

This project was inspired by [kerchunk](https://github.com/fsspec/kerchunk) and [hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/index.html) and depends on [zarr](https://zarr.readthedocs.io/en/stable/), [h5py](https://www.h5py.org/), [remfile](https://github.com/magland/remfile) and [numcodecs](https://numcodecs.readthedocs.io/en/stable/).
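For a concrete picture of the h5py-like interface, here is a minimal read-path sketch (assuming the `lindi` and `pynwb` calls used in `examples/example_edit_nwb.py` later in this commit; the `.zarr.json` URL is borrowed from that example):

```python
import lindi
import pynwb

# URL of a .zarr.json reference file system for a remote NWB file
url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'

# Load the h5py-like client from the reference file system
client = lindi.LindiH5pyFile.from_reference_file_system(url)

# Read with pynwb as if this were a local HDF5 file
with pynwb.NWBHDF5IO(file=client, mode='r') as io:
    nwbfile = io.read()
    print(nwbfile)
```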

4 changes: 3 additions & 1 deletion devel/demonstrate_slow_get_chunk_info.py
@@ -22,7 +22,9 @@ def demonstrate_slow_get_chunk_info():
print(f"shape: {shape}") # (128000, 212, 322, 2)
print(f"chunk_shape: {chunk_shape}") # (3, 53, 81, 1)
chunk_coord_shape = [
(shape[i] + chunk_shape[i] - 1) // chunk_shape[i] for i in range(len(shape))
# the shape could be zero -- for example dandiset 000559 - acquisition/depth_video/data has shape [0, 0, 0]
(shape[i] + chunk_shape[i] - 1) // chunk_shape[i] if chunk_shape[i] != 0 else 0
for i in range(len(shape))
]
print(f"chunk_coord_shape: {chunk_coord_shape}") # [42667, 4, 4, 2]
num_chunks = np.prod(chunk_coord_shape)
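As a quick sanity check of the ceiling-division formula and the new zero-chunk-shape guard (values taken from the printed comments above):

```python
shape = (128000, 212, 322, 2)
chunk_shape = (3, 53, 81, 1)

# number of chunks along each axis, via ceiling division
chunk_coord_shape = [
    (s + c - 1) // c if c != 0 else 0
    for s, c in zip(shape, chunk_shape)
]
assert chunk_coord_shape == [42667, 4, 4, 2]

# a zero chunk shape (e.g. for a dataset of shape [0, 0, 0]) yields zero chunks
assert [
    (s + c - 1) // c if c != 0 else 0
    for s, c in zip((0, 0, 0), (0, 0, 0))
] == [0, 0, 0]
```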
110 changes: 110 additions & 0 deletions devel/test_write_nwb.py
@@ -0,0 +1,110 @@
from typing import Any

from datetime import datetime
from uuid import uuid4
import numpy as np
from dateutil.tz import tzlocal
from pynwb import NWBHDF5IO, NWBFile, H5DataIO
from pynwb.ecephys import LFP, ElectricalSeries
import zarr
import lindi

nwbfile: Any = NWBFile(
session_description="my first synthetic recording",
identifier=str(uuid4()),
session_start_time=datetime.now(tzlocal()),
experimenter=[
"Baggins, Bilbo",
],
lab="Bag End Laboratory",
institution="University of Middle Earth at the Shire",
experiment_description="I went on an adventure to reclaim vast treasures.",
session_id="LONELYMTN001",
)

device = nwbfile.create_device(
name="array", description="the best array", manufacturer="Probe Company 9000"
)

nwbfile.add_electrode_column(name="label", description="label of electrode")

nshanks = 4
nchannels_per_shank = 3
electrode_counter = 0

for ishank in range(nshanks):
# create an electrode group for this shank
electrode_group = nwbfile.create_electrode_group(
name="shank{}".format(ishank),
description="electrode group for shank {}".format(ishank),
device=device,
location="brain area",
)
# add electrodes to the electrode table
for ielec in range(nchannels_per_shank):
nwbfile.add_electrode(
group=electrode_group,
label="shank{}elec{}".format(ishank, ielec),
location="brain area",
)
electrode_counter += 1

all_table_region = nwbfile.create_electrode_table_region(
region=list(range(electrode_counter)), # reference row indices 0 to N-1
description="all electrodes",
)

raw_data = np.random.randn(300000, 100)
raw_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=H5DataIO(data=raw_data, chunks=(100000, 100)), # type: ignore
electrodes=all_table_region,
starting_time=0.0, # timestamp of the first sample in seconds relative to the session start time
rate=20000.0, # in Hz
)

nwbfile.add_acquisition(raw_electrical_series)

lfp_data = np.random.randn(50, 12)
lfp_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=lfp_data,
electrodes=all_table_region,
starting_time=0.0,
rate=200.0,
)

lfp = LFP(electrical_series=lfp_electrical_series)

ecephys_module = nwbfile.create_processing_module(
name="ecephys", description="processed extracellular electrophysiology data"
)
ecephys_module.add(lfp)

nwbfile.add_unit_column(name="quality", description="sorting quality")

firing_rate = 20
n_units = 10
res = 1000
duration = 20
for n_units_per_shank in range(n_units):
spike_times = (
np.where(np.random.rand((res * duration)) < (firing_rate / res))[0] / res
)
nwbfile.add_unit(spike_times=spike_times, quality="good")

# with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = '.'
dirname = f'{tmpdir}/test.nwb'
store = zarr.DirectoryStore(dirname)
# create a top-level group
root = zarr.group(store=store, overwrite=True)
client = lindi.LindiH5pyFile.from_zarr_store(store, mode='r+')
with NWBHDF5IO(file=client, mode='w') as io:
io.write(nwbfile) # type: ignore

store2 = zarr.DirectoryStore(dirname)
client2 = lindi.LindiH5pyFile.from_zarr_store(store2, mode='r')
with NWBHDF5IO(file=client2, mode='r') as io:
nwbfile2 = io.read() # type: ignore
print(nwbfile2)
2 changes: 1 addition & 1 deletion docs/special_zarr_annotations.md
@@ -44,7 +44,7 @@ HDF5 references can appear within both attributes and datasets. For attributes,

### `_COMPOUND_DTYPE: [['x', 'int32'], ['y', 'float64'], ...]`

Zarr arrays can represent compound data types from HDF5 datasets. The `_COMPOUND_DTYPE` attribute on a Zarr array indicates this, listing each field's name and data type. The array data should be JSON encoded, aligning with the specified compound structure. The `h5py.Reference` type is also supported within these structures, enabling references within compound data types.
Zarr arrays can represent compound data types from HDF5 datasets. The `_COMPOUND_DTYPE` attribute on a Zarr array indicates this, listing each field's name and data type. The array data should be JSON encoded, aligning with the specified compound structure. The `h5py.Reference` type is also supported within these structures (represented by the type string '<REFERENCE>').
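As an illustrative sketch (not code from this repository; assuming the zarr v2 API with an in-memory store), a compound dataset with fields `x` and `y` might be stored and annotated like this:

```python
import numpy as np
import zarr
import numcodecs

root = zarr.group()  # in-memory store
# rows are JSON encoded, matching the compound structure declared below
arr = root.create_dataset(
    'compound_data', shape=(2,), dtype=object, object_codec=numcodecs.JSON()
)
data = np.empty(2, dtype=object)
data[0] = [1, 3.14]
data[1] = [2, 2.72]
arr[:] = data
arr.attrs['_COMPOUND_DTYPE'] = [['x', 'int32'], ['y', 'float64']]
```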

## External Array Links

121 changes: 121 additions & 0 deletions examples/example_create_zarr_nwb.py
@@ -0,0 +1,121 @@
from typing import Any
import shutil
import os
import zarr
import pynwb
import lindi


def example_create_zarr_nwb():
zarr_dirname = 'example_nwb.zarr'
if os.path.exists(zarr_dirname):
shutil.rmtree(zarr_dirname)

nwbfile = _create_sample_nwb_file()

store = zarr.DirectoryStore(zarr_dirname)
zarr.group(store=store) # create a root group
with lindi.LindiH5pyFile.from_zarr_store(store, mode='r+') as client:
with pynwb.NWBHDF5IO(file=client, mode='r+') as io:
io.write(nwbfile) # type: ignore


def _create_sample_nwb_file():
from datetime import datetime
from uuid import uuid4

import numpy as np
from dateutil.tz import tzlocal

from pynwb import NWBFile
from pynwb.ecephys import LFP, ElectricalSeries

nwbfile: Any = NWBFile(
session_description="my first synthetic recording",
identifier=str(uuid4()),
session_start_time=datetime.now(tzlocal()),
experimenter=[
"Baggins, Bilbo",
],
lab="Bag End Laboratory",
institution="University of Middle Earth at the Shire",
experiment_description="I went on an adventure to reclaim vast treasures.",
session_id="LONELYMTN001",
)

device = nwbfile.create_device(
name="array", description="the best array", manufacturer="Probe Company 9000"
)

nwbfile.add_electrode_column(name="label", description="label of electrode")

nshanks = 4
nchannels_per_shank = 3
electrode_counter = 0

for ishank in range(nshanks):
# create an electrode group for this shank
electrode_group = nwbfile.create_electrode_group(
name="shank{}".format(ishank),
description="electrode group for shank {}".format(ishank),
device=device,
location="brain area",
)
# add electrodes to the electrode table
for ielec in range(nchannels_per_shank):
nwbfile.add_electrode(
group=electrode_group,
label="shank{}elec{}".format(ishank, ielec),
location="brain area",
)
electrode_counter += 1

all_table_region = nwbfile.create_electrode_table_region(
region=list(range(electrode_counter)), # reference row indices 0 to N-1
description="all electrodes",
)

raw_data = np.random.randn(50, 12)
raw_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=raw_data,
electrodes=all_table_region,
starting_time=0.0, # timestamp of the first sample in seconds relative to the session start time
rate=20000.0, # in Hz
)

nwbfile.add_acquisition(raw_electrical_series)

lfp_data = np.random.randn(50, 12)
lfp_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=lfp_data,
electrodes=all_table_region,
starting_time=0.0,
rate=200.0,
)

lfp = LFP(electrical_series=lfp_electrical_series)

ecephys_module = nwbfile.create_processing_module(
name="ecephys", description="processed extracellular electrophysiology data"
)
ecephys_module.add(lfp)

nwbfile.add_unit_column(name="quality", description="sorting quality")

firing_rate = 20
n_units = 10
res = 1000
duration = 20
for n_units_per_shank in range(n_units):
spike_times = (
np.where(np.random.rand((res * duration)) < (firing_rate / res))[0] / res
)
nwbfile.add_unit(spike_times=spike_times, quality="good")

return nwbfile


if __name__ == '__main__':
example_create_zarr_nwb()
32 changes: 32 additions & 0 deletions examples/example_edit_nwb.py
@@ -0,0 +1,32 @@
import lindi
import h5py
import pynwb


# Define the URL for a remote .zarr.json file
url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'

# Load the h5py-like client from the reference file system
client = lindi.LindiH5pyFile.from_reference_file_system(url, mode='r+')

# modify the age of the subject
subject = client['general']['subject'] # type: ignore
assert isinstance(subject, h5py.Group)
del subject['age'] # type: ignore
subject.create_dataset('age', data=b'3w')

# Create a new reference file system
rfs_new = client.to_reference_file_system()

# Optionally write to a file
# import json
# with open('new.zarr.json', 'w') as f:
# json.dump(rfs_new, f)

# Load a new h5py-like client from the new reference file system
client_new = lindi.LindiH5pyFile.from_reference_file_system(rfs_new)

# Open using pynwb and verify that the subject age has been updated
with pynwb.NWBHDF5IO(file=client_new, mode="r") as io:
nwbfile = io.read()
print(nwbfile)
41 changes: 0 additions & 41 deletions lindi/LindiH5ZarrStore/FloatJsonEncoder.py

This file was deleted.
