magland committed Apr 4, 2024
2 parents fc1421f + b2c2788 commit ed31ef3
Showing 40 changed files with 1,960 additions and 615 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
*.zarr.json
*.nwb

.coverage

2 changes: 1 addition & 1 deletion README.md
@@ -13,7 +13,7 @@ LINDI features include:
- A specification for representing arbitrary HDF5 files as Zarr stores. This handles scalar datasets, references, soft links, and compound data types for datasets.
- A Zarr wrapper for remote or local HDF5 files (LindiH5ZarrStore). Remote data chunks are represented as pointers into the remote file rather than being copied.
- A function for generating a reference file system .zarr.json file from a Zarr store. This is inspired by [kerchunk](https://github.com/fsspec/kerchunk).
- An h5py-like interface for accessing these Zarr stores that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/).
- An h5py-like interface for accessing these Zarr stores that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/). Both read and write operations are supported.

This project was inspired by [kerchunk](https://github.com/fsspec/kerchunk) and [hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/index.html) and depends on [zarr](https://zarr.readthedocs.io/en/stable/), [h5py](https://www.h5py.org/), [remfile](https://github.com/magland/remfile) and [numcodecs](https://numcodecs.readthedocs.io/en/stable/).
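For a concrete picture of the h5py-like interface, here is a minimal read-path sketch (assuming the `lindi` and `pynwb` calls used in `examples/example_edit_nwb.py` later in this commit; the `.zarr.json` URL is borrowed from that example):

```python
import lindi
import pynwb

# URL of a .zarr.json reference file system for a remote NWB file
url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'

# Load the h5py-like client from the reference file system
client = lindi.LindiH5pyFile.from_reference_file_system(url)

# Read with pynwb as if this were a local HDF5 file
with pynwb.NWBHDF5IO(file=client, mode='r') as io:
    nwbfile = io.read()
    print(nwbfile)
```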

4 changes: 3 additions & 1 deletion devel/demonstrate_slow_get_chunk_info.py
@@ -22,7 +22,9 @@ def demonstrate_slow_get_chunk_info():
print(f"shape: {shape}") # (128000, 212, 322, 2)
print(f"chunk_shape: {chunk_shape}") # (3, 53, 81, 1)
chunk_coord_shape = [
(shape[i] + chunk_shape[i] - 1) // chunk_shape[i] for i in range(len(shape))
# the shape could be zero -- for example dandiset 000559 - acquisition/depth_video/data has shape [0, 0, 0]
(shape[i] + chunk_shape[i] - 1) // chunk_shape[i] if chunk_shape[i] != 0 else 0
for i in range(len(shape))
]
print(f"chunk_coord_shape: {chunk_coord_shape}") # [42667, 4, 4, 2]
num_chunks = np.prod(chunk_coord_shape)
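As a quick sanity check of the ceiling-division formula and the new zero-chunk-shape guard (values taken from the printed comments above):

```python
shape = (128000, 212, 322, 2)
chunk_shape = (3, 53, 81, 1)

# number of chunks along each axis, via ceiling division
chunk_coord_shape = [
    (s + c - 1) // c if c != 0 else 0
    for s, c in zip(shape, chunk_shape)
]
assert chunk_coord_shape == [42667, 4, 4, 2]

# a zero chunk shape (e.g. for a dataset of shape [0, 0, 0]) yields zero chunks
assert [
    (s + c - 1) // c if c != 0 else 0
    for s, c in zip((0, 0, 0), (0, 0, 0))
] == [0, 0, 0]
```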
110 changes: 110 additions & 0 deletions devel/test_write_nwb.py
@@ -0,0 +1,110 @@
from typing import Any

from datetime import datetime
from uuid import uuid4
import numpy as np
from dateutil.tz import tzlocal
from pynwb import NWBHDF5IO, NWBFile, H5DataIO
from pynwb.ecephys import LFP, ElectricalSeries
import zarr
import lindi

nwbfile: Any = NWBFile(
session_description="my first synthetic recording",
identifier=str(uuid4()),
session_start_time=datetime.now(tzlocal()),
experimenter=[
"Baggins, Bilbo",
],
lab="Bag End Laboratory",
institution="University of Middle Earth at the Shire",
experiment_description="I went on an adventure to reclaim vast treasures.",
session_id="LONELYMTN001",
)

device = nwbfile.create_device(
name="array", description="the best array", manufacturer="Probe Company 9000"
)

nwbfile.add_electrode_column(name="label", description="label of electrode")

nshanks = 4
nchannels_per_shank = 3
electrode_counter = 0

for ishank in range(nshanks):
# create an electrode group for this shank
electrode_group = nwbfile.create_electrode_group(
name="shank{}".format(ishank),
description="electrode group for shank {}".format(ishank),
device=device,
location="brain area",
)
# add electrodes to the electrode table
for ielec in range(nchannels_per_shank):
nwbfile.add_electrode(
group=electrode_group,
label="shank{}elec{}".format(ishank, ielec),
location="brain area",
)
electrode_counter += 1

all_table_region = nwbfile.create_electrode_table_region(
region=list(range(electrode_counter)), # reference row indices 0 to N-1
description="all electrodes",
)

raw_data = np.random.randn(300000, 100)
raw_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=H5DataIO(data=raw_data, chunks=(100000, 100)), # type: ignore
electrodes=all_table_region,
starting_time=0.0, # timestamp of the first sample in seconds relative to the session start time
rate=20000.0, # in Hz
)

nwbfile.add_acquisition(raw_electrical_series)

lfp_data = np.random.randn(50, 12)
lfp_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=lfp_data,
electrodes=all_table_region,
starting_time=0.0,
rate=200.0,
)

lfp = LFP(electrical_series=lfp_electrical_series)

ecephys_module = nwbfile.create_processing_module(
name="ecephys", description="processed extracellular electrophysiology data"
)
ecephys_module.add(lfp)

nwbfile.add_unit_column(name="quality", description="sorting quality")

firing_rate = 20
n_units = 10
res = 1000
duration = 20
for n_units_per_shank in range(n_units):
spike_times = (
np.where(np.random.rand((res * duration)) < (firing_rate / res))[0] / res
)
nwbfile.add_unit(spike_times=spike_times, quality="good")

# with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = '.'
dirname = f'{tmpdir}/test.nwb'
store = zarr.DirectoryStore(dirname)
# create a top-level group
root = zarr.group(store=store, overwrite=True)
client = lindi.LindiH5pyFile.from_zarr_store(store, mode='r+')
with NWBHDF5IO(file=client, mode='w') as io:
io.write(nwbfile) # type: ignore

store2 = zarr.DirectoryStore(dirname)
client2 = lindi.LindiH5pyFile.from_zarr_store(store2, mode='r')
with NWBHDF5IO(file=client2, mode='r') as io:
nwbfile2 = io.read() # type: ignore
print(nwbfile2)
2 changes: 1 addition & 1 deletion docs/special_zarr_annotations.md
@@ -44,7 +44,7 @@ HDF5 references can appear within both attributes and datasets. For attributes,

### `_COMPOUND_DTYPE: [['x', 'int32'], ['y', 'float64'], ...]`

Zarr arrays can represent compound data types from HDF5 datasets. The `_COMPOUND_DTYPE` attribute on a Zarr array indicates this, listing each field's name and data type. The array data should be JSON encoded, aligning with the specified compound structure. The `h5py.Reference` type is also supported within these structures, enabling references within compound data types.
Zarr arrays can represent compound data types from HDF5 datasets. The `_COMPOUND_DTYPE` attribute on a Zarr array indicates this, listing each field's name and data type. The array data should be JSON encoded, aligning with the specified compound structure. The `h5py.Reference` type is also supported within these structures (represented by the type string '<REFERENCE>').
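As an illustrative sketch (not code from this repository; assuming the zarr v2 API with an in-memory store), a compound dataset with fields `x` and `y` might be stored and annotated like this:

```python
import numpy as np
import zarr
import numcodecs

root = zarr.group()  # in-memory store
# rows are JSON encoded, matching the compound structure declared below
arr = root.create_dataset(
    'compound_data', shape=(2,), dtype=object, object_codec=numcodecs.JSON()
)
data = np.empty(2, dtype=object)
data[0] = [1, 3.14]
data[1] = [2, 2.72]
arr[:] = data
arr.attrs['_COMPOUND_DTYPE'] = [['x', 'int32'], ['y', 'float64']]
```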

## External Array Links

121 changes: 121 additions & 0 deletions examples/example_create_zarr_nwb.py
@@ -0,0 +1,121 @@
from typing import Any
import shutil
import os
import zarr
import pynwb
import lindi


def example_create_zarr_nwb():
zarr_dirname = 'example_nwb.zarr'
if os.path.exists(zarr_dirname):
shutil.rmtree(zarr_dirname)

nwbfile = _create_sample_nwb_file()

store = zarr.DirectoryStore(zarr_dirname)
zarr.group(store=store) # create a root group
with lindi.LindiH5pyFile.from_zarr_store(store, mode='r+') as client:
with pynwb.NWBHDF5IO(file=client, mode='r+') as io:
io.write(nwbfile) # type: ignore


def _create_sample_nwb_file():
from datetime import datetime
from uuid import uuid4

import numpy as np
from dateutil.tz import tzlocal

from pynwb import NWBFile
from pynwb.ecephys import LFP, ElectricalSeries

nwbfile: Any = NWBFile(
session_description="my first synthetic recording",
identifier=str(uuid4()),
session_start_time=datetime.now(tzlocal()),
experimenter=[
"Baggins, Bilbo",
],
lab="Bag End Laboratory",
institution="University of Middle Earth at the Shire",
experiment_description="I went on an adventure to reclaim vast treasures.",
session_id="LONELYMTN001",
)

device = nwbfile.create_device(
name="array", description="the best array", manufacturer="Probe Company 9000"
)

nwbfile.add_electrode_column(name="label", description="label of electrode")

nshanks = 4
nchannels_per_shank = 3
electrode_counter = 0

for ishank in range(nshanks):
# create an electrode group for this shank
electrode_group = nwbfile.create_electrode_group(
name="shank{}".format(ishank),
description="electrode group for shank {}".format(ishank),
device=device,
location="brain area",
)
# add electrodes to the electrode table
for ielec in range(nchannels_per_shank):
nwbfile.add_electrode(
group=electrode_group,
label="shank{}elec{}".format(ishank, ielec),
location="brain area",
)
electrode_counter += 1

all_table_region = nwbfile.create_electrode_table_region(
region=list(range(electrode_counter)), # reference row indices 0 to N-1
description="all electrodes",
)

raw_data = np.random.randn(50, 12)
raw_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=raw_data,
electrodes=all_table_region,
starting_time=0.0, # timestamp of the first sample in seconds relative to the session start time
rate=20000.0, # in Hz
)

nwbfile.add_acquisition(raw_electrical_series)

lfp_data = np.random.randn(50, 12)
lfp_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=lfp_data,
electrodes=all_table_region,
starting_time=0.0,
rate=200.0,
)

lfp = LFP(electrical_series=lfp_electrical_series)

ecephys_module = nwbfile.create_processing_module(
name="ecephys", description="processed extracellular electrophysiology data"
)
ecephys_module.add(lfp)

nwbfile.add_unit_column(name="quality", description="sorting quality")

firing_rate = 20
n_units = 10
res = 1000
duration = 20
for n_units_per_shank in range(n_units):
spike_times = (
np.where(np.random.rand((res * duration)) < (firing_rate / res))[0] / res
)
nwbfile.add_unit(spike_times=spike_times, quality="good")

return nwbfile


if __name__ == '__main__':
example_create_zarr_nwb()
32 changes: 32 additions & 0 deletions examples/example_edit_nwb.py
@@ -0,0 +1,32 @@
import lindi
import h5py
import pynwb


# Define the URL for a remote .zarr.json file
url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'

# Load the h5py-like client from the reference file system
client = lindi.LindiH5pyFile.from_reference_file_system(url, mode='r+')

# modify the age of the subject
subject = client['general']['subject'] # type: ignore
assert isinstance(subject, h5py.Group)
del subject['age'] # type: ignore
subject.create_dataset('age', data=b'3w')

# Create a new reference file system
rfs_new = client.to_reference_file_system()

# Optionally write to a file
# import json
# with open('new.zarr.json', 'w') as f:
# json.dump(rfs_new, f)

# Load a new h5py-like client from the new reference file system
client_new = lindi.LindiH5pyFile.from_reference_file_system(rfs_new)

# Open using pynwb and verify that the subject age has been updated
with pynwb.NWBHDF5IO(file=client_new, mode="r") as io:
nwbfile = io.read()
print(nwbfile)
41 changes: 0 additions & 41 deletions lindi/LindiH5ZarrStore/FloatJsonEncoder.py

This file was deleted.
