Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement lindi.File, drop-in replacement for h5py.File that supports .lindi.json files based on extension (4) #60

Merged
merged 15 commits into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
*.lindi.json
*.lindi.json*
rly marked this conversation as resolved.
Show resolved Hide resolved
*.nwb

.coverage
Expand Down
34 changes: 34 additions & 0 deletions lindi/File/File.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from typing import Literal
import os
import h5py
from ..LindiH5pyFile.LindiH5pyFile import LindiH5pyFile
from ..LindiStagingStore.StagingArea import StagingArea
from ..LocalCache.LocalCache import LocalCache


class File(h5py.File):
"""
A drop-in replacement for h5py.File that is either a lindi.LindiH5pyFile or
h5py.File depending on whether the file name ends with .lindi.json or not.
"""
def __new__(cls, name, mode: Literal['r', 'r+', 'w', 'w-', 'x', 'a'] = 'r', **kwds):
if isinstance(name, str) and name.endswith('.lindi.json'):
# should we raise exceptions on select unsupported kwds? or just go with the flow?
if mode != 'r':
staging_area = StagingArea.create(dir=name + '.d')
else:
staging_area = None
local_cache_dir = os.environ.get('LINDI_LOCAL_CACHE_DIR', None)
if local_cache_dir is not None:
local_cache = LocalCache(cache_dir=local_cache_dir)
else:
local_cache = None

return LindiH5pyFile.from_lindi_file(
name,
mode=mode,
staging_area=staging_area,
local_cache=local_cache
)
else:
return h5py.File(name, mode=mode, **kwds)
Empty file added lindi/File/__init__.py
Empty file.
7 changes: 6 additions & 1 deletion lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,12 @@ def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str):
)
if h5_item.chunks is not None:
# Get the byte range in the file for the chunk.
byte_offset, byte_count = _get_chunk_byte_range(h5_item, chunk_coords)
try:
byte_offset, byte_count = _get_chunk_byte_range(h5_item, chunk_coords)
except Exception as e:
raise Exception(
f"Error getting byte range for chunk {key_parent}/{key_name}. Shape: {h5_item.shape}, Chunks: {h5_item.chunks}, Chunk coords: {chunk_coords}: {e}"
)
else:
# In this case (contiguous dataset), we need to check that the chunk
# coordinates are (0, 0, 0, ...)
Expand Down
13 changes: 11 additions & 2 deletions lindi/LindiH5pyFile/LindiH5pyFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ def from_lindi_file(url_or_path: str, *, mode: LindiFileMode = "r", staging_area

For a description of parameters, see from_reference_file_system().
"""
if local_file_path is None:
if not url_or_path.startswith("http://") and not url_or_path.startswith("https://"):
local_file_path = url_or_path
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set the local file path (the one that can get edited) to be equal to the url_or_path in the case that it is not provided and is not a url

return LindiH5pyFile.from_reference_file_system(url_or_path, mode=mode, staging_area=staging_area, local_cache=local_cache, local_file_path=local_file_path)

@staticmethod
Expand Down Expand Up @@ -166,7 +169,7 @@ def from_reference_file_system(rfs: Union[dict, str, None], *, mode: LindiFileMo
store = LindiReferenceFileSystemStore(rfs, local_cache=local_cache)
if staging_area:
store = LindiStagingStore(base_store=store, staging_area=staging_area)
return LindiH5pyFile.from_zarr_store(store, mode=mode, local_file_path=local_file_path)
return LindiH5pyFile.from_zarr_store(store, mode=mode, local_file_path=local_file_path, local_cache=local_cache)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was accidentally omitted.

else:
raise Exception(f"Unhandled type for rfs: {type(rfs)}")

Expand Down Expand Up @@ -279,18 +282,24 @@ def upload(
_write_rfs_to_file(rfs=rfs, output_file_name=rfs_fname)
return on_upload_main(rfs_fname)

def write_lindi_file(self, filename: str):
def write_lindi_file(self, filename: str, *, generation_metadata: Union[dict, None] = None):
"""
Write the reference file system to a .lindi.json file.

Parameters
----------
filename : str
The filename to write to. It must end with '.lindi.json'.
generation_metadata : Union[dict, None], optional
The optional generation metadata to include in the reference file
system, by default None. This information dict is simply set to the
'generationMetadata' key in the reference file system.
"""
if not filename.endswith(".lindi.json"):
raise Exception("Filename must end with '.lindi.json'")
rfs = self.to_reference_file_system()
if generation_metadata is not None:
rfs['generationMetadata'] = generation_metadata
_write_rfs_to_file(rfs=rfs, output_file_name=filename)

@property
Expand Down
14 changes: 8 additions & 6 deletions lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,22 +49,24 @@ class LindiReferenceFileSystemStore(ZarrStore):

We also support the use of templates as in fsspec, but do not support the
full jinja2 templating. There may be an optional "templates" key in the
dictionary, which is a dictionary of template strings. For example,
{
dictionary, which is a dictionary of template strings. For example, {
"templates": {"u1": "https://some/url", "u2": "https://some/other/url"},
"refs": {
... "/some/key/0": [
"{{u1}}" 0, 100
],
...
}
}
In this case, the "{{u1}}" will be replaced with the value of the "u1"
} In this case, the "{{u1}}" will be replaced with the value of the "u1"
template string.

Optionally, the reference file system may contain a "generationMetadata"
key, which is a dictionary of metadata about the generation of the reference
file system. This metadata is not used by this class, but could be by other
software. See LindiH5pyFile.write_lindi_file(...)

It is okay for rfs to be modified outside of this class, and the changes
will be reflected immediately in the store. This can be used by experimental
tools such as lindi-cloud.
will be reflected immediately in the store.
"""
def __init__(self, rfs: dict, *, mode: Literal["r", "r+"] = "r+", local_cache: Union[LocalCache, None] = None):
"""
Expand Down
23 changes: 17 additions & 6 deletions lindi/LindiStagingStore/StagingArea.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from typing import Union
import os
import random
import string
Expand All @@ -21,17 +22,27 @@ def __init__(self, *, _directory: str) -> None:
self._directory = os.path.abspath(_directory)

@staticmethod
def create(base_dir: str) -> 'StagingArea':
def create(*, base_dir: Union[str, None] = None, dir: Union[str, None] = None) -> 'StagingArea':
"""
Create a new staging area.
Create a new staging area. Provide either `base_dir` or `dir`, but not
both.

Parameters
----------
base_dir : str
The base directory where the staging area will be created. The
staging directory will be a subdirectory of this directory.
base_dir : str or None
If provided, the base directory where the staging area will be
created. The staging directory will be a subdirectory of this
directory.
dir : str or None
If provided, the exact directory where the staging area will be
created. It is okay if this directory already exists.
"""
dir = os.path.join(base_dir, _create_random_id())
if base_dir is not None and dir is not None:
raise ValueError("Provide either base_dir or dir, but not both")
if base_dir is not None:
dir = os.path.join(base_dir, _create_random_id())
if dir is None:
raise ValueError("Provide either base_dir or dir")
Comment on lines +25 to +45
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either supply a dir or a base_dir

return StagingArea(_directory=dir)

def cleanup(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions lindi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401
from .LindiStagingStore import LindiStagingStore, StagingArea # noqa: F401
from .LocalCache.LocalCache import LocalCache # noqa: F401
from .File.File import File # noqa: F401
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "lindi"
version = "0.3.0"
version = "0.3.1"
description = ""
authors = [
"Jeremy Magland <[email protected]>",
Expand Down
23 changes: 23 additions & 0 deletions tests/test_lindi_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import tempfile
import numpy as np
import h5py
import lindi


def test_lindi_file():
with tempfile.TemporaryDirectory() as tmpdir:
fname = f'{tmpdir}/test.lindi.json'
with lindi.File(fname, 'w') as f:
f.create_dataset('data', data=np.arange(500000, dtype=np.uint32), chunks=(100000,))

with lindi.File(fname, 'r') as f:
ds = f['data']
assert isinstance(ds, h5py.Dataset)
assert ds.shape == (500000,)
assert ds.chunks == (100000,)
assert ds.dtype == np.uint32
assert np.all(ds[:] == np.arange(500000, dtype=np.uint32))


if __name__ == '__main__':
test_lindi_file()
2 changes: 1 addition & 1 deletion tests/test_staging_area.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def test_staging_area():
with tempfile.TemporaryDirectory() as tmpdir:
staging_area = lindi.StagingArea.create(tmpdir + '/staging_area')
staging_area = lindi.StagingArea.create(base_dir=tmpdir + '/staging_area')
client = lindi.LindiH5pyFile.from_reference_file_system(None, mode='r+', staging_area=staging_area)
X = np.random.randn(1000, 1000).astype(np.float32)
client.create_dataset('large_array', data=X, chunks=(400, 400))
Expand Down