Merge pull request #54 from NeurodataWithoutBorders/local-cache
implement LocalCache (1)
magland authored May 9, 2024
2 parents 162f19d + eefdcff commit 5640e17
Showing 18 changed files with 689 additions and 186 deletions.
51 changes: 34 additions & 17 deletions README.md
@@ -18,7 +18,7 @@ LINDI provides:
- An h5py-like interface for reading from and writing to these data sources that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/).
- A mechanism for uploading and downloading these data sources to and from cloud storage, including DANDI.

-This project was inspired by [kerchunk](https://github.com/fsspec/kerchunk) and [hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/index.html) and depends on [zarr](https://zarr.readthedocs.io/en/stable/), [h5py](https://www.h5py.org/), [remfile](https://github.com/magland/remfile) and [numcodecs](https://numcodecs.readthedocs.io/en/stable/).
+This project was inspired by [kerchunk](https://github.com/fsspec/kerchunk) and [hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/index.html) and depends on [zarr](https://zarr.readthedocs.io/en/stable/), [h5py](https://www.h5py.org/) and [numcodecs](https://numcodecs.readthedocs.io/en/stable/).

## Installation

@@ -35,39 +35,56 @@ pip install -e .

## Use cases

* Lazy-load a remote NWB/HDF5 file for efficient access to metadata and data.
* Represent a remote NWB/HDF5 file as a .nwb.lindi.json file.
* Read a local or remote .nwb.lindi.json file using pynwb or other tools.
* Edit a .nwb.lindi.json file using pynwb or other tools.
* Add datasets to a .nwb.lindi.json file using a local staging area.
-* Upload a .nwb.lindi.json file to a cloud storage service such as DANDI.
+* Upload a .nwb.lindi.json file with staged datasets to a cloud storage service such as DANDI.

### Lazy-load a remote NWB/HDF5 file for efficient access to metadata and data

```python
import pynwb
import lindi

# URL of the remote NWB file
h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/"

# Set up a local cache
local_cache = lindi.LocalCache(cache_dir='lindi_cache')

# Create the h5py-like client
client = lindi.LindiH5pyFile.from_hdf5_file(h5_url, local_cache=local_cache)

# Open using pynwb
with pynwb.NWBHDF5IO(file=client, mode="r") as io:
    nwbfile = io.read()
    print(nwbfile)

# The downloaded data will be cached locally, so subsequent reads will be faster
```
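As an informal check that the cache is being used (a sketch, not part of the README example; timings depend on the network and the file), the same read can be timed on a cold and a warm cache:

```python
import time

# Construct the client twice with the same LocalCache; the second
# pass should be served largely from the on-disk cache (sketch)
for trial in range(2):
    t0 = time.time()
    c = lindi.LindiH5pyFile.from_hdf5_file(h5_url, local_cache=local_cache)
    with pynwb.NWBHDF5IO(file=c, mode="r") as io:
        io.read()
    print(f"trial {trial}: {time.time() - t0:.2f} s")
```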

### Represent a remote NWB/HDF5 file as a .nwb.lindi.json file

```python
import json
import pynwb
import lindi

# URL of the remote NWB file
h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/"

-# Create a read-only Zarr store as a wrapper for the h5 file
-store = lindi.LindiH5ZarrStore.from_file(h5_url)
+# Create the h5py-like client
+client = lindi.LindiH5pyFile.from_hdf5_file(h5_url)

# Generate a reference file system
-rfs = store.to_reference_file_system()
+rfs = client.to_reference_file_system()

# Save it to a file for later use
with open("example.lindi.json", "w") as f:
json.dump(rfs, f, indent=2)

-# Create an h5py-like client from the reference file system
-client = lindi.LindiH5pyFile.from_reference_file_system(rfs)
-
-# Open using pynwb
-with pynwb.NWBHDF5IO(file=client, mode="r") as io:
-    nwbfile = io.read()
-    print(nwbfile)
+# See the next example for how to read this file
```

### Read a local or remote .nwb.lindi.json file using pynwb or other tools
@@ -79,8 +96,8 @@ import lindi
# URL of the remote .nwb.lindi.json file
url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'

-# Load the h5py-like client for the reference file system
-client = lindi.LindiH5pyFile.from_reference_file_system(url)
+# Load the h5py-like client
+client = lindi.LindiH5pyFile.from_lindi_file(url)

# Open using pynwb
with pynwb.NWBHDF5IO(file=client, mode="r") as io:
@@ -121,7 +138,7 @@ url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4
# Load the h5py-like client for the reference file system
# in read-write mode with a staging area
with lindi.StagingArea.create(base_dir='lindi_staging') as staging_area:
-    client = lindi.LindiH5pyFile.from_reference_file_system(
+    client = lindi.LindiH5pyFile.from_lindi_file(
        url,
        mode="r+",
        staging_area=staging_area
@@ -130,7 +147,7 @@ with lindi.StagingArea.create(base_dir='lindi_staging') as staging_area:
    # upload the changes to the remote .nwb.lindi.json file
```
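For reference, one way to persist the edits made in the staging area is to re-export the reference file system from the client and save it, mirroring the earlier example (a sketch, assuming `to_reference_file_system()` reflects the staged changes; uploading to DANDI is covered next):

```python
import json

# Sketch: write the edited reference file system to a new local file
rfs = client.to_reference_file_system()
with open("edited.nwb.lindi.json", "w") as f:
    json.dump(rfs, f, indent=2)
```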

-### Upload a .nwb.lindi.json file to a cloud storage service such as DANDI
+### Upload a .nwb.lindi.json file with staged datasets to a cloud storage service such as DANDI

See [this example](https://github.com/magland/lindi-dandi/blob/main/devel/lindi_test_2.py).

4 changes: 2 additions & 2 deletions docs/special_zarr_annotations.md
@@ -34,7 +34,7 @@ Note that we do not currently support external links.
- `object_id`: The object_id attribute of the target object (for validation).
- `source_object_id`: The object_id attribute of the source object (for validation).

-The largely follows the [convention used by hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/storage.html#storing-object-references-in-attributes).
+This largely follows the [convention used by hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/storage.html#storing-object-references-in-attributes).

HDF5 references can appear within both attributes and datasets. For attributes, the value of the attribute is a dict in the above form. For datasets, the value of an item within the dataset is a dict in the above form.
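For illustration only, an attribute carrying a reference might look like the following (a hypothetical sketch; only `object_id` and `source_object_id` are confirmed by the excerpt above, so treat the exact key set as an assumption):

```python
# Hypothetical reference value stored in a Zarr attribute;
# the precise key set here is an assumption of this sketch
reference_value = {
    "object_id": "<object_id of the target object>",
    "source_object_id": "<object_id of the source object>",
}
```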

@@ -50,4 +50,4 @@ Zarr arrays can represent compound data types from HDF5 datasets. The `_COMPOUND_DTYPE`

### `_EXTERNAL_ARRAY_LINK = {'link_type': 'hdf5_dataset', 'url': '...', 'name': '...'}`

-For datasets with an extensive number of chunks such that inclusion in the Zarr or reference file system is impractical, LINDI uses the `_EXTERNAL_ARRAY_LINK` attribute on a Zarr array. This attribute points to an external HDF5 file, specifying the `url` for remote access (or local path) and the `name` of the target dataset within that file. When slicing that dataset, the `LindiH5pyClient` will handle data retrieval, leveraging `h5py` and `remfile` for remote access.
+For datasets with an extensive number of chunks such that inclusion in the Zarr or reference file system is impractical, LINDI uses the `_EXTERNAL_ARRAY_LINK` attribute on a Zarr array. This attribute points to an external HDF5 file, specifying the `url` for remote access (or local path) and the `name` of the target dataset within that file. When slicing that dataset, the `LindiH5pyClient` will handle data retrieval, leveraging `h5py` and `LindiRemfile` for remote access.
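Filling in the placeholders from the heading above, such an annotation might look like this (URL and dataset name are hypothetical):

```python
# _EXTERNAL_ARRAY_LINK attribute on a Zarr array (placeholder values)
external_array_link = {
    'link_type': 'hdf5_dataset',                  # per the heading above
    'url': 'https://example.org/data.nwb',        # hypothetical remote URL (or local path)
    'name': '/acquisition/ElectricalSeries/data'  # hypothetical dataset name in that file
}
```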
2 changes: 1 addition & 1 deletion examples/example2.py
@@ -5,7 +5,7 @@
url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'

# Load the h5py-like client from the reference file system
-client = lindi.LindiH5pyFile.from_reference_file_system(url)
+client = lindi.LindiH5pyFile.from_lindi_file(url)

# Open using pynwb
with pynwb.NWBHDF5IO(file=client, mode="r") as io:
2 changes: 1 addition & 1 deletion examples/example_edit_nwb.py
@@ -7,7 +7,7 @@
url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'

# Load the h5py-like client from the reference file system
-client = lindi.LindiH5pyFile.from_reference_file_system(url, mode='r+')
+client = lindi.LindiH5pyFile.from_lindi_file(url, mode='r+')

# modify the age of the subject
subject = client['general']['subject'] # type: ignore
36 changes: 31 additions & 5 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -4,7 +4,6 @@
from dataclasses import dataclass
import numpy as np
import zarr
-import remfile
from zarr.storage import Store, MemoryStore
import h5py
from ._util import (
@@ -20,6 +19,8 @@
from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs
from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data
from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore
+from ..LocalCache.LocalCache import LocalCache
+from ..LindiRemfile.LindiRemfile import LindiRemfile


@dataclass
@@ -57,7 +58,8 @@ def __init__(
        _file: Union[IO, Any],
        _opts: LindiH5ZarrStoreOpts,
        _url: Union[str, None] = None,
-        _entities_to_close: List[Any]
+        _entities_to_close: List[Any],
+        _local_cache: Union[LocalCache, None] = None
    ):
        """
        Do not call the constructor directly. Instead, use the from_file class
@@ -67,6 +69,7 @@ def __init__(
        self._h5f: Union[h5py.File, None] = h5py.File(_file, "r")
        self._url = _url
        self._opts = _opts
+        self._local_cache = _local_cache
        self._entities_to_close = _entities_to_close + [self._h5f]

        # Some datasets do not correspond to traditional chunked datasets. For
@@ -82,6 +85,7 @@ def from_file(
        *,
        opts: LindiH5ZarrStoreOpts = LindiH5ZarrStoreOpts(),
        url: Union[str, None] = None,
+        local_cache: Union[LocalCache, None] = None
    ):
        """
        Create a LindiH5ZarrStore from a file or url pointing to an HDF5 file.
@@ -99,14 +103,19 @@ def from_file(
            local file name, then you will need to set
            opts.num_dataset_chunks_threshold to None, and you will not be able
            to use the to_reference_file_system method.
+        local_cache : LocalCache or None
+            A local cache to use when reading chunks from a remote file. If None,
+            then no local cache is used.
        """
        if hdf5_file_name_or_url.startswith(
            "http://"
        ) or hdf5_file_name_or_url.startswith("https://"):
-            # note that the remfile.File object does not need to be closed
-            remf = remfile.File(hdf5_file_name_or_url, verbose=False)
-            return LindiH5ZarrStore(_file=remf, _url=hdf5_file_name_or_url, _opts=opts, _entities_to_close=[])
+            remf = LindiRemfile(hdf5_file_name_or_url, verbose=False, local_cache=local_cache)
+            return LindiH5ZarrStore(_file=remf, _url=hdf5_file_name_or_url, _opts=opts, _entities_to_close=[], _local_cache=local_cache)
        else:
+            if local_cache is not None:
+                raise Exception("local_cache cannot be used with a local file")
            f = open(hdf5_file_name_or_url, "rb")
            return LindiH5ZarrStore(_file=f, _url=url, _opts=opts, _entities_to_close=[f])

@@ -334,7 +343,24 @@ def _get_chunk_file_bytes(self, key_parent: str, key_name: str):
        else:
            assert byte_offset is not None
            assert byte_count is not None
+            if self._local_cache is not None:
+                assert self._url is not None, "Unexpected: url is None but local_cache is not None"
+                ch = self._local_cache.get_remote_chunk(
+                    url=self._url,
+                    offset=byte_offset,
+                    size=byte_count
+                )
+                if ch is not None:
+                    return ch
            buf = _read_bytes(self._file, byte_offset, byte_count)
+            if self._local_cache is not None:
+                assert self._url is not None, "Unexpected: url is None but local_cache is not None"
+                self._local_cache.put_remote_chunk(
+                    url=self._url,
+                    offset=byte_offset,
+                    size=byte_count,
+                    data=buf
+                )
            return buf

    def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str):
@@ -464,7 +490,7 @@ def listdir(self, path: str = "") -> List[str]:
    def write_reference_file_system(self, output_file_name: str):
        """Write a reference file system corresponding to this store to a file.
-        This can then be loaded using LindiH5pyFile.from_reference_file_system(file_name)
+        This can then be loaded using LindiH5pyFile.from_lindi_file(file_name)
        """

        if not output_file_name.endswith(".lindi.json"):
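Putting the new parameter together, a usage sketch based on the diff above (the URL is a placeholder; `LocalCache` and `from_file` follow the signatures shown in this commit):

```python
import lindi

# Placeholder URL of a remote HDF5/NWB file
h5_url = "https://example.org/file.nwb"

# Chunks read through the store are cached on disk and reused on later reads
local_cache = lindi.LocalCache(cache_dir='lindi_cache')
store = lindi.LindiH5ZarrStore.from_file(h5_url, local_cache=local_cache)
rfs = store.to_reference_file_system()
```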
83 changes: 0 additions & 83 deletions lindi/LindiH5pyFile/FileSegmentReader/DandiFileSegmentReader.py

This file was deleted.

35 changes: 0 additions & 35 deletions lindi/LindiH5pyFile/FileSegmentReader/FileSegmentReader.py

This file was deleted.

10 changes: 6 additions & 4 deletions lindi/LindiH5pyFile/LindiH5pyDataset.py
@@ -2,10 +2,10 @@
import numpy as np
import h5py
import zarr
-import remfile

from .LindiH5pyAttributes import LindiH5pyAttributes
from .LindiH5pyReference import LindiH5pyReference
+from ..LindiRemfile.LindiRemfile import LindiRemfile

from ..conversion.decode_references import decode_references

@@ -116,7 +116,7 @@ def dtype(self):
            # but validate seems to work only when I put in vlen = bytes
            #
            vlen = bytes
-            ret = np.dtype(str(ret), metadata={'vlen': vlen})
+            ret = np.dtype(str(ret), metadata={'vlen': vlen})  # type: ignore
        return ret

    @property
@@ -213,13 +213,15 @@ def _get_item_for_zarr(self, zarr_array: zarr.Array, selection: Any):
            # make sure selection is ()
            if selection != ():
                raise TypeError(f'Cannot slice a scalar dataset with {selection}')
-            return zarr_array[0]
+            # For some reason, with the newest version of zarr (2.18.0) we need to use [:][0] rather than just [0].
+            # Otherwise we get an error "ValueError: buffer source array is read-only"
+            return zarr_array[:][0]
        return decode_references(zarr_array[selection])

    def _get_external_hdf5_client(self, url: str) -> h5py.File:
        if url not in _external_hdf5_clients:
            if url.startswith("http://") or url.startswith("https://"):
-                ff = remfile.File(url)
+                ff = LindiRemfile(url, local_cache=self._file._local_cache)
            else:
                ff = open(url, "rb")  # this never gets closed
            _external_hdf5_clients[url] = h5py.File(ff, "r")