From 2fbe8deff1c8d79f3125def9d930d19c213f03f6 Mon Sep 17 00:00:00 2001 From: Ian Thomas Date: Thu, 17 Aug 2023 14:15:32 +0100 Subject: [PATCH] Support cache mapper that is basename plus fixed number of parent directories (#1318) --- fsspec/implementations/cache_mapper.py | 33 ++++++-- fsspec/implementations/cached.py | 36 ++++++-- fsspec/implementations/tests/test_cached.py | 91 +++++++++++++++++++-- 3 files changed, 141 insertions(+), 19 deletions(-) diff --git a/fsspec/implementations/cache_mapper.py b/fsspec/implementations/cache_mapper.py index f9ee29ac2..000ccebc8 100644 --- a/fsspec/implementations/cache_mapper.py +++ b/fsspec/implementations/cache_mapper.py @@ -2,9 +2,10 @@ import abc import hashlib -import os from typing import TYPE_CHECKING +from fsspec.implementations.local import make_path_posix + if TYPE_CHECKING: from typing import Any @@ -30,14 +31,36 @@ def __hash__(self) -> int: class BasenameCacheMapper(AbstractCacheMapper): - """Cache mapper that uses the basename of the remote URL. + """Cache mapper that uses the basename of the remote URL and a fixed number + of directory levels above this. - Different paths with the same basename will therefore have the same cached - basename. + The default is zero directory levels, meaning different paths with the same + basename will have the same cached basename. """ + def __init__(self, directory_levels: int = 0): + if directory_levels < 0: + raise ValueError( + "BasenameCacheMapper requires zero or positive directory_levels" + ) + self.directory_levels = directory_levels + + # Separator for directories when encoded as strings. + self._separator = "_@_" + def __call__(self, path: str) -> str: - return os.path.basename(path) + path = make_path_posix(path) + prefix, *bits = path.rsplit("/", self.directory_levels + 1) + if bits: + return self._separator.join(bits) + else: + return prefix # No separator found, simple filename + + def __eq__(self, other: Any) -> bool: + return super().__eq__(other) and self.directory_levels == other.directory_levels + + def __hash__(self) -> int: + return super().__hash__() ^ hash(self.directory_levels) class HashCacheMapper(AbstractCacheMapper): diff --git a/fsspec/implementations/cached.py b/fsspec/implementations/cached.py index c47c3b290..30aeb119d 100644 --- a/fsspec/implementations/cached.py +++ b/fsspec/implementations/cached.py @@ -8,7 +8,7 @@ import tempfile import time from shutil import rmtree -from typing import Any, ClassVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar from fsspec import AbstractFileSystem, filesystem from fsspec.callbacks import _DEFAULT_CALLBACK @@ -19,6 +19,9 @@ from fsspec.spec import AbstractBufferedFile from fsspec.utils import infer_compression +if TYPE_CHECKING: + from fsspec.implementations.cache_mapper import AbstractCacheMapper + logger = logging.getLogger("fsspec.cached") @@ -53,8 +56,9 @@ def __init__( expiry_time=604800, target_options=None, fs=None, - same_names=False, + same_names: bool | None = None, compression=None, + cache_mapper: AbstractCacheMapper | None = None, **kwargs, ): """ @@ -84,13 +88,19 @@ def __init__( fs: filesystem instance The target filesystem to run against. Provide this or ``protocol``. same_names: bool (optional) - By default, target URLs are hashed, so that files from different backends - with the same basename do not conflict. If this is true, the original - basename is used. + By default, target URLs are hashed using a ``HashCacheMapper`` so + that files from different backends with the same basename do not + conflict. If this argument is ``true``, a ``BasenameCacheMapper`` + is used instead. Other cache mapper options are available by using + the ``cache_mapper`` keyword argument. Only one of this and + ``cache_mapper`` should be specified. compression: str (optional) To decompress on download. Can be 'infer' (guess from the URL name), one of the entries in ``fsspec.compression.compr``, or None for no decompression. + cache_mapper: AbstractCacheMapper (optional) + The object use to map from original filenames to cached filenames. + Only one of this and ``same_names`` should be specified. """ super().__init__(**kwargs) if fs is None and target_protocol is None: @@ -115,7 +125,19 @@ def __init__( self.check_files = check_files self.expiry = expiry_time self.compression = compression - self._mapper = create_cache_mapper(same_names) + + if same_names is not None and cache_mapper is not None: + raise ValueError( + "Cannot specify both same_names and cache_mapper in " + "CachingFileSystem.__init__" + ) + if cache_mapper is not None: + self._mapper = cache_mapper + else: + self._mapper = create_cache_mapper( + same_names if same_names is not None else False + ) + self.target_protocol = ( target_protocol if isinstance(target_protocol, str) @@ -128,7 +150,7 @@ def _strip_protocol(path): # acts as a method, since each instance has a difference target return self.fs._strip_protocol(type(self)._strip_protocol(path)) - self._strip_protocol = _strip_protocol + self._strip_protocol: Callable = _strip_protocol def _mkcache(self): os.makedirs(self.storage[-1], exist_ok=True) diff --git a/fsspec/implementations/tests/test_cached.py b/fsspec/implementations/tests/test_cached.py index d8295e778..19ac0975a 100644 --- a/fsspec/implementations/tests/test_cached.py +++ b/fsspec/implementations/tests/test_cached.py @@ -8,7 +8,11 @@ import fsspec from fsspec.compression import compr from fsspec.exceptions import BlocksizeMismatchError -from fsspec.implementations.cache_mapper import create_cache_mapper +from fsspec.implementations.cache_mapper import ( + BasenameCacheMapper, + HashCacheMapper, + create_cache_mapper, +) from fsspec.implementations.cached import CachingFileSystem, LocalTempFile from fsspec.implementations.local import make_path_posix @@ -36,10 +40,20 @@ def local_filecache(): def test_mapper(): mapper0 = create_cache_mapper(True) + assert mapper0("somefile") == "somefile" + assert mapper0("/somefile") == "somefile" assert mapper0("/somedir/somefile") == "somefile" assert mapper0("/otherdir/somefile") == "somefile" mapper1 = create_cache_mapper(False) + assert ( + mapper1("somefile") + == "dd00b9487898b02555b6a2d90a070586d63f93e80c70aaa60c992fa9e81a72fe" + ) + assert ( + mapper1("/somefile") + == "884c07bc2efe65c60fb9d280a620e7f180488718fb5d97736521b7f9cf5c8b37" + ) assert ( mapper1("/somedir/somefile") == "67a6956e5a5f95231263f03758c1fd9254fdb1c564d311674cec56b0372d2056" @@ -57,9 +71,47 @@ def test_mapper(): assert hash(create_cache_mapper(True)) == hash(mapper0) assert hash(create_cache_mapper(False)) == hash(mapper1) - -@pytest.mark.parametrize("same_names", [False, True]) -def test_metadata(tmpdir, same_names): + with pytest.raises( + ValueError, + match="BasenameCacheMapper requires zero or positive directory_levels", + ): + BasenameCacheMapper(-1) + + mapper2 = BasenameCacheMapper(1) + assert mapper2("/somefile") == "somefile" + assert mapper2("/somedir/somefile") == "somedir_@_somefile" + assert mapper2("/otherdir/somefile") == "otherdir_@_somefile" + assert mapper2("/dir1/dir2/dir3/somefile") == "dir3_@_somefile" + + assert mapper2 != mapper0 + assert mapper2 != mapper1 + assert BasenameCacheMapper(1) == mapper2 + + assert hash(mapper2) != hash(mapper0) + assert hash(mapper2) != hash(mapper1) + assert hash(BasenameCacheMapper(1)) == hash(mapper2) + + mapper3 = BasenameCacheMapper(2) + assert mapper3("/somefile") == "somefile" + assert mapper3("/somedir/somefile") == "somedir_@_somefile" + assert mapper3("/otherdir/somefile") == "otherdir_@_somefile" + assert mapper3("/dir1/dir2/dir3/somefile") == "dir2_@_dir3_@_somefile" + + assert mapper3 != mapper0 + assert mapper3 != mapper1 + assert mapper3 != mapper2 + assert BasenameCacheMapper(2) == mapper3 + + assert hash(mapper3) != hash(mapper0) + assert hash(mapper3) != hash(mapper1) + assert hash(mapper3) != hash(mapper2) + assert hash(BasenameCacheMapper(2)) == hash(mapper3) + + +@pytest.mark.parametrize( + "cache_mapper", [BasenameCacheMapper(), BasenameCacheMapper(1), HashCacheMapper()] +) +def test_metadata(tmpdir, cache_mapper): source = os.path.join(tmpdir, "source") afile = os.path.join(source, "afile") os.mkdir(source) @@ -69,7 +121,7 @@ def test_metadata(tmpdir, same_names): "filecache", target_protocol="file", cache_storage=os.path.join(tmpdir, "cache"), - same_names=same_names, + cache_mapper=cache_mapper, ) with fs.open(afile, "rb") as f: @@ -85,8 +137,33 @@ def test_metadata(tmpdir, same_names): assert detail["original"] == afile_posix assert detail["fn"] == fs._mapper(afile_posix) - if same_names: - assert detail["fn"] == "afile" + + if isinstance(cache_mapper, BasenameCacheMapper): + if cache_mapper.directory_levels == 0: + assert detail["fn"] == "afile" + else: + assert detail["fn"] == "source_@_afile" + + +def test_constructor_kwargs(tmpdir): + fs = fsspec.filesystem("filecache", target_protocol="file", same_names=True) + assert isinstance(fs._mapper, BasenameCacheMapper) + + fs = fsspec.filesystem("filecache", target_protocol="file", same_names=False) + assert isinstance(fs._mapper, HashCacheMapper) + + fs = fsspec.filesystem("filecache", target_protocol="file") + assert isinstance(fs._mapper, HashCacheMapper) + + with pytest.raises( + ValueError, match="Cannot specify both same_names and cache_mapper" + ): + fs = fsspec.filesystem( + "filecache", + target_protocol="file", + cache_mapper=HashCacheMapper(), + same_names=True, + ) def test_idempotent():