Skip to content

Commit

Permalink
Better double asterisks ** support (#1329)
Browse files Browse the repository at this point in the history
  • Loading branch information
john-jam authored Aug 22, 2023
1 parent 1f12ee6 commit c3b4bc3
Show file tree
Hide file tree
Showing 17 changed files with 1,797 additions and 369 deletions.
104 changes: 67 additions & 37 deletions fsspec/asyn.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,7 @@

from .callbacks import _DEFAULT_CALLBACK
from .exceptions import FSTimeoutError
from .implementations.local import (
LocalFileSystem,
make_path_posix,
trailing_sep,
trailing_sep_maybe_asterisk,
)
from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
from .spec import AbstractBufferedFile, AbstractFileSystem
from .utils import is_exception, other_paths

Expand Down Expand Up @@ -357,14 +352,19 @@ async def _copy(
if not paths:
return

isdir = isinstance(path2, str) and (
source_is_file = len(paths) == 1
dest_is_dir = isinstance(path2, str) and (
trailing_sep(path2) or await self._isdir(path2)
)

exists = source_is_str and (
(has_magic(path1) and source_is_file)
or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
)
path2 = other_paths(
paths,
path2,
exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(path1),
is_dir=isdir,
exists=exists,
flatten=not source_is_str,
)
batch_size = batch_size or self.batch_size
Expand Down Expand Up @@ -514,15 +514,20 @@ async def _put(
if not lpaths:
return

isdir = isinstance(rpath, str) and (
source_is_file = len(lpaths) == 1
dest_is_dir = isinstance(rpath, str) and (
trailing_sep(rpath) or await self._isdir(rpath)
)

rpath = self._strip_protocol(rpath)
exists = source_is_str and (
(has_magic(lpath) and source_is_file)
or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
)
rpaths = other_paths(
lpaths,
rpath,
exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(lpath),
is_dir=isdir,
exists=exists,
flatten=not source_is_str,
)

Expand Down Expand Up @@ -571,11 +576,9 @@ async def _get(
"""
source_is_str = isinstance(rpath, str)
# First check for rpath trailing slash as _strip_protocol removes it.
source_not_trailing_sep = source_is_str and not trailing_sep_maybe_asterisk(
rpath
)
source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
rpath = self._strip_protocol(rpath)
rpaths = await self._expand_path(rpath, recursive=recursive)
rpaths = await self._expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
if source_is_str and (not recursive or maxdepth is not None):
# Non-recursive glob does not copy directories
rpaths = [
Expand All @@ -585,14 +588,19 @@ async def _get(
return

lpath = make_path_posix(lpath)
isdir = isinstance(lpath, str) and (
source_is_file = len(rpaths) == 1
dest_is_dir = isinstance(lpath, str) and (
trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
)

exists = source_is_str and (
(has_magic(rpath) and source_is_file)
or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
)
lpaths = other_paths(
rpaths,
lpath,
exists=isdir and source_not_trailing_sep,
is_dir=isdir,
exists=exists,
flatten=not source_is_str,
)
[os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
Expand Down Expand Up @@ -695,25 +703,24 @@ async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
):
yield _

async def _glob(self, path, **kwargs):
async def _glob(self, path, maxdepth=None, **kwargs):
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")

import re

ends = path.endswith("/")
path = self._strip_protocol(path)
indstar = path.find("*") if path.find("*") >= 0 else len(path)
indques = path.find("?") if path.find("?") >= 0 else len(path)
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

ind = min(indstar, indques, indbrace)
min_idx = min(idx_star, idx_qmark, idx_brace)

detail = kwargs.pop("detail", False)

if not has_magic(path):
root = path
depth = 1
if ends:
path += "/*"
elif await self._exists(path):
if await self._exists(path):
if not detail:
return [path]
else:
Expand All @@ -723,13 +730,21 @@ async def _glob(self, path, **kwargs):
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:ind]:
ind2 = path[:ind].rindex("/")
root = path[: ind2 + 1]
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
elif "/" in path[:min_idx]:
min_idx = path[:min_idx].rindex("/")
root = path[: min_idx + 1]
depth = path[min_idx + 1 :].count("/") + 1
else:
root = ""
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
depth = path[min_idx + 1 :].count("/") + 1

if "**" in path:
if maxdepth is not None:
idx_double_stars = path.find("**")
depth_double_stars = path[idx_double_stars:].count("/") + 1
depth = depth - depth_double_stars + maxdepth
else:
depth = None

allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
Expand Down Expand Up @@ -757,14 +772,23 @@ async def _glob(self, path, **kwargs):
)
+ "$"
)
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
pattern = re.sub("[*]", "[^/]*", pattern)
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
pattern = re.compile(pattern)
out = {
p: allpaths[p]
for p in sorted(allpaths)
if pattern.match(p.replace("//", "/").rstrip("/"))
}

# Return directories only when the glob ends with a slash
# This is needed for posix glob compliance
if ends:
out = {k: v for k, v in out.items() if v["type"] == "directory"}

if detail:
return out
else:
Expand All @@ -785,6 +809,12 @@ async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
path = self._strip_protocol(path)
out = dict()
detail = kwargs.pop("detail", False)

# Add the root directory if withdirs is requested
# This is needed for posix glob compliance
if withdirs and path != "" and await self._isdir(path):
out[path] = await self._info(path)

# async for?
async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
if withdirs:
Expand All @@ -811,7 +841,7 @@ async def _expand_path(self, path, recursive=False, maxdepth=None):
path = [self._strip_protocol(p) for p in path]
for p in path: # can gather here
if has_magic(p):
bit = set(await self._glob(p))
bit = set(await self._glob(p, maxdepth=maxdepth))
out |= bit
if recursive:
# glob call above expanded one depth so if maxdepth is defined
Expand Down
47 changes: 31 additions & 16 deletions fsspec/implementations/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,31 +431,29 @@ async def _info(self, url, **kwargs):

return {"name": url, "size": None, **info, "type": "file"}

async def _glob(self, path, **kwargs):
async def _glob(self, path, maxdepth=None, **kwargs):
"""
Find files by glob-matching.
This implementation is identical to the one in AbstractFileSystem,
but "?" is not considered as a character for globbing, because it is
so common in URLs, often identifying the "query" part.
"""
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
import re

ends = path.endswith("/")
path = self._strip_protocol(path)
indstar = path.find("*") if path.find("*") >= 0 else len(path)
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

ind = min(indstar, indbrace)
min_idx = min(idx_star, idx_brace)

detail = kwargs.pop("detail", False)

if not has_magic(path):
root = path
depth = 1
if ends:
path += "/*"
elif await self._exists(path):
if await self._exists(path):
if not detail:
return [path]
else:
Expand All @@ -465,13 +463,21 @@ async def _glob(self, path, **kwargs):
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:ind]:
ind2 = path[:ind].rindex("/")
root = path[: ind2 + 1]
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
elif "/" in path[:min_idx]:
min_idx = path[:min_idx].rindex("/")
root = path[: min_idx + 1]
depth = path[min_idx + 1 :].count("/") + 1
else:
root = ""
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
depth = path[min_idx + 1 :].count("/") + 1

if "**" in path:
if maxdepth is not None:
idx_double_stars = path.find("**")
depth_double_stars = path[idx_double_stars:].count("/") + 1
depth = depth - depth_double_stars + maxdepth
else:
depth = None

allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
Expand All @@ -498,14 +504,23 @@ async def _glob(self, path, **kwargs):
)
+ "$"
)
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
pattern = re.sub("[*]", "[^/]*", pattern)
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
pattern = re.compile(pattern)
out = {
p: allpaths[p]
for p in sorted(allpaths)
if pattern.match(p.replace("//", "/").rstrip("/"))
}

# Return directories only when the glob ends with a slash
# This is needed for posix glob compliance
if ends:
out = {k: v for k, v in out.items() if v["type"] == "directory"}

if detail:
return out
else:
Expand Down
19 changes: 0 additions & 19 deletions fsspec/implementations/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ def ls(self, path, detail=False, **kwargs):
else:
return [posixpath.join(path, f) for f in os.listdir(path)]

def glob(self, path, **kwargs):
    """Glob ``path`` after passing it through ``self._strip_protocol``.

    The stripped path and every keyword argument are forwarded to the
    parent class's ``glob``; its result is returned unchanged.
    """
    stripped = self._strip_protocol(path)
    return super().glob(stripped, **kwargs)

def info(self, path, **kwargs):
if isinstance(path, os.DirEntry):
# scandir DirEntry
Expand Down Expand Up @@ -287,21 +283,6 @@ def trailing_sep(path):
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))


def trailing_sep_maybe_asterisk(path):
    """Report whether ``path`` ends with a path separator, optionally
    followed by a single asterisk.

    ``os.sep`` always counts as a separator; ``os.altsep`` counts as well
    on platforms that define one.
    """
    # TODO: if all incoming paths were posix-compliant then separator would
    # always be a forward slash, simplifying this function.
    # See https://github.com/fsspec/filesystem_spec/pull/1250
    separators = [os.sep]
    if os.altsep is not None:
        separators.append(os.altsep)
    # Each separator is accepted either bare or followed by "*".
    endings = tuple(sep + tail for sep in separators for tail in ("", "*"))
    return path.endswith(endings)


class LocalFileOpener(io.IOBase):
def __init__(
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
Expand Down
6 changes: 5 additions & 1 deletion fsspec/implementations/tests/local/local_fixtures.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from fsspec.implementations.local import LocalFileSystem
from fsspec.implementations.local import LocalFileSystem, make_path_posix
from fsspec.tests.abstract import AbstractFixtures


Expand All @@ -12,3 +12,7 @@ def fs(self):
@pytest.fixture
def fs_path(self, tmpdir):
    """Fixture yielding the ``tmpdir`` fixture value converted to ``str``."""
    return str(tmpdir)

@pytest.fixture
def fs_sanitize_path(self):
    """Fixture returning the ``make_path_posix`` callable itself (not its result)."""
    return make_path_posix
8 changes: 5 additions & 3 deletions fsspec/implementations/tests/test_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,9 @@ def test_globfind_dirs(tmpdir):
fs.glob(tmpdir + "/dir/*", detail=True)[tmpdir + "/dir/afile"]["type"] == "file"
)
assert [tmpdir + "/dir/afile"] == fs.find(tmpdir)
assert [tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(tmpdir, withdirs=True)
assert [tmpdir, tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(
tmpdir, withdirs=True
)


def test_touch(tmpdir):
Expand Down Expand Up @@ -952,12 +954,12 @@ def test_cp_get_put_empty_directory(tmpdir, funcname):
# cp/get/put without slash, target directory exists
assert fs.isdir(target)
func(empty, target)
assert fs.find(target, withdirs=True) == []
assert fs.find(target, withdirs=True) == [make_path_posix(target)]

# cp/get/put with slash, target directory exists
assert fs.isdir(target)
func(empty + "/", target)
assert fs.find(target, withdirs=True) == []
assert fs.find(target, withdirs=True) == [make_path_posix(target)]

fs.rmdir(target)

Expand Down
4 changes: 2 additions & 2 deletions fsspec/implementations/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,12 +316,12 @@ def test_cp_empty_directory(m):
# cp without slash, target directory exists
assert m.isdir(target)
m.cp(empty, target)
assert m.find(target, withdirs=True) == []
assert m.find(target, withdirs=True) == [target]

# cp with slash, target directory exists
assert m.isdir(target)
m.cp(empty + "/", target)
assert m.find(target, withdirs=True) == []
assert m.find(target, withdirs=True) == [target]

m.rmdir(target)

Expand Down
Loading

0 comments on commit c3b4bc3

Please sign in to comment.