Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better double asterisks ** support #1329

Merged
merged 31 commits into from
Aug 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a85d689
chore: add posix compliance tests
john-jam Aug 6, 2023
23ece4f
feat: make glob posix compliant
john-jam Aug 6, 2023
63aeb0f
fix: remove unnecessary trailing slash removal
john-jam Aug 6, 2023
0883de9
Support the **/* case
john-jam Aug 7, 2023
d42d09a
feat: support maxdepth for globs
john-jam Aug 7, 2023
bd41c2f
chore: refactor tests and add maxdepth tests
john-jam Aug 7, 2023
1c1aa0d
fix: pass the maxdepth option to glob in expand_path
john-jam Aug 4, 2023
1abb2c6
fix: remove unused is_dir option in other_paths
john-jam Aug 7, 2023
1b2463c
fix: avoid copying unwanted directories
john-jam Aug 8, 2023
31424bd
feat: refactor copy, get and put tests
john-jam Aug 8, 2023
69bb15f
feat: add missing tests
john-jam Aug 8, 2023
2dc4338
fix: fix other tests to include root dirs
john-jam Aug 8, 2023
45f828d
fix: fix other repos import
john-jam Aug 8, 2023
2f33e1b
fix: fix missing tests with root dir
john-jam Aug 8, 2023
6362058
fix: add missing async code changes
john-jam Aug 8, 2023
52b0e65
tmp: use personal repos for friends
john-jam Aug 8, 2023
b20ce62
fix: update the wront exist test case
john-jam Aug 8, 2023
8680aef
fix: update http glob implementation
john-jam Aug 8, 2023
ee514b6
fix: handle file not found when removing glob target test
john-jam Aug 8, 2023
37aabad
fix: test empty ls results only against fs that supports empty dir
john-jam Aug 9, 2023
9559ab6
chore: remove unecesary fixture
john-jam Aug 9, 2023
2b24204
fix: skip bash posix tests on windows
john-jam Aug 9, 2023
8747cb5
fix: add sanitize path fixture for win tests
john-jam Aug 9, 2023
cdde7f4
fix: typo
john-jam Aug 9, 2023
7733691
fix: split glob tests into scenario and edge cases tests
john-jam Aug 13, 2023
2e555b2
chore: fix mypy config
john-jam Aug 13, 2023
c1b0739
fix: skip bash tests if bash or globstar is not available
john-jam Aug 13, 2023
69bb36a
fix: skip python glob tests on windows
john-jam Aug 13, 2023
2a089d3
Revert "tmp: use personal repos for friends"
john-jam Aug 16, 2023
85b7ae0
fix: use relative paths for posix tests
john-jam Aug 21, 2023
495d986
fix: remove comment
john-jam Aug 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 67 additions & 37 deletions fsspec/asyn.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,7 @@

from .callbacks import _DEFAULT_CALLBACK
from .exceptions import FSTimeoutError
from .implementations.local import (
LocalFileSystem,
make_path_posix,
trailing_sep,
trailing_sep_maybe_asterisk,
)
from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
from .spec import AbstractBufferedFile, AbstractFileSystem
from .utils import is_exception, other_paths

Expand Down Expand Up @@ -357,14 +352,19 @@ async def _copy(
if not paths:
return

isdir = isinstance(path2, str) and (
source_is_file = len(paths) == 1
dest_is_dir = isinstance(path2, str) and (
trailing_sep(path2) or await self._isdir(path2)
)

exists = source_is_str and (
(has_magic(path1) and source_is_file)
or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
)
path2 = other_paths(
paths,
path2,
exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(path1),
is_dir=isdir,
exists=exists,
flatten=not source_is_str,
)
batch_size = batch_size or self.batch_size
Expand Down Expand Up @@ -514,15 +514,20 @@ async def _put(
if not lpaths:
return

isdir = isinstance(rpath, str) and (
source_is_file = len(lpaths) == 1
dest_is_dir = isinstance(rpath, str) and (
trailing_sep(rpath) or await self._isdir(rpath)
)

rpath = self._strip_protocol(rpath)
exists = source_is_str and (
(has_magic(lpath) and source_is_file)
or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
)
rpaths = other_paths(
lpaths,
rpath,
exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(lpath),
is_dir=isdir,
exists=exists,
flatten=not source_is_str,
)

Expand Down Expand Up @@ -571,11 +576,9 @@ async def _get(
"""
source_is_str = isinstance(rpath, str)
# First check for rpath trailing slash as _strip_protocol removes it.
source_not_trailing_sep = source_is_str and not trailing_sep_maybe_asterisk(
rpath
)
source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
rpath = self._strip_protocol(rpath)
rpaths = await self._expand_path(rpath, recursive=recursive)
rpaths = await self._expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
if source_is_str and (not recursive or maxdepth is not None):
# Non-recursive glob does not copy directories
rpaths = [
Expand All @@ -585,14 +588,19 @@ async def _get(
return

lpath = make_path_posix(lpath)
isdir = isinstance(lpath, str) and (
source_is_file = len(rpaths) == 1
dest_is_dir = isinstance(lpath, str) and (
trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
)

exists = source_is_str and (
(has_magic(rpath) and source_is_file)
or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
)
lpaths = other_paths(
rpaths,
lpath,
exists=isdir and source_not_trailing_sep,
is_dir=isdir,
exists=exists,
flatten=not source_is_str,
)
[os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
Expand Down Expand Up @@ -695,25 +703,24 @@ async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
):
yield _

async def _glob(self, path, **kwargs):
async def _glob(self, path, maxdepth=None, **kwargs):
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")

import re

ends = path.endswith("/")
path = self._strip_protocol(path)
indstar = path.find("*") if path.find("*") >= 0 else len(path)
indques = path.find("?") if path.find("?") >= 0 else len(path)
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

ind = min(indstar, indques, indbrace)
min_idx = min(idx_star, idx_qmark, idx_brace)

detail = kwargs.pop("detail", False)

if not has_magic(path):
root = path
depth = 1
if ends:
path += "/*"
elif await self._exists(path):
if await self._exists(path):
if not detail:
return [path]
else:
Expand All @@ -723,13 +730,21 @@ async def _glob(self, path, **kwargs):
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:ind]:
ind2 = path[:ind].rindex("/")
root = path[: ind2 + 1]
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
elif "/" in path[:min_idx]:
min_idx = path[:min_idx].rindex("/")
root = path[: min_idx + 1]
depth = path[min_idx + 1 :].count("/") + 1
else:
root = ""
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
depth = path[min_idx + 1 :].count("/") + 1

if "**" in path:
if maxdepth is not None:
idx_double_stars = path.find("**")
depth_double_stars = path[idx_double_stars:].count("/") + 1
depth = depth - depth_double_stars + maxdepth
else:
depth = None

allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
Expand Down Expand Up @@ -757,14 +772,23 @@ async def _glob(self, path, **kwargs):
)
+ "$"
)
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
pattern = re.sub("[*]", "[^/]*", pattern)
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
pattern = re.compile(pattern)
out = {
p: allpaths[p]
for p in sorted(allpaths)
if pattern.match(p.replace("//", "/").rstrip("/"))
}

# Return directories only when the glob end by a slash
# This is needed for posix glob compliance
if ends:
out = {k: v for k, v in out.items() if v["type"] == "directory"}

if detail:
return out
else:
Expand All @@ -785,6 +809,12 @@ async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
path = self._strip_protocol(path)
out = dict()
detail = kwargs.pop("detail", False)

# Add the root directory if withdirs is requested
# This is needed for posix glob compliance
if withdirs and path != "" and await self._isdir(path):
out[path] = await self._info(path)

# async for?
async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
if withdirs:
Expand All @@ -811,7 +841,7 @@ async def _expand_path(self, path, recursive=False, maxdepth=None):
path = [self._strip_protocol(p) for p in path]
for p in path: # can gather here
if has_magic(p):
bit = set(await self._glob(p))
bit = set(await self._glob(p, maxdepth=maxdepth))
out |= bit
if recursive:
# glob call above expanded one depth so if maxdepth is defined
Expand Down
47 changes: 31 additions & 16 deletions fsspec/implementations/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,31 +432,29 @@ async def _info(self, url, **kwargs):

return {"name": url, "size": None, **info, "type": "file"}

async def _glob(self, path, **kwargs):
async def _glob(self, path, maxdepth=None, **kwargs):
"""
Find files by glob-matching.

This implementation is idntical to the one in AbstractFileSystem,
but "?" is not considered as a character for globbing, because it is
so common in URLs, often identifying the "query" part.
"""
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
import re

ends = path.endswith("/")
path = self._strip_protocol(path)
indstar = path.find("*") if path.find("*") >= 0 else len(path)
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

ind = min(indstar, indbrace)
min_idx = min(idx_star, idx_brace)

detail = kwargs.pop("detail", False)

if not has_magic(path):
root = path
depth = 1
if ends:
path += "/*"
elif await self._exists(path):
if await self._exists(path):
if not detail:
return [path]
else:
Expand All @@ -466,13 +464,21 @@ async def _glob(self, path, **kwargs):
return [] # glob of non-existent returns empty
else:
return {}
elif "/" in path[:ind]:
ind2 = path[:ind].rindex("/")
root = path[: ind2 + 1]
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
elif "/" in path[:min_idx]:
min_idx = path[:min_idx].rindex("/")
root = path[: min_idx + 1]
depth = path[min_idx + 1 :].count("/") + 1
else:
root = ""
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
depth = path[min_idx + 1 :].count("/") + 1

if "**" in path:
if maxdepth is not None:
idx_double_stars = path.find("**")
depth_double_stars = path[idx_double_stars:].count("/") + 1
depth = depth - depth_double_stars + maxdepth
else:
depth = None

allpaths = await self._find(
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
Expand All @@ -499,14 +505,23 @@ async def _glob(self, path, **kwargs):
)
+ "$"
)
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
pattern = re.sub("[*]", "[^/]*", pattern)
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
pattern = re.compile(pattern)
out = {
p: allpaths[p]
for p in sorted(allpaths)
if pattern.match(p.replace("//", "/").rstrip("/"))
}

# Return directories only when the glob end by a slash
# This is needed for posix glob compliance
if ends:
out = {k: v for k, v in out.items() if v["type"] == "directory"}

if detail:
return out
else:
Expand Down
19 changes: 0 additions & 19 deletions fsspec/implementations/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ def ls(self, path, detail=False, **kwargs):
else:
return [posixpath.join(path, f) for f in os.listdir(path)]

def glob(self, path, **kwargs):
path = self._strip_protocol(path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing _strip_protocol from the flow here is likely the cause of all the windows errors, because for LocalFileSystem, it also converts windows-style paths to posix.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what caused this issue: #1322
How about using make_posix_path helper here then?

Copy link
Contributor Author

@john-jam john-jam Aug 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fixed the windows tests by using make_path_posix on the expected outputs in the tests when required: https://github.com/john-jam/filesystem_spec/actions/runs/5807521471/job/15742517288

return super().glob(path, **kwargs)

def info(self, path, **kwargs):
if isinstance(path, os.DirEntry):
# scandir DirEntry
Expand Down Expand Up @@ -287,21 +283,6 @@ def trailing_sep(path):
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))


def trailing_sep_maybe_asterisk(path):
"""Return True if the path ends with a path separator and optionally an
asterisk.

A forward slash is always considered a path separator, even on Operating
Systems that normally use a backslash.
"""
# TODO: if all incoming paths were posix-compliant then separator would
# always be a forward slash, simplifying this function.
# See https://github.com/fsspec/filesystem_spec/pull/1250
return path.endswith((os.sep, os.sep + "*")) or (
os.altsep is not None and path.endswith((os.altsep, os.altsep + "*"))
)


class LocalFileOpener(io.IOBase):
def __init__(
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
Expand Down
6 changes: 5 additions & 1 deletion fsspec/implementations/tests/local/local_fixtures.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from fsspec.implementations.local import LocalFileSystem
from fsspec.implementations.local import LocalFileSystem, make_path_posix
from fsspec.tests.abstract import AbstractFixtures


Expand All @@ -12,3 +12,7 @@ def fs(self):
@pytest.fixture
def fs_path(self, tmpdir):
return str(tmpdir)

@pytest.fixture
def fs_sanitize_path(self):
return make_path_posix
8 changes: 5 additions & 3 deletions fsspec/implementations/tests/test_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,9 @@ def test_globfind_dirs(tmpdir):
fs.glob(tmpdir + "/dir/*", detail=True)[tmpdir + "/dir/afile"]["type"] == "file"
)
assert [tmpdir + "/dir/afile"] == fs.find(tmpdir)
assert [tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(tmpdir, withdirs=True)
assert [tmpdir, tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(
tmpdir, withdirs=True
)


def test_touch(tmpdir):
Expand Down Expand Up @@ -952,12 +954,12 @@ def test_cp_get_put_empty_directory(tmpdir, funcname):
# cp/get/put without slash, target directory exists
assert fs.isdir(target)
func(empty, target)
assert fs.find(target, withdirs=True) == []
assert fs.find(target, withdirs=True) == [make_path_posix(target)]

# cp/get/put with slash, target directory exists
assert fs.isdir(target)
func(empty + "/", target)
assert fs.find(target, withdirs=True) == []
assert fs.find(target, withdirs=True) == [make_path_posix(target)]

fs.rmdir(target)

Expand Down
4 changes: 2 additions & 2 deletions fsspec/implementations/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,12 +316,12 @@ def test_cp_empty_directory(m):
# cp without slash, target directory exists
assert m.isdir(target)
m.cp(empty, target)
assert m.find(target, withdirs=True) == []
assert m.find(target, withdirs=True) == [target]

# cp with slash, target directory exists
assert m.isdir(target)
m.cp(empty + "/", target)
assert m.find(target, withdirs=True) == []
assert m.find(target, withdirs=True) == [target]

m.rmdir(target)

Expand Down
Loading
Loading