Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-implement glob logic #202

Merged
merged 6 commits into from
Feb 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: [3.6, 3.7, 3.8, 3.9]
python-version: [3.7, 3.8, 3.9, "3.10"]

steps:
- uses: actions/checkout@v2
Expand Down
5 changes: 5 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# cloudpathlib Changelog

## v0.7.0 (Unreleased)

- Re-implemented `glob` and `rglob` functions to use pathlib's globbing logic rather than fnmatch; update `iterdir` to not include self. ([Issue #154](https://github.com/drivendataorg/cloudpathlib/issues/154), [Issue #15](https://github.com/drivendataorg/cloudpathlib/issues/15))
- Drop support for Python 3.6 [Issue #186](https://github.com/drivendataorg/cloudpathlib/issues/186)

## v0.6.5 (2022-01-25)

- Fixed error when "directories" created on AWS S3 were reported as files. ([Issue #148](https://github.com/drivendataorg/cloudpathlib/issues/148), [PR #190](https://github.com/drivendataorg/cloudpathlib/pull/190))
Expand Down
17 changes: 11 additions & 6 deletions cloudpathlib/azure/azblobclient.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from datetime import datetime
import os
from pathlib import Path, PurePosixPath
from typing import Any, Dict, Iterable, Optional, Union
from typing import Any, Dict, Iterable, Optional, Tuple, Union


from ..client import Client, register_client_class
Expand Down Expand Up @@ -140,7 +140,7 @@ def _exists(self, cloud_path: AzureBlobPath) -> bool:

def _list_dir(
self, cloud_path: AzureBlobPath, recursive: bool = False
) -> Iterable[AzureBlobPath]:
) -> Iterable[Tuple[AzureBlobPath, bool]]:
container_client = self.service_client.get_container_client(cloud_path.container)

prefix = cloud_path.blob
Expand All @@ -155,21 +155,24 @@ def _list_dir(
# get directory from this path
for parent in PurePosixPath(o.name[len(prefix) :]).parents:

# if we haven't surfaced thei directory already
# if we haven't surfaced this directory already
if parent not in yielded_dirs and str(parent) != ".":

# skip if not recursive and this is beyond our depth
if not recursive and "/" in str(parent):
continue

yield self.CloudPath(f"az://{cloud_path.container}/{prefix}{parent}")
yield (
self.CloudPath(f"az://{cloud_path.container}/{prefix}{parent}"),
True, # is a directory
)
yielded_dirs.add(parent)

# skip file if not recursive and this is beyond our depth
if not recursive and "/" in o.name[len(prefix) :]:
continue

yield self.CloudPath(f"az://{cloud_path.container}/{o.name}")
yield (self.CloudPath(f"az://{cloud_path.container}/{o.name}"), False) # is a file

def _move_file(
self, src: AzureBlobPath, dst: AzureBlobPath, remove_src: bool = True
Expand Down Expand Up @@ -198,7 +201,9 @@ def _move_file(

def _remove(self, cloud_path: AzureBlobPath) -> None:
if self._is_file_or_dir(cloud_path) == "dir":
blobs = [b.blob for b in self._list_dir(cloud_path, recursive=True) if b.is_file()]
blobs = [
b.blob for b, is_dir in self._list_dir(cloud_path, recursive=True) if not is_dir
]
container_client = self.service_client.get_container_client(cloud_path.container)
container_client.delete_blobs(*blobs)
elif self._is_file_or_dir(cloud_path) == "file":
Expand Down
9 changes: 7 additions & 2 deletions cloudpathlib/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Generic, Callable, Iterable, Optional, TypeVar, Union
from typing import Generic, Callable, Iterable, Optional, Tuple, TypeVar, Union

from .cloudpath import CloudImplementation, CloudPath, implementation_registry

Expand Down Expand Up @@ -70,7 +70,7 @@ def _exists(self, cloud_path: BoundedCloudPath) -> bool:
@abc.abstractmethod
def _list_dir(
self, cloud_path: BoundedCloudPath, recursive: bool
) -> Iterable[BoundedCloudPath]:
) -> Iterable[Tuple[BoundedCloudPath, bool]]:
"""List all the files and folders in a directory.

Parameters
Expand All @@ -79,6 +79,11 @@ def _list_dir(
The folder to start from.
recursive : bool
Whether or not to list recursively.

Returns
-------
contents : Iterable[Tuple]
Of the form [(CloudPath, is_dir), ...] for every child of the dir.
"""
pass

Expand Down
147 changes: 124 additions & 23 deletions cloudpathlib/cloudpath.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import abc
from collections import defaultdict
import collections.abc
import fnmatch
from contextlib import contextmanager
import os
from pathlib import Path, PosixPath, PurePosixPath, WindowsPath
from typing import Any, IO, Iterable, Optional, TYPE_CHECKING, Union
from pathlib import Path, PosixPath, PurePosixPath, WindowsPath, _make_selector, _posix_flavour # type: ignore
from typing import Any, IO, Iterable, Dict, Optional, TYPE_CHECKING, Union
from urllib.parse import urlparse
from warnings import warn

Expand All @@ -15,6 +15,7 @@
CloudPathFileExistsError,
CloudPathIsADirectoryError,
CloudPathNotADirectoryError,
CloudPathNotImplementedError,
DirectoryNotEmptyError,
IncompleteImplementationError,
InvalidPrefixError,
Expand Down Expand Up @@ -305,28 +306,49 @@ def exists(self) -> bool:
def fspath(self) -> str:
return self.__fspath__()

def glob(self, pattern: str) -> Iterable["CloudPath"]:
# strip cloud prefix from pattern if it is included
if pattern.startswith(self.cloud_prefix):
pattern = pattern[len(self.cloud_prefix) :]
def _glob_checks(self, pattern):
if ".." in pattern:
raise CloudPathNotImplementedError(
"Relative paths with '..' not supported in glob patterns."
)

if pattern.startswith(self.cloud_prefix) or pattern.startswith("/"):
raise CloudPathNotImplementedError("Non-relative patterns are unsupported")

def _glob(self, selector):
root = _CloudPathSelectable(
PurePosixPath(self._no_prefix_no_drive),
{
PurePosixPath(c._no_prefix_no_drive): is_dir
for c, is_dir in self.client._list_dir(self, recursive=True)
},
is_dir=True,
exists=True,
)

# strip "drive" from pattern if it is included
if pattern.startswith(self.drive + "/"):
pattern = pattern[len(self.drive + "/") :]
for p in selector.select_from(root):
yield self.client.CloudPath(f"{self.cloud_prefix}{self.drive}{p}")

# identify if pattern is recursive or not
recursive = False
if pattern.startswith("**/"):
pattern = pattern.split("/", 1)[-1]
recursive = True
def glob(self, pattern):
self._glob_checks(pattern)

for f in self.client._list_dir(self, recursive=recursive):
if fnmatch.fnmatch(f._no_prefix_no_drive, pattern):
yield f
pattern_parts = PurePosixPath(pattern).parts
selector = _make_selector(tuple(pattern_parts), _posix_flavour)

yield from self._glob(selector)

def rglob(self, pattern):
self._glob_checks(pattern)

pattern_parts = PurePosixPath(pattern).parts
selector = _make_selector(("**",) + tuple(pattern_parts), _posix_flavour)

yield from self._glob(selector)

def iterdir(self) -> Iterable["CloudPath"]:
for f in self.client._list_dir(self, recursive=False):
yield f
for f, _ in self.client._list_dir(self, recursive=False):
if f != self: # iterdir does not include itself in pathlib
yield f

def open(
self,
Expand Down Expand Up @@ -411,9 +433,6 @@ def rename(self, target: "CloudPath") -> "CloudPath":
# we actually move files
return self.replace(target)

def rglob(self, pattern: str) -> Iterable["CloudPath"]:
return self.glob("**/" + pattern)

def rmdir(self):
if self.is_file():
raise CloudPathNotADirectoryError(
Expand Down Expand Up @@ -866,3 +885,85 @@ def _resolve(path: PurePosixPath) -> str:
newpath = newpath + sep + name

return newpath or sep


# These objects are used to wrap CloudPaths in a context where we can use
# the python pathlib implementations for `glob` and `rglob`, which depend
# on the Selector created by the `_make_selector` method being passed
# an object like the below when `select_from` is called. We implement these methods
# in a simple wrapper to use the same glob recursion and pattern logic without
# rolling our own.
#
# Designed to be compatible when used by these selector implementations from pathlib:
# https://github.com/python/cpython/blob/3.10/Lib/pathlib.py#L385-L500
class _CloudPathSelectableAccessor:
def __init__(self, scandir_func):
self.scandir = scandir_func


class _CloudPathSelectable:
def __init__(
self,
relative_cloud_path: PurePosixPath,
children: Dict[PurePosixPath, bool],
is_dir: bool,
exists: bool,
):
self._path = relative_cloud_path
self._all_children = children

self._accessor = _CloudPathSelectableAccessor(self.scandir)

self._is_dir = is_dir
self._exists = exists

def __repr__(self):
return str(self._path)

def is_dir(self):
return self._is_dir

def exists(self):
return self._exists

def is_symlink(self):
return False

@property
def name(self):
return self._path.name

@staticmethod
@contextmanager
def scandir(root):
yield (
root._make_child_relpath(c.name)
for c, _ in root._all_children.items()
if c.parent == root._path
)

def _filter_children(self, rel_to):
return {
c: is_dir
for c, is_dir in self._all_children.items()
if self._is_relative_to(c, rel_to)
}

@staticmethod
def _is_relative_to(maybe_child, maybe_parent):
try:
maybe_child.relative_to(maybe_parent)
return True
except ValueError:
return False

def _make_child_relpath(self, part):
child = self._path / part
filtered_children = self._filter_children(child)

return _CloudPathSelectable(
child,
filtered_children,
is_dir=self._all_children.get(child, False),
exists=child in self._all_children,
)
4 changes: 4 additions & 0 deletions cloudpathlib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class CloudPathNotADirectoryError(CloudPathException, NotADirectoryError):
pass


class CloudPathNotImplementedError(CloudPathException, NotImplementedError):
pass


class DirectoryNotEmptyError(CloudPathException):
pass

Expand Down
15 changes: 10 additions & 5 deletions cloudpathlib/gs/gsclient.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from datetime import datetime
import os
from pathlib import Path, PurePosixPath
from typing import Any, Dict, Iterable, Optional, TYPE_CHECKING, Union
from typing import Any, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union

from ..client import Client, register_client_class
from ..cloudpath import implementation_registry
Expand Down Expand Up @@ -132,7 +132,7 @@ def _is_file_or_dir(self, cloud_path: GSPath) -> Optional[str]:
def _exists(self, cloud_path: GSPath) -> bool:
return self._is_file_or_dir(cloud_path) in ["file", "dir"]

def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[GSPath]:
def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPath, bool]]:
bucket = self.client.bucket(cloud_path.bucket)

prefix = cloud_path.blob
Expand All @@ -154,14 +154,17 @@ def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[GSPath]:
if not recursive and "/" in str(parent):
continue

yield self.CloudPath(f"gs://{cloud_path.bucket}/{prefix}{parent}")
yield (
self.CloudPath(f"gs://{cloud_path.bucket}/{prefix}{parent}"),
True, # is a directory
)
yielded_dirs.add(parent)

# skip file if not recursive and this is beyond our depth
if not recursive and "/" in o.name[len(prefix) :]:
continue

yield self.CloudPath(f"gs://{cloud_path.bucket}/{o.name}")
yield (self.CloudPath(f"gs://{cloud_path.bucket}/{o.name}"), False) # is a file

def _move_file(self, src: GSPath, dst: GSPath, remove_src: bool = True) -> GSPath:
# just a touch, so "REPLACE" metadata
Expand Down Expand Up @@ -190,7 +193,9 @@ def _move_file(self, src: GSPath, dst: GSPath, remove_src: bool = True) -> GSPat

def _remove(self, cloud_path: GSPath) -> None:
if self._is_file_or_dir(cloud_path) == "dir":
blobs = [b.blob for b in self._list_dir(cloud_path, recursive=True) if b.is_file()]
blobs = [
b.blob for b, is_dir in self._list_dir(cloud_path, recursive=True) if not is_dir
]
bucket = self.client.bucket(cloud_path.bucket)
for blob in blobs:
bucket.get_blob(blob).delete()
Expand Down
10 changes: 6 additions & 4 deletions cloudpathlib/local/localclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path, PurePosixPath
import shutil
from tempfile import TemporaryDirectory
from typing import Iterable, List, Optional, Union
from typing import Iterable, List, Optional, Tuple, Union

from ..client import Client
from .localpath import LocalPath
Expand Down Expand Up @@ -67,14 +67,16 @@ def _is_dir(self, cloud_path: "LocalPath") -> bool:
def _is_file(self, cloud_path: "LocalPath") -> bool:
return self._cloud_path_to_local(cloud_path).is_file()

def _list_dir(self, cloud_path: "LocalPath", recursive=False) -> Iterable["LocalPath"]:
def _list_dir(
self, cloud_path: "LocalPath", recursive=False
) -> Iterable[Tuple["LocalPath", bool]]:
if recursive:
return (
self._local_to_cloud_path(obj)
(self._local_to_cloud_path(obj), obj.is_dir())
for obj in self._cloud_path_to_local(cloud_path).glob("**/*")
)
return (
self._local_to_cloud_path(obj)
(self._local_to_cloud_path(obj), obj.is_dir())
for obj in self._cloud_path_to_local(cloud_path).iterdir()
)

Expand Down
Loading