Skip to content

Commit

Permalink
GH-125413: pathlib: use scandir() to speed up copy()
Browse files Browse the repository at this point in the history
Use the new `PathBase.scandir()` method in `PathBase.copy()`, which greatly
reduces the number of `PathBase.stat()` calls needed when copying. This
also speeds up `Path.copy()`, which inherits the superclass implementation.

Under the hood, we use directory entries to distinguish between files,
directories and symlinks, and to retrieve a `stat_result` when reading
metadata. This logic is extracted into a new `pathlib._abc.CopierBase`
class, which helps reduce the number of underscore-prefixed support
methods in the path interface.
  • Loading branch information
barneygale committed Nov 1, 2024
1 parent 260843d commit 9b223ca
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 144 deletions.
206 changes: 115 additions & 91 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,110 @@ def isabs(self, path):
raise UnsupportedOperation(self._unsupported_msg('isabs()'))


class CopierBase:
    """Base class for path copiers, which transfer files and directories from
    one path object to another.

    A reference to this class is available as PathBase._copier. When
    PathBase.copy() is called, it uses the copier type of the *target* path to
    perform the copy; this allows writing of data and metadata to occur
    together (or in a particular order) where supported or required by the
    path type.

    A copier instance carries three options:

    follow_symlinks
        When false, symlinks are re-created at the target (see
        copy_symlink()) instead of being copied as files or directories.
    dirs_exist_ok
        Passed as *exist_ok* to target.mkdir() when copying a directory.
    preserve_metadata
        When true, the metadata keys that are both readable on the source
        and writable on the target are copied alongside the data.
    """
    __slots__ = ('follow_symlinks', 'dirs_exist_ok', 'preserve_metadata')

    def __init__(self, follow_symlinks=True, dirs_exist_ok=False,
                 preserve_metadata=False):
        self.follow_symlinks = follow_symlinks
        self.dirs_exist_ok = dirs_exist_ok
        self.preserve_metadata = preserve_metadata

    @classmethod
    def ensure_different_files(cls, source, target):
        """Raise OSError(EINVAL) if both paths refer to the same file.

        The check is best-effort: if target.samefile() itself raises OSError
        or ValueError, the paths are treated as different and nothing is
        raised.
        """
        try:
            if not target.samefile(source):
                return
        except (OSError, ValueError):
            return
        err = OSError(EINVAL, "Source and target are the same file")
        err.filename = str(source)
        err.filename2 = str(target)
        raise err

    @classmethod
    def ensure_distinct_paths(cls, source, target):
        """Raise OSError(EINVAL) if the target is within the source path."""
        # Note: there is no straightforward, foolproof algorithm to determine
        # if one directory is within another (a particularly perverse example
        # would be a single network share mounted in one location via NFS, and
        # in another location via CIFS), so we simply check whether the
        # target path is lexically equal to, or within, the source path.
        if source == target:
            err = OSError(EINVAL, "Source and target are the same path")
        elif source in target.parents:
            err = OSError(EINVAL, "Source path is a parent of target path")
        else:
            return
        err.filename = str(source)
        err.filename2 = str(target)
        raise err

    def copy(self, source, target):
        """Copy the given file or directory tree to the given target.

        Raises OSError(EINVAL) when *target* is lexically equal to, or
        located inside, *source*.
        """
        self.ensure_distinct_paths(source, target)
        if self.preserve_metadata:
            # Only keys that the source can read *and* the target can write.
            metadata_keys = source._readable_metadata & target._writable_metadata
        else:
            metadata_keys = frozenset()
        if not self.follow_symlinks and source.is_symlink():
            self.copy_symlink(source, target, metadata_keys)
        elif source.is_dir():
            self.copy_dir(source, target, metadata_keys)
        else:
            self.copy_file(source, target, metadata_keys)

    def copy_dir(self, source, target, metadata_keys, dir_entry=None):
        """Copy the given directory to the given target.

        *dir_entry*, when given, is the directory entry that *source* was
        found through; it is forwarded to source._read_metadata().
        """
        metadata = source._read_metadata(metadata_keys, dir_entry=dir_entry)
        with source.scandir() as entries:
            # Take a snapshot of the directory and release the scandir()
            # handle before descending. Otherwise one open directory handle
            # is held per nesting level for the duration of the recursive
            # copy, and iteration would be interleaved with our own writes.
            # Scanning first also means an unreadable source fails before
            # the target directory is created.
            children = list(entries)
        target.mkdir(exist_ok=self.dirs_exist_ok)
        for entry in children:
            src = source.joinpath(entry.name)
            dst = target.joinpath(entry.name)
            if not self.follow_symlinks and entry.is_symlink():
                self.copy_symlink(src, dst, metadata_keys, entry)
            elif entry.is_dir():
                self.copy_dir(src, dst, metadata_keys, entry)
            else:
                self.copy_file(src, dst, metadata_keys, entry)
        # Written last, after all children have been copied.
        target._write_metadata(metadata)

    def copy_file(self, source, target, metadata_keys, dir_entry=None):
        """Copy the given file to the given target.

        Raises OSError(EINVAL) if both paths refer to the same file, and
        FileNotFoundError if opening the target fails with
        IsADirectoryError while the target does not exist.
        """
        self.ensure_different_files(source, target)
        metadata = source._read_metadata(metadata_keys, dir_entry=dir_entry)
        with source.open('rb') as source_f:
            try:
                with target.open('wb') as target_f:
                    copyfileobj(source_f, target_f)
            except IsADirectoryError as e:
                if not target.exists():
                    # Raise a less confusing exception.
                    raise FileNotFoundError(
                        f'Directory does not exist: {target}') from e
                else:
                    raise
        target._write_metadata(metadata)

    def copy_symlink(self, source, target, metadata_keys, dir_entry=None):
        """Copy the given symlink to the given target.

        The new link points at whatever source.readlink() returns; metadata
        is read from and written to the links themselves, not their targets.
        """
        metadata = source._read_metadata(
            metadata_keys, follow_symlinks=False, dir_entry=dir_entry)
        target.symlink_to(source.readlink())
        target._write_metadata(metadata, follow_symlinks=False)


class PathGlobber(_GlobberBase):
"""
Class providing shell-style globbing for path objects.
Expand Down Expand Up @@ -425,6 +529,9 @@ class PathBase(PurePathBase):

# Maximum number of symlinks to follow in resolve()
_max_symlinks = 40
_copier = CopierBase
_readable_metadata = frozenset()
_writable_metadata = frozenset()

@classmethod
def _unsupported_msg(cls, attribute):
Expand Down Expand Up @@ -565,39 +672,6 @@ def samefile(self, other_path):
return (st.st_ino == other_st.st_ino and
st.st_dev == other_st.st_dev)

def _ensure_different_file(self, other_path):
"""
Raise OSError(EINVAL) if both paths refer to the same file.
"""
try:
if not self.samefile(other_path):
return
except (OSError, ValueError):
return
err = OSError(EINVAL, "Source and target are the same file")
err.filename = str(self)
err.filename2 = str(other_path)
raise err

def _ensure_distinct_path(self, other_path):
"""
Raise OSError(EINVAL) if the other path is within this path.
"""
# Note: there is no straightforward, foolproof algorithm to determine
# if one directory is within another (a particularly perverse example
# would be a single network share mounted in one location via NFS, and
# in another location via CIFS), so we simply checks whether the
# other path is lexically equal to, or within, this path.
if self == other_path:
err = OSError(EINVAL, "Source and target are the same path")
elif self in other_path.parents:
err = OSError(EINVAL, "Source path is a parent of target path")
else:
return
err.filename = str(self)
err.filename2 = str(other_path)
raise err

def open(self, mode='r', buffering=-1, encoding=None,
errors=None, newline=None):
"""
Expand Down Expand Up @@ -805,13 +879,6 @@ def symlink_to(self, target, target_is_directory=False):
"""
raise UnsupportedOperation(self._unsupported_msg('symlink_to()'))

def _symlink_to_target_of(self, link):
"""
Make this path a symlink with the same target as the given link. This
is used by copy().
"""
self.symlink_to(link.readlink())

def hardlink_to(self, target):
"""
Make this path a hard link pointing to the same file as *target*.
Expand All @@ -832,74 +899,31 @@ def mkdir(self, mode=0o777, parents=False, exist_ok=False):
"""
raise UnsupportedOperation(self._unsupported_msg('mkdir()'))

# Metadata keys supported by this path type.
_readable_metadata = _writable_metadata = frozenset()

def _read_metadata(self, keys=None, *, follow_symlinks=True):
def _read_metadata(self, metadata_keys, *, follow_symlinks=True, dir_entry=None):
"""
Returns path metadata as a dict with string keys.
"""
if not metadata_keys:
return {}
raise UnsupportedOperation(self._unsupported_msg('_read_metadata()'))

def _write_metadata(self, metadata, *, follow_symlinks=True):
"""
Sets path metadata from the given dict with string keys.
"""
if not metadata:
return
raise UnsupportedOperation(self._unsupported_msg('_write_metadata()'))

def _copy_metadata(self, target, *, follow_symlinks=True):
"""
Copies metadata (permissions, timestamps, etc) from this path to target.
"""
# Metadata types supported by both source and target.
keys = self._readable_metadata & target._writable_metadata
if keys:
metadata = self._read_metadata(keys, follow_symlinks=follow_symlinks)
target._write_metadata(metadata, follow_symlinks=follow_symlinks)

def _copy_file(self, target):
"""
Copy the contents of this file to the given target.
"""
self._ensure_different_file(target)
with self.open('rb') as source_f:
try:
with target.open('wb') as target_f:
copyfileobj(source_f, target_f)
except IsADirectoryError as e:
if not target.exists():
# Raise a less confusing exception.
raise FileNotFoundError(
f'Directory does not exist: {target}') from e
else:
raise

def copy(self, target, *, follow_symlinks=True, dirs_exist_ok=False,
preserve_metadata=False):
"""
Recursively copy this file or directory tree to the given destination.
"""
if not isinstance(target, PathBase):
target = self.with_segments(target)
self._ensure_distinct_path(target)
stack = [(self, target)]
while stack:
src, dst = stack.pop()
if not follow_symlinks and src.is_symlink():
dst._symlink_to_target_of(src)
if preserve_metadata:
src._copy_metadata(dst, follow_symlinks=False)
elif src.is_dir():
children = src.iterdir()
dst.mkdir(exist_ok=dirs_exist_ok)
stack.extend((child, dst.joinpath(child.name))
for child in children)
if preserve_metadata:
src._copy_metadata(dst)
else:
src._copy_file(dst)
if preserve_metadata:
src._copy_metadata(dst)
copier = target._copier(follow_symlinks, dirs_exist_ok, preserve_metadata)
copier.copy(self, target)
return target

def copy_into(self, target_dir, *, follow_symlinks=True,
Expand Down Expand Up @@ -946,7 +970,7 @@ def move(self, target):
"""
Recursively move this file or directory tree to the given destination.
"""
self._ensure_different_file(target)
target._copier.ensure_different_files(self, target)
try:
return self.replace(target)
except UnsupportedOperation:
Expand Down
60 changes: 33 additions & 27 deletions Lib/pathlib/_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from pathlib._os import (copyfile, file_metadata_keys, read_file_metadata,
write_file_metadata)
from pathlib._abc import UnsupportedOperation, PurePathBase, PathBase
from pathlib._abc import UnsupportedOperation, PurePathBase, PathBase, CopierBase


__all__ = [
Expand Down Expand Up @@ -57,6 +57,33 @@ def __repr__(self):
return "<{}.parents>".format(type(self._path).__name__)


class _Copier(CopierBase):
    """Copier class that uses fast OS copy routine where possible, and ensures
    symlinks' target_is_directory argument is properly set on Windows.
    """
    __slots__ = ()

    if copyfile:
        def copy_file(self, source, target, metadata_keys, dir_entry=None):
            """Copy the given file to the given target.

            When *source* can be converted with os.fspath(), the contents are
            transferred with the OS-level copyfile() routine; otherwise the
            generic CopierBase.copy_file() implementation is used. In both
            cases the metadata named by *metadata_keys* is carried over.
            """
            try:
                source_path = os.fspath(source)
            except TypeError:
                # Not a local path: only PathBase objects can be handled by
                # the generic open()/copyfileobj() implementation.
                if not isinstance(source, PathBase):
                    raise
                CopierBase.copy_file(self, source, target, metadata_keys, dir_entry)
            else:
                # copyfile() is only asked to copy the contents here, so
                # metadata must be transferred explicitly, as the generic
                # implementation does. Read it before copying and write it
                # afterwards. Nothing extra happens when no keys are wanted.
                if metadata_keys:
                    metadata = source._read_metadata(
                        metadata_keys, dir_entry=dir_entry)
                else:
                    metadata = None
                copyfile(source_path, os.fspath(target))
                if metadata:
                    target._write_metadata(metadata)

    if os.name == 'nt':
        def copy_symlink(self, source, target, metadata_keys, dir_entry=None):
            """Copy the given symlink to the given target.

            Passes target_is_directory to symlink_to(), taken from
            *dir_entry* when available and from *source* otherwise.
            """
            metadata = source._read_metadata(
                metadata_keys, follow_symlinks=False, dir_entry=dir_entry)
            target.symlink_to(source.readlink(), (dir_entry or source).is_dir())
            target._write_metadata(metadata, follow_symlinks=False)


class PurePath(PurePathBase):
"""Base class for manipulating paths without I/O.
Expand Down Expand Up @@ -512,6 +539,11 @@ class Path(PathBase, PurePath):
but cannot instantiate a WindowsPath on a POSIX system or vice versa.
"""
__slots__ = ()
_copier = _Copier
_readable_metadata = file_metadata_keys
_writable_metadata = file_metadata_keys
_read_metadata = read_file_metadata
_write_metadata = write_file_metadata
as_uri = PurePath.as_uri

@classmethod
Expand Down Expand Up @@ -789,24 +821,6 @@ def mkdir(self, mode=0o777, parents=False, exist_ok=False):
if not exist_ok or not self.is_dir():
raise

_readable_metadata = _writable_metadata = file_metadata_keys
_read_metadata = read_file_metadata
_write_metadata = write_file_metadata

if copyfile:
def _copy_file(self, target):
"""
Copy the contents of this file to the given target.
"""
try:
target = os.fspath(target)
except TypeError:
if not isinstance(target, PathBase):
raise
PathBase._copy_file(self, target)
else:
copyfile(os.fspath(self), target)

def chmod(self, mode, *, follow_symlinks=True):
"""
Change the permissions of the path, like os.chmod().
Expand Down Expand Up @@ -869,14 +883,6 @@ def symlink_to(self, target, target_is_directory=False):
"""
os.symlink(target, self, target_is_directory)

if os.name == 'nt':
def _symlink_to_target_of(self, link):
"""
Make this path a symlink with the same target as the given link.
This is used by copy().
"""
self.symlink_to(link.readlink(), link.is_dir())

if hasattr(os, "link"):
def hardlink_to(self, target):
"""
Expand Down
Loading

0 comments on commit 9b223ca

Please sign in to comment.