From 9b223ca1013689c4eed16709be9ecc69d51b4c4f Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 1 Nov 2024 04:18:46 +0000 Subject: [PATCH] GH-125413: pathlib: use `scandir()` to speed up `copy()` Use the new `PathBase.scandir()` method in `PathBase.copy()`, which greatly reduces the number of `PathBase.stat()` calls needed when copying. This also speeds up `Path.copy()`, which inherits the superclass implementation. Under the hood, we use directory entries to distinguish between files, directories and symlinks, and to retrieve a `stat_result` when reading metadata. This logic is extracted into a new `pathlib._abc.CopierBase` class, which helps reduce the number of underscore-prefixed support methods in the path interface. --- Lib/pathlib/_abc.py | 206 ++++++++++-------- Lib/pathlib/_local.py | 60 ++--- Lib/pathlib/_os.py | 44 ++-- Lib/test/test_pathlib/test_pathlib_abc.py | 2 +- ...-11-01-04-21-26.gh-issue-125413.Z-jjZq.rst | 2 + 5 files changed, 170 insertions(+), 144 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-01-04-21-26.gh-issue-125413.Z-jjZq.rst diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index dfff8b460d1bf1..58a43bb4c0a2f2 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -87,6 +87,110 @@ def isabs(self, path): raise UnsupportedOperation(self._unsupported_msg('isabs()')) +class CopierBase: + """Base class for path copiers, which transfer files and directories from + one path object to another. + + A reference to this class is available as PathBase._copier. When + PathBase.copy() is called, it uses the copier type of the *target* path to + perform the copy; this allows writing of data and metadata to occur + together (or in a particular order) where supported or required by the + path type. + """ + __slots__ = ('follow_symlinks', 'dirs_exist_ok', 'preserve_metadata') + + def __init__(self, follow_symlinks=True, dirs_exist_ok=False, + preserve_metadata=False): + self.follow_symlinks = follow_symlinks + self.dirs_exist_ok = dirs_exist_ok + self.preserve_metadata = preserve_metadata + + @classmethod + def ensure_different_files(cls, source, target): + """Raise OSError(EINVAL) if both paths refer to the same file.""" + try: + if not target.samefile(source): + return + except (OSError, ValueError): + return + err = OSError(EINVAL, "Source and target are the same file") + err.filename = str(source) + err.filename2 = str(target) + raise err + + @classmethod + def ensure_distinct_paths(cls, source, target): + """Raise OSError(EINVAL) if the target is within the source path.""" + # Note: there is no straightforward, foolproof algorithm to determine + # if one directory is within another (a particularly perverse example + # would be a single network share mounted in one location via NFS, and + # in another location via CIFS), so we simply checks whether the + # other path is lexically equal to, or within, this path. + if source == target: + err = OSError(EINVAL, "Source and target are the same path") + elif source in target.parents: + err = OSError(EINVAL, "Source path is a parent of target path") + else: + return + err.filename = str(source) + err.filename2 = str(target) + raise err + + def copy(self, source, target): + """Copy the given file or directory tree to the given target.""" + self.ensure_distinct_paths(source, target) + if self.preserve_metadata: + metadata_keys = source._readable_metadata & target._writable_metadata + else: + metadata_keys = frozenset() + if not self.follow_symlinks and source.is_symlink(): + self.copy_symlink(source, target, metadata_keys) + elif source.is_dir(): + self.copy_dir(source, target, metadata_keys) + else: + self.copy_file(source, target, metadata_keys) + + def copy_dir(self, source, target, metadata_keys, dir_entry=None): + """Copy the given directory to the given target.""" + metadata = source._read_metadata(metadata_keys, dir_entry=dir_entry) + with source.scandir() as entries: + target.mkdir(exist_ok=self.dirs_exist_ok) + for entry in entries: + src = source.joinpath(entry.name) + dst = target.joinpath(entry.name) + if not self.follow_symlinks and entry.is_symlink(): + self.copy_symlink(src, dst, metadata_keys, entry) + elif entry.is_dir(): + self.copy_dir(src, dst, metadata_keys, entry) + else: + self.copy_file(src, dst, metadata_keys, entry) + target._write_metadata(metadata) + + def copy_file(self, source, target, metadata_keys, dir_entry=None): + """Copy the given file to the given target.""" + self.ensure_different_files(source, target) + metadata = source._read_metadata(metadata_keys, dir_entry=dir_entry) + with source.open('rb') as source_f: + try: + with target.open('wb') as target_f: + copyfileobj(source_f, target_f) + except IsADirectoryError as e: + if not target.exists(): + # Raise a less confusing exception. + raise FileNotFoundError( + f'Directory does not exist: {target}') from e + else: + raise + target._write_metadata(metadata) + + def copy_symlink(self, source, target, metadata_keys, dir_entry=None): + """Copy the given symlink to the given target.""" + metadata = source._read_metadata( + metadata_keys, follow_symlinks=False, dir_entry=dir_entry) + target.symlink_to(source.readlink()) + target._write_metadata(metadata, follow_symlinks=False) + + class PathGlobber(_GlobberBase): """ Class providing shell-style globbing for path objects. @@ -425,6 +529,9 @@ class PathBase(PurePathBase): # Maximum number of symlinks to follow in resolve() _max_symlinks = 40 + _copier = CopierBase + _readable_metadata = frozenset() + _writable_metadata = frozenset() @classmethod def _unsupported_msg(cls, attribute): @@ -565,39 +672,6 @@ def samefile(self, other_path): return (st.st_ino == other_st.st_ino and st.st_dev == other_st.st_dev) - def _ensure_different_file(self, other_path): - """ - Raise OSError(EINVAL) if both paths refer to the same file. - """ - try: - if not self.samefile(other_path): - return - except (OSError, ValueError): - return - err = OSError(EINVAL, "Source and target are the same file") - err.filename = str(self) - err.filename2 = str(other_path) - raise err - - def _ensure_distinct_path(self, other_path): - """ - Raise OSError(EINVAL) if the other path is within this path. - """ - # Note: there is no straightforward, foolproof algorithm to determine - # if one directory is within another (a particularly perverse example - # would be a single network share mounted in one location via NFS, and - # in another location via CIFS), so we simply checks whether the - # other path is lexically equal to, or within, this path. - if self == other_path: - err = OSError(EINVAL, "Source and target are the same path") - elif self in other_path.parents: - err = OSError(EINVAL, "Source path is a parent of target path") - else: - return - err.filename = str(self) - err.filename2 = str(other_path) - raise err - def open(self, mode='r', buffering=-1, encoding=None, errors=None, newline=None): """ @@ -805,13 +879,6 @@ def symlink_to(self, target, target_is_directory=False): """ raise UnsupportedOperation(self._unsupported_msg('symlink_to()')) - def _symlink_to_target_of(self, link): - """ - Make this path a symlink with the same target as the given link. This - is used by copy(). - """ - self.symlink_to(link.readlink()) - def hardlink_to(self, target): """ Make this path a hard link pointing to the same file as *target*. @@ -832,48 +899,22 @@ def mkdir(self, mode=0o777, parents=False, exist_ok=False): """ raise UnsupportedOperation(self._unsupported_msg('mkdir()')) - # Metadata keys supported by this path type. - _readable_metadata = _writable_metadata = frozenset() - - def _read_metadata(self, keys=None, *, follow_symlinks=True): + def _read_metadata(self, metadata_keys, *, follow_symlinks=True, dir_entry=None): """ Returns path metadata as a dict with string keys. """ + if not metadata_keys: + return {} raise UnsupportedOperation(self._unsupported_msg('_read_metadata()')) def _write_metadata(self, metadata, *, follow_symlinks=True): """ Sets path metadata from the given dict with string keys. """ + if not metadata: + return raise UnsupportedOperation(self._unsupported_msg('_write_metadata()')) - def _copy_metadata(self, target, *, follow_symlinks=True): - """ - Copies metadata (permissions, timestamps, etc) from this path to target. - """ - # Metadata types supported by both source and target. - keys = self._readable_metadata & target._writable_metadata - if keys: - metadata = self._read_metadata(keys, follow_symlinks=follow_symlinks) - target._write_metadata(metadata, follow_symlinks=follow_symlinks) - - def _copy_file(self, target): - """ - Copy the contents of this file to the given target. - """ - self._ensure_different_file(target) - with self.open('rb') as source_f: - try: - with target.open('wb') as target_f: - copyfileobj(source_f, target_f) - except IsADirectoryError as e: - if not target.exists(): - # Raise a less confusing exception. - raise FileNotFoundError( - f'Directory does not exist: {target}') from e - else: - raise - def copy(self, target, *, follow_symlinks=True, dirs_exist_ok=False, preserve_metadata=False): """ @@ -881,25 +922,8 @@ def copy(self, target, *, follow_symlinks=True, dirs_exist_ok=False, """ if not isinstance(target, PathBase): target = self.with_segments(target) - self._ensure_distinct_path(target) - stack = [(self, target)] - while stack: - src, dst = stack.pop() - if not follow_symlinks and src.is_symlink(): - dst._symlink_to_target_of(src) - if preserve_metadata: - src._copy_metadata(dst, follow_symlinks=False) - elif src.is_dir(): - children = src.iterdir() - dst.mkdir(exist_ok=dirs_exist_ok) - stack.extend((child, dst.joinpath(child.name)) - for child in children) - if preserve_metadata: - src._copy_metadata(dst) - else: - src._copy_file(dst) - if preserve_metadata: - src._copy_metadata(dst) + copier = target._copier(follow_symlinks, dirs_exist_ok, preserve_metadata) + copier.copy(self, target) return target def copy_into(self, target_dir, *, follow_symlinks=True, @@ -946,7 +970,7 @@ def move(self, target): """ Recursively move this file or directory tree to the given destination. """ - self._ensure_different_file(target) + target._copier.ensure_different_files(self, target) try: return self.replace(target) except UnsupportedOperation: diff --git a/Lib/pathlib/_local.py b/Lib/pathlib/_local.py index ef072b83d96904..423a2cc6b41cb6 100644 --- a/Lib/pathlib/_local.py +++ b/Lib/pathlib/_local.py @@ -19,7 +19,7 @@ from pathlib._os import (copyfile, file_metadata_keys, read_file_metadata, write_file_metadata) -from pathlib._abc import UnsupportedOperation, PurePathBase, PathBase +from pathlib._abc import UnsupportedOperation, PurePathBase, PathBase, CopierBase __all__ = [ @@ -57,6 +57,33 @@ def __repr__(self): return "<{}.parents>".format(type(self._path).__name__) +class _Copier(CopierBase): + """Copier class that uses fast OS copy routine where possible, and ensures + symlinks' target_is_directory argument is properly set on Windows. + """ + __slots__ = () + + if copyfile: + def copy_file(self, source, target, metadata_keys, dir_entry=None): + """Copy the given file to the given target.""" + try: + source = os.fspath(source) + except TypeError: + if not isinstance(source, PathBase): + raise + CopierBase.copy_file(self, source, target, metadata_keys, dir_entry) + else: + copyfile(source, os.fspath(target)) + + if os.name == 'nt': + def copy_symlink(self, source, target, metadata_keys, dir_entry=None): + """Copy the given symlink to the given target.""" + metadata = source._read_metadata( + metadata_keys, follow_symlinks=False, dir_entry=dir_entry) + target.symlink_to(source.readlink(), (dir_entry or source).is_dir()) + target._write_metadata(metadata, follow_symlinks=False) + + class PurePath(PurePathBase): """Base class for manipulating paths without I/O. @@ -512,6 +539,11 @@ class Path(PathBase, PurePath): but cannot instantiate a WindowsPath on a POSIX system or vice versa. """ __slots__ = () + _copier = _Copier + _readable_metadata = file_metadata_keys + _writable_metadata = file_metadata_keys + _read_metadata = read_file_metadata + _write_metadata = write_file_metadata as_uri = PurePath.as_uri @classmethod @@ -789,24 +821,6 @@ def mkdir(self, mode=0o777, parents=False, exist_ok=False): if not exist_ok or not self.is_dir(): raise - _readable_metadata = _writable_metadata = file_metadata_keys - _read_metadata = read_file_metadata - _write_metadata = write_file_metadata - - if copyfile: - def _copy_file(self, target): - """ - Copy the contents of this file to the given target. - """ - try: - target = os.fspath(target) - except TypeError: - if not isinstance(target, PathBase): - raise - PathBase._copy_file(self, target) - else: - copyfile(os.fspath(self), target) - def chmod(self, mode, *, follow_symlinks=True): """ Change the permissions of the path, like os.chmod(). @@ -869,14 +883,6 @@ def symlink_to(self, target, target_is_directory=False): """ os.symlink(target, self, target_is_directory) - if os.name == 'nt': - def _symlink_to_target_of(self, link): - """ - Make this path a symlink with the same target as the given link. - This is used by copy(). - """ - self.symlink_to(link.readlink(), link.is_dir()) - if hasattr(os, "link"): def hardlink_to(self, target): """ diff --git a/Lib/pathlib/_os.py b/Lib/pathlib/_os.py index 642b3a57c59a1d..bf03d574780871 100644 --- a/Lib/pathlib/_os.py +++ b/Lib/pathlib/_os.py @@ -174,40 +174,34 @@ def copyfileobj(source_f, target_f): file_metadata_keys = frozenset(file_metadata_keys) -def read_file_metadata(path, keys=None, *, follow_symlinks=True): +def read_file_metadata(path, metadata_keys, *, follow_symlinks=True, dir_entry=None): """ Returns local path metadata as a dict with string keys. """ - if keys is None: - keys = file_metadata_keys - assert keys.issubset(file_metadata_keys) - result = {} - for key in keys: - if key == 'xattrs': - try: - result['xattrs'] = [ - (attr, os.getxattr(path, attr, follow_symlinks=follow_symlinks)) - for attr in os.listxattr(path, follow_symlinks=follow_symlinks)] - except OSError as err: - if err.errno not in (EPERM, ENOTSUP, ENODATA, EINVAL, EACCES): - raise - continue - st = os.stat(path, follow_symlinks=follow_symlinks) - if key == 'mode': - result['mode'] = stat.S_IMODE(st.st_mode) - elif key == 'times_ns': - result['times_ns'] = st.st_atime_ns, st.st_mtime_ns - elif key == 'flags': - result['flags'] = st.st_flags - return result + metadata = {} + if 'mode' in metadata_keys or 'times_ns' in metadata_keys or 'flags' in metadata_keys: + st = (dir_entry or path).stat(follow_symlinks=follow_symlinks) + if 'mode' in metadata_keys: + metadata['mode'] = stat.S_IMODE(st.st_mode) + if 'times_ns' in metadata_keys: + metadata['times_ns'] = st.st_atime_ns, st.st_mtime_ns + if 'flags' in metadata_keys: + metadata['flags'] = st.st_flags + if 'xattrs' in metadata_keys: + try: + metadata['xattrs'] = [ + (attr, os.getxattr(path, attr, follow_symlinks=follow_symlinks)) + for attr in os.listxattr(path, follow_symlinks=follow_symlinks)] + except OSError as err: + if err.errno not in (EPERM, ENOTSUP, ENODATA, EINVAL, EACCES): + raise + return metadata def write_file_metadata(path, metadata, *, follow_symlinks=True): """ Sets local path metadata from the given dict with string keys. """ - assert frozenset(metadata.keys()).issubset(file_metadata_keys) - def _nop(*args, ns=None, follow_symlinks=None): pass diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 11e34f5d378a58..1f1d375a2166d4 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1949,7 +1949,7 @@ def ordered_walk(path): if self.can_symlink: # Add some symlinks source.joinpath('linkC').symlink_to('fileC') - source.joinpath('linkD').symlink_to('dirD') + source.joinpath('linkD').symlink_to('dirD', target_is_directory=True) # Perform the copy target = base / 'copyC' diff --git a/Misc/NEWS.d/next/Library/2024-11-01-04-21-26.gh-issue-125413.Z-jjZq.rst b/Misc/NEWS.d/next/Library/2024-11-01-04-21-26.gh-issue-125413.Z-jjZq.rst new file mode 100644 index 00000000000000..854780cccba0b2 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-01-04-21-26.gh-issue-125413.Z-jjZq.rst @@ -0,0 +1,2 @@ +Speed up :meth:`pathlib.Path.copy` by making use of +:meth:`~pathlib.Path.scandir` internally.