From 41397cee70f4933b539ab1a9232f04419ac8e2bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johan=20F=C3=B6rberg?=
Date: Tue, 2 Jul 2024 15:35:20 +0200
Subject: [PATCH] gh-121267: Improve performance of tarfile (#121267)

Tarfile in the default write mode spends much of its time resolving UIDs
into usernames and GIDs into group names. By caching these mappings, a
significant speedup can be achieved.

IDs that have no passwd/group entry are cached as the empty string, so
a failed lookup is not repeated for every file and uname/gname are never
set to None.

In my simple benchmark[1], this extra caching speeds up tarfile by 8x.

[1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2
---
 Lib/tarfile.py                                | 28 +++++++++++++------
 ...-07-02-15-56-42.gh-issue-121267.yFBWkh.rst |  2 ++
 2 files changed, 22 insertions(+), 8 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst

diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index d5d8a469779f50b..3e16dcd519b1536 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -1658,6 +1658,9 @@ class TarFile(object):
 
     extraction_filter = None    # The default filter for extraction.
 
+    uname_cache = {}            # Cached mappings of uid -> uname, gid -> gname
+    gname_cache = {}
+
     def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
             errors="surrogateescape", pax_headers=None, debug=None,
@@ -2105,16 +2108,25 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None):
         tarinfo.mtime = statres.st_mtime
         tarinfo.type = type
         tarinfo.linkname = linkname
+
+        # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
+        # speed things up, cache the resolved usernames and group names.
         if pwd:
-            try:
-                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
-            except KeyError:
-                pass
+            if tarinfo.uid not in self.uname_cache:
+                try:
+                    self.uname_cache[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
+                except KeyError:
+                    # No passwd entry: remember '' so the lookup isn't repeated.
+                    self.uname_cache[tarinfo.uid] = ''
+            tarinfo.uname = self.uname_cache[tarinfo.uid]
         if grp:
-            try:
-                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
-            except KeyError:
-                pass
+            if tarinfo.gid not in self.gname_cache:
+                try:
+                    self.gname_cache[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
+                except KeyError:
+                    # No group entry: remember '' so the lookup isn't repeated.
+                    self.gname_cache[tarinfo.gid] = ''
+            tarinfo.gname = self.gname_cache[tarinfo.gid]
 
         if type in (CHRTYPE, BLKTYPE):
             if hasattr(os, "major") and hasattr(os, "minor"):
diff --git a/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
new file mode 100644
index 000000000000000..ca18bf37471bad3
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
@@ -0,0 +1,2 @@
+Improve the performance of tarfile when writing files, by caching user names
+and group names.