Skip to content

Commit

Permalink
gh-121267: Improve performance of tarfile (#121267)
Browse files Browse the repository at this point in the history
In its default write mode, tarfile spends much of its time resolving UIDs
into user names and GIDs into group names. Caching these mappings yields a
significant speedup.

In my simple benchmark[1], this extra caching speeds up tarfile by 8x.

[1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2
  • Loading branch information
jforberg committed Jul 2, 2024
1 parent 6343486 commit 41397ce
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 8 deletions.
28 changes: 20 additions & 8 deletions Lib/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1658,6 +1658,9 @@ class TarFile(object):

extraction_filter = None # The default filter for extraction.

uname_cache = {} # Cached mappings of uid -> uname, gid -> gname
gname_cache = {}

def __init__(self, name=None, mode="r", fileobj=None, format=None,
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
errors="surrogateescape", pax_headers=None, debug=None,
Expand Down Expand Up @@ -2105,16 +2108,25 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None):
tarinfo.mtime = statres.st_mtime
tarinfo.type = type
tarinfo.linkname = linkname

# Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
# speed things up, cache the resolved usernames and group names.
if pwd:
try:
tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
except KeyError:
pass
if not tarinfo.uid in self.uname_cache:
try:
self.uname_cache[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
except KeyError:
pass

tarinfo.uname = self.uname_cache.get(tarinfo.uid, None)
if grp:
try:
tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
except KeyError:
pass
if not tarinfo.gid in self.gname_cache:
try:
self.gname_cache[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
except KeyError:
pass

tarinfo.gname = self.gname_cache.get(tarinfo.gid, None)

if type in (CHRTYPE, BLKTYPE):
if hasattr(os, "major") and hasattr(os, "minor"):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve the performance of tarfile when writing files by caching the user
names and group names looked up for each UID and GID.

0 comments on commit 41397ce

Please sign in to comment.