Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[3.9] gh-121285: Remove backtracking when parsing tarfile headers (GH-121286) #123641

Merged
merged 2 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 67 additions & 38 deletions Lib/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,9 @@ def data_filter(member, dest_path):
# Sentinel for replace() defaults, meaning "don't change the attribute"
_KEEP = object()

# Length prefix of a pax extended header record: 1 to 20 ASCII digits
# (captured as group 1) followed by exactly one space.  The digit run is
# bounded, and the pattern is applied with .match(buf, pos) at a known
# record start rather than searched for across the whole buffer.
_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")

class TarInfo(object):
"""Informational class which holds the details about an
archive member given by a tar header block.
Expand Down Expand Up @@ -1399,59 +1402,76 @@ def _proc_pax(self, tarfile):
else:
pax_headers = tarfile.pax_headers.copy()

# Check if the pax header contains a hdrcharset field. This tells us
# the encoding of the path, linkpath, uname and gname fields. Normally,
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
# implementations are allowed to store them as raw binary strings if
# the translation to UTF-8 fails.
match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
if match is not None:
pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

# For the time being, we don't care about anything other than "BINARY".
# The only other value that is currently allowed by the standard is
# "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
hdrcharset = pax_headers.get("hdrcharset")
if hdrcharset == "BINARY":
encoding = tarfile.encoding
else:
encoding = "utf-8"

# Parse pax header information. A record looks like that:
# "%d %s=%s\n" % (length, keyword, value). length is the size
# of the complete record including the length field itself and
# the newline. keyword and value are both UTF-8 encoded strings.
regex = re.compile(br"(\d+) ([^=]+)=")
# the newline.
pos = 0
while True:
match = regex.match(buf, pos)
if not match:
break
encoding = None
raw_headers = []
while len(buf) > pos and buf[pos] != 0x00:
if not (match := _header_length_prefix_re.match(buf, pos)):
raise InvalidHeaderError("invalid header")
try:
length = int(match.group(1))
except ValueError:
raise InvalidHeaderError("invalid header")
# Headers must be at least 5 bytes, shortest being '5 x=\n'.
# Value is allowed to be empty.
if length < 5:
raise InvalidHeaderError("invalid header")
if pos + length > len(buf):
raise InvalidHeaderError("invalid header")

length, keyword = match.groups()
length = int(length)
if length == 0:
header_value_end_offset = match.start(1) + length - 1 # Last byte of the header
keyword_and_value = buf[match.end(1) + 1:header_value_end_offset]
raw_keyword, equals, raw_value = keyword_and_value.partition(b"=")

# Check the framing of the header. The last character must be '\n' (0x0A)
if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A:
raise InvalidHeaderError("invalid header")
value = buf[match.end(2) + 1:match.start(1) + length - 1]
raw_headers.append((length, raw_keyword, raw_value))

# Check if the pax header contains a hdrcharset field. This tells us
# the encoding of the path, linkpath, uname and gname fields. Normally,
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
# implementations are allowed to store them as raw binary strings if
# the translation to UTF-8 fails. For the time being, we don't care about
# anything other than "BINARY". The only other value that is currently
# allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
# Note that we only follow the initial 'hdrcharset' setting to preserve
# the initial behavior of the 'tarfile' module.
if raw_keyword == b"hdrcharset" and encoding is None:
if raw_value == b"BINARY":
encoding = tarfile.encoding
else: # This branch ensures only the first 'hdrcharset' header is used.
encoding = "utf-8"

pos += length

# If no explicit hdrcharset is set, we use UTF-8 as a default.
if encoding is None:
encoding = "utf-8"

# After parsing the raw headers we can decode them to text.
for length, raw_keyword, raw_value in raw_headers:
# Normally, we could just use "utf-8" as the encoding and "strict"
# as the error handler, but we better not take the risk. For
# example, GNU tar <= 1.23 is known to store filenames it cannot
# translate to UTF-8 as raw strings (unfortunately without a
# hdrcharset=BINARY header).
# We first try the strict standard encoding, and if that fails we
# fall back on the user's encoding and error handler.
keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8",
tarfile.errors)
if keyword in PAX_NAME_FIELDS:
value = self._decode_pax_field(value, encoding, tarfile.encoding,
value = self._decode_pax_field(raw_value, encoding, tarfile.encoding,
tarfile.errors)
else:
value = self._decode_pax_field(value, "utf-8", "utf-8",
value = self._decode_pax_field(raw_value, "utf-8", "utf-8",
tarfile.errors)

pax_headers[keyword] = value
pos += length

# Fetch the next header.
try:
Expand All @@ -1466,7 +1486,7 @@ def _proc_pax(self, tarfile):

elif "GNU.sparse.size" in pax_headers:
# GNU extended sparse format version 0.0.
self._proc_gnusparse_00(next, pax_headers, buf)
self._proc_gnusparse_00(next, raw_headers)

elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
# GNU extended sparse format version 1.0.
Expand All @@ -1488,15 +1508,24 @@ def _proc_pax(self, tarfile):

return next

def _proc_gnusparse_00(self, next, raw_headers):
    """Process a GNU tar extended sparse header, version 0.0.

    *raw_headers* is an iterable of ``(length, keyword, value)`` triples
    in which *keyword* and *value* are the undecoded ``bytes`` of each
    pax record.  Every ``GNU.sparse.offset`` and ``GNU.sparse.numbytes``
    record is collected in order of appearance, and the pairs are stored
    on ``next.sparse`` as a list of ``(offset, numbytes)`` tuples.

    Raises InvalidHeaderError if one of those values is not a valid
    integer.
    """
    offsets = []
    numbytes = []
    for _, keyword, value in raw_headers:
        if keyword == b"GNU.sparse.offset":
            try:
                # UnicodeDecodeError is a subclass of ValueError, so a
                # value that cannot be decoded is reported the same way
                # as one that is not a number.
                offsets.append(int(value.decode()))
            except ValueError:
                raise InvalidHeaderError("invalid header")

        elif keyword == b"GNU.sparse.numbytes":
            try:
                numbytes.append(int(value.decode()))
            except ValueError:
                raise InvalidHeaderError("invalid header")

    # zip() stops at the shorter list, so an unpaired trailing offset or
    # numbytes record is dropped rather than reported.
    next.sparse = list(zip(offsets, numbytes))

def _proc_gnusparse_01(self, next, pax_headers):
Expand Down
42 changes: 42 additions & 0 deletions Lib/test/test_tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,48 @@ def test_pax_number_fields(self):
finally:
tar.close()

def test_pax_header_bad_formats(self):
    """Malformed pax extended header records must be rejected.

    A valid PAX archive containing the record ``11 foo=bar\\n`` is
    written, that record is replaced on disk with a malformed variant,
    and reopening the archive is expected to fail with ReadError.
    """
    # Each entry replaces the well-formed "11 foo=bar\n" record and is
    # broken in a different way: missing, too small, too large or
    # negative length, an empty keyword, or an over-long digit prefix.
    pax_header_replacements = (
        b" foo=bar\n",
        b"0 \n",
        b"1 \n",
        b"2 \n",
        b"3 =\n",
        b"4 =a\n",
        b"1000000 foo=bar\n",
        b"0 foo=bar\n",
        b"-12 foo=bar\n",
        b"000000000000000000000000036 foo=bar\n",
    )
    pax_headers = {"foo": "bar"}

    for replacement in pax_header_replacements:
        with self.subTest(header=replacement):
            # Write a fresh, valid archive for every sub-test.
            with tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT,
                              encoding="iso8859-1") as tar:
                t = tarfile.TarInfo()
                t.name = "pax"
                t.uid = 1
                t.pax_headers = pax_headers
                tar.addfile(t)

            with open(tmpname, "rb") as f:
                data = f.read()
            self.assertIn(b"11 foo=bar\n", data)
            data = data.replace(b"11 foo=bar\n", replacement)

            # Opening with "wb" already truncates the file.
            with open(tmpname, "wb") as f:
                f.write(data)

            with self.assertRaisesRegex(tarfile.ReadError, r"file could not be opened successfully"):
                # The with-statement closes the archive if opening it
                # unexpectedly succeeds, so a failing sub-test does not
                # leak an open file handle.
                with tarfile.open(tmpname, encoding="iso8859-1"):
                    pass


class WriteTestBase(TarTest):
# Put all write tests in here that are supposed to be tested
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Remove backtracking from tarfile header parsing for ``hdrcharset``, PAX, and
GNU sparse headers.
Loading