From 743acbe872485dc18df4d8ab2dc7895187f062c4 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Tue, 3 Sep 2024 10:07:53 -0500 Subject: [PATCH 1/7] [3.10] gh-121285: Remove backtracking when parsing tarfile headers (GH-121286) (#123640) * Remove backtracking when parsing tarfile headers * Rewrite PAX header parsing to be stricter * Optimize parsing of GNU extended sparse headers v0.0 (cherry picked from commit 34ddb64d088dd7ccc321f6103d23153256caa5d4) Co-authored-by: Kirill Podoprigora Co-authored-by: Gregory P. Smith --- Lib/tarfile.py | 105 +++++++++++------- Lib/test/test_tarfile.py | 42 +++++++ ...-07-02-13-39-20.gh-issue-121285.hrl-yI.rst | 2 + 3 files changed, 111 insertions(+), 38 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 495349f08f9e76..3ab6811d63335b 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -841,6 +841,9 @@ def data_filter(member, dest_path): # Sentinel for replace() defaults, meaning "don't change the attribute" _KEEP = object() +# Header length is digits followed by a space. +_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ") + class TarInfo(object): """Informational class which holds the details about an archive member given by a tar header block. @@ -1410,41 +1413,59 @@ def _proc_pax(self, tarfile): else: pax_headers = tarfile.pax_headers.copy() - # Check if the pax header contains a hdrcharset field. This tells us - # the encoding of the path, linkpath, uname and gname fields. Normally, - # these fields are UTF-8 encoded but since POSIX.1-2008 tar - # implementations are allowed to store them as raw binary strings if - # the translation to UTF-8 fails. - match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf) - if match is not None: - pax_headers["hdrcharset"] = match.group(1).decode("utf-8") - - # For the time being, we don't care about anything other than "BINARY". 
- # The only other value that is currently allowed by the standard is - # "ISO-IR 10646 2000 UTF-8" in other words UTF-8. - hdrcharset = pax_headers.get("hdrcharset") - if hdrcharset == "BINARY": - encoding = tarfile.encoding - else: - encoding = "utf-8" - # Parse pax header information. A record looks like that: # "%d %s=%s\n" % (length, keyword, value). length is the size # of the complete record including the length field itself and - # the newline. keyword and value are both UTF-8 encoded strings. - regex = re.compile(br"(\d+) ([^=]+)=") + # the newline. pos = 0 - while True: - match = regex.match(buf, pos) - if not match: - break + encoding = None + raw_headers = [] + while len(buf) > pos and buf[pos] != 0x00: + if not (match := _header_length_prefix_re.match(buf, pos)): + raise InvalidHeaderError("invalid header") + try: + length = int(match.group(1)) + except ValueError: + raise InvalidHeaderError("invalid header") + # Headers must be at least 5 bytes, shortest being '5 x=\n'. + # Value is allowed to be empty. + if length < 5: + raise InvalidHeaderError("invalid header") + if pos + length > len(buf): + raise InvalidHeaderError("invalid header") - length, keyword = match.groups() - length = int(length) - if length == 0: + header_value_end_offset = match.start(1) + length - 1 # Last byte of the header + keyword_and_value = buf[match.end(1) + 1:header_value_end_offset] + raw_keyword, equals, raw_value = keyword_and_value.partition(b"=") + + # Check the framing of the header. The last character must be '\n' (0x0A) + if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A: raise InvalidHeaderError("invalid header") - value = buf[match.end(2) + 1:match.start(1) + length - 1] + raw_headers.append((length, raw_keyword, raw_value)) + + # Check if the pax header contains a hdrcharset field. This tells us + # the encoding of the path, linkpath, uname and gname fields. 
Normally, + # these fields are UTF-8 encoded but since POSIX.1-2008 tar + # implementations are allowed to store them as raw binary strings if + # the translation to UTF-8 fails. For the time being, we don't care about + # anything other than "BINARY". The only other value that is currently + # allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8. + # Note that we only follow the initial 'hdrcharset' setting to preserve + # the initial behavior of the 'tarfile' module. + if raw_keyword == b"hdrcharset" and encoding is None: + if raw_value == b"BINARY": + encoding = tarfile.encoding + else: # This branch ensures only the first 'hdrcharset' header is used. + encoding = "utf-8" + + pos += length + # If no explicit hdrcharset is set, we use UTF-8 as a default. + if encoding is None: + encoding = "utf-8" + + # After parsing the raw headers we can decode them to text. + for length, raw_keyword, raw_value in raw_headers: # Normally, we could just use "utf-8" as the encoding and "strict" # as the error handler, but we better not take the risk. For # example, GNU tar <= 1.23 is known to store filenames it cannot @@ -1452,17 +1473,16 @@ def _proc_pax(self, tarfile): # hdrcharset=BINARY header). # We first try the strict standard encoding, and if that fails we # fall back on the user's encoding and error handler. - keyword = self._decode_pax_field(keyword, "utf-8", "utf-8", + keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8", tarfile.errors) if keyword in PAX_NAME_FIELDS: - value = self._decode_pax_field(value, encoding, tarfile.encoding, + value = self._decode_pax_field(raw_value, encoding, tarfile.encoding, tarfile.errors) else: - value = self._decode_pax_field(value, "utf-8", "utf-8", + value = self._decode_pax_field(raw_value, "utf-8", "utf-8", tarfile.errors) pax_headers[keyword] = value - pos += length # Fetch the next header. 
try: @@ -1477,7 +1497,7 @@ def _proc_pax(self, tarfile): elif "GNU.sparse.size" in pax_headers: # GNU extended sparse format version 0.0. - self._proc_gnusparse_00(next, pax_headers, buf) + self._proc_gnusparse_00(next, raw_headers) elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0": # GNU extended sparse format version 1.0. @@ -1499,15 +1519,24 @@ def _proc_pax(self, tarfile): return next - def _proc_gnusparse_00(self, next, pax_headers, buf): + def _proc_gnusparse_00(self, next, raw_headers): """Process a GNU tar extended sparse header, version 0.0. """ offsets = [] - for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf): - offsets.append(int(match.group(1))) numbytes = [] - for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf): - numbytes.append(int(match.group(1))) + for _, keyword, value in raw_headers: + if keyword == b"GNU.sparse.offset": + try: + offsets.append(int(value.decode())) + except ValueError: + raise InvalidHeaderError("invalid header") + + elif keyword == b"GNU.sparse.numbytes": + try: + numbytes.append(int(value.decode())) + except ValueError: + raise InvalidHeaderError("invalid header") + next.sparse = list(zip(offsets, numbytes)) def _proc_gnusparse_01(self, next, pax_headers): diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index cfc13bccb2040c..007c3e94acb876 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -1139,6 +1139,48 @@ def test_pax_number_fields(self): finally: tar.close() + def test_pax_header_bad_formats(self): + # The fields from the pax header have priority over the + # TarInfo. 
+ pax_header_replacements = ( + b" foo=bar\n", + b"0 \n", + b"1 \n", + b"2 \n", + b"3 =\n", + b"4 =a\n", + b"1000000 foo=bar\n", + b"0 foo=bar\n", + b"-12 foo=bar\n", + b"000000000000000000000000036 foo=bar\n", + ) + pax_headers = {"foo": "bar"} + + for replacement in pax_header_replacements: + with self.subTest(header=replacement): + tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT, + encoding="iso8859-1") + try: + t = tarfile.TarInfo() + t.name = "pax" # non-ASCII + t.uid = 1 + t.pax_headers = pax_headers + tar.addfile(t) + finally: + tar.close() + + with open(tmpname, "rb") as f: + data = f.read() + self.assertIn(b"11 foo=bar\n", data) + data = data.replace(b"11 foo=bar\n", replacement) + + with open(tmpname, "wb") as f: + f.truncate() + f.write(data) + + with self.assertRaisesRegex(tarfile.ReadError, r"method tar: ReadError\('invalid header'\)"): + tarfile.open(tmpname, encoding="iso8859-1") + class WriteTestBase(TarTest): # Put all write tests in here that are supposed to be tested diff --git a/Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst b/Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst new file mode 100644 index 00000000000000..81f918bfe2b255 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst @@ -0,0 +1,2 @@ +Remove backtracking from tarfile header parsing for ``hdrcharset``, PAX, and +GNU sparse headers. From 06f28dc236708f72871c64d4bc4b4ea144c50147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Wed, 4 Sep 2024 17:38:31 +0200 Subject: [PATCH 2/7] [3.10] gh-121650: Encode newlines in headers, and verify headers are sound (GH-122233) (#122609) Per RFC 2047: > [...] 
these encoding schemes allow the > encoding of arbitrary octet values, mail readers that implement this > decoding should also ensure that display of the decoded data on the > recipient's terminal will not cause unwanted side-effects It seems that the "quoted-word" scheme is a valid way to include a newline character in a header value, just like we already allow undecodable bytes or control characters. They do need to be properly quoted when serialized to text, though. This should fail for custom fold() implementations that aren't careful about newlines. (cherry picked from commit 097633981879b3c9de9a1dd120d3aa585ecc2384) Co-authored-by: Petr Viktorin Co-authored-by: Bas Bloemsaat Co-authored-by: Serhiy Storchaka --- Doc/library/email.errors.rst | 6 ++ Doc/library/email.policy.rst | 18 ++++++ Doc/whatsnew/3.10.rst | 12 ++++ Lib/email/_header_value_parser.py | 12 +++- Lib/email/_policybase.py | 8 +++ Lib/email/errors.py | 4 ++ Lib/email/generator.py | 13 +++- Lib/test/test_email/test_generator.py | 62 +++++++++++++++++++ Lib/test/test_email/test_policy.py | 26 ++++++++ ...-07-27-16-10-41.gh-issue-121650.nf6oc9.rst | 5 ++ 10 files changed, 162 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst diff --git a/Doc/library/email.errors.rst b/Doc/library/email.errors.rst index 194a98696f437d..f737f0282c5489 100644 --- a/Doc/library/email.errors.rst +++ b/Doc/library/email.errors.rst @@ -59,6 +59,12 @@ The following exception classes are defined in the :mod:`email.errors` module: :class:`~email.mime.image.MIMEImage`). +.. exception:: HeaderWriteError() + + Raised when an error occurs when the :mod:`~email.generator` outputs + headers. + + Here is the list of the defects that the :class:`~email.parser.FeedParser` can find while parsing messages. 
Note that the defects are added to the message where the problem was found, so for example, if a message nested inside a diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index bf53b9520fc723..eba43b5169ddcf 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -229,6 +229,24 @@ added matters. To illustrate:: .. versionadded:: 3.6 + + .. attribute:: verify_generated_headers + + If ``True`` (the default), the generator will raise + :exc:`~email.errors.HeaderWriteError` instead of writing a header + that is improperly folded or delimited, such that it would + be parsed as multiple headers or joined with adjacent data. + Such headers can be generated by custom header classes or bugs + in the ``email`` module. + + As it's a security feature, this defaults to ``True`` even in the + :class:`~email.policy.Compat32` policy. + For backwards compatible, but unsafe, behavior, it must be set to + ``False`` explicitly. + + .. versionadded:: 3.10.15 + + The following :class:`Policy` method is intended to be called by code using the email library to create policy instances with custom settings: diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index f71a50163f49ea..2d9f7608162863 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -2372,3 +2372,15 @@ ipaddress * Fixed ``is_global`` and ``is_private`` behavior in ``IPv4Address``, ``IPv6Address``, ``IPv4Network`` and ``IPv6Network``. + +email +----- + +* Headers with embedded newlines are now quoted on output. + + The :mod:`~email.generator` will now refuse to serialize (write) headers + that are improperly folded or delimited, such that they would be parsed as + multiple headers or joined with adjacent data. + If you need to turn this safety feature off, + set :attr:`~email.policy.Policy.verify_generated_headers`. + (Contributed by Bas Bloemsaat and Petr Viktorin in :gh:`121650`.) 
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index e637e6df06612d..e1b99d5b417253 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -92,6 +92,8 @@ ASPECIALS = TSPECIALS | set("*'%") ATTRIBUTE_ENDS = ASPECIALS | WSP EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') +NLSET = {'\n', '\r'} +SPECIALSNL = SPECIALS | NLSET def quote_string(value): return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' @@ -2778,9 +2780,13 @@ def _refold_parse_tree(parse_tree, *, policy): wrap_as_ew_blocked -= 1 continue tstr = str(part) - if part.token_type == 'ptext' and set(tstr) & SPECIALS: - # Encode if tstr contains special characters. - want_encoding = True + if not want_encoding: + if part.token_type == 'ptext': + # Encode if tstr contains special characters. + want_encoding = not SPECIALSNL.isdisjoint(tstr) + else: + # Encode if tstr contains newlines. + want_encoding = not NLSET.isdisjoint(tstr) try: tstr.encode(encoding) charset = encoding diff --git a/Lib/email/_policybase.py b/Lib/email/_policybase.py index c9cbadd2a80c48..d1f48211f90970 100644 --- a/Lib/email/_policybase.py +++ b/Lib/email/_policybase.py @@ -157,6 +157,13 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta): message_factory -- the class to use to create new message objects. If the value is None, the default is Message. + verify_generated_headers + -- if true, the generator verifies that each header + is properly folded, so that a parser won't + treat it as multiple headers, start-of-body, or + part of another header. + This is a check against custom Header & fold() + implementations. """ raise_on_defect = False @@ -165,6 +172,7 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta): max_line_length = 78 mangle_from_ = False message_factory = None + verify_generated_headers = True def handle_defect(self, obj, defect): """Based on policy, either raise defect or call register_defect. 
diff --git a/Lib/email/errors.py b/Lib/email/errors.py index 3ad00565549968..02aa5eced6ae46 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -29,6 +29,10 @@ class CharsetError(MessageError): """An illegal charset was given.""" +class HeaderWriteError(MessageError): + """Error while writing headers.""" + + # These are parsing defects which the parser was able to work around. class MessageDefect(ValueError): """Base class for a message defect.""" diff --git a/Lib/email/generator.py b/Lib/email/generator.py index c9b121624e08d5..89224ae41cbc67 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -14,12 +14,14 @@ from copy import deepcopy from io import StringIO, BytesIO from email.utils import _has_surrogates +from email.errors import HeaderWriteError UNDERSCORE = '_' NL = '\n' # XXX: no longer used by the code below. NLCRE = re.compile(r'\r\n|\r|\n') fcre = re.compile(r'^From ', re.MULTILINE) +NEWLINE_WITHOUT_FWSP = re.compile(r'\r\n[^ \t]|\r[^ \n\t]|\n[^ \t]') @@ -223,7 +225,16 @@ def _dispatch(self, msg): def _write_headers(self, msg): for h, v in msg.raw_items(): - self.write(self.policy.fold(h, v)) + folded = self.policy.fold(h, v) + if self.policy.verify_generated_headers: + linesep = self.policy.linesep + if not folded.endswith(self.policy.linesep): + raise HeaderWriteError( + f'folded header does not end with {linesep!r}: {folded!r}') + if NEWLINE_WITHOUT_FWSP.search(folded.removesuffix(linesep)): + raise HeaderWriteError( + f'folded header contains newline: {folded!r}') + self.write(folded) # A blank line always separates headers from body self.write(self._NL) diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index 89e7edeb63a892..d29400f0ed1dbb 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -6,6 +6,7 @@ from email.generator import Generator, BytesGenerator from email.headerregistry import Address from email import policy +import 
email.errors from test.test_email import TestEmailBase, parameterize @@ -216,6 +217,44 @@ def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) + def test_keep_encoded_newlines(self): + msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: nobody + Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com + + None + """))) + expected = textwrap.dedent("""\ + To: nobody + Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=80)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + + def test_keep_long_encoded_newlines(self): + msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: nobody + Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com + + None + """))) + expected = textwrap.dedent("""\ + To: nobody + Subject: Bad subject + =?utf-8?q?=0A?=Bcc: + injection@example.com + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=30)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + class TestGenerator(TestGeneratorBase, TestEmailBase): @@ -224,6 +263,29 @@ class TestGenerator(TestGeneratorBase, TestEmailBase): ioclass = io.StringIO typ = str + def test_verify_generated_headers(self): + """gh-121650: by default the generator prevents header injection""" + class LiteralHeader(str): + name = 'Header' + def fold(self, **kwargs): + return self + + for text in ( + 'Value\r\nBad Injection\r\n', + 'NoNewLine' + ): + with self.subTest(text=text): + message = message_from_string( + "Header: Value\r\n\r\nBody", + policy=self.policy, + ) + + del message['Header'] + message['Header'] = LiteralHeader(text) + + with self.assertRaises(email.errors.HeaderWriteError): + message.as_string() + class TestBytesGenerator(TestGeneratorBase, TestEmailBase): diff --git 
a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py index e87c275549406d..ff1ddf7d7a8fca 100644 --- a/Lib/test/test_email/test_policy.py +++ b/Lib/test/test_email/test_policy.py @@ -26,6 +26,7 @@ class PolicyAPITests(unittest.TestCase): 'raise_on_defect': False, 'mangle_from_': True, 'message_factory': None, + 'verify_generated_headers': True, } # These default values are the ones set on email.policy.default. # If any of these defaults change, the docs must be updated. @@ -277,6 +278,31 @@ def test_short_maxlen_error(self): with self.assertRaises(email.errors.HeaderParseError): policy.fold("Subject", subject) + def test_verify_generated_headers(self): + """Turning protection off allows header injection""" + policy = email.policy.default.clone(verify_generated_headers=False) + for text in ( + 'Header: Value\r\nBad: Injection\r\n', + 'Header: NoNewLine' + ): + with self.subTest(text=text): + message = email.message_from_string( + "Header: Value\r\n\r\nBody", + policy=policy, + ) + class LiteralHeader(str): + name = 'Header' + def fold(self, **kwargs): + return self + + del message['Header'] + message['Header'] = LiteralHeader(text) + + self.assertEqual( + message.as_string(), + f"{text}\nBody", + ) + # XXX: Need subclassing tests. # For adding subclassed objects, make sure the usual rules apply (subclass # wins), but that the order still works (right overrides left). diff --git a/Misc/NEWS.d/next/Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst b/Misc/NEWS.d/next/Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst new file mode 100644 index 00000000000000..83dd28d4ac575b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-27-16-10-41.gh-issue-121650.nf6oc9.rst @@ -0,0 +1,5 @@ +:mod:`email` headers with embedded newlines are now quoted on output. The +:mod:`~email.generator` will now refuse to serialize (write) headers that +are unsafely folded or delimited; see +:attr:`~email.policy.Policy.verify_generated_headers`. 
(Contributed by Bas +Bloemsaat and Petr Viktorin in :gh:`121650`.) From 2fa5d706843476cf43ff3b431e3c4a8c34737413 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Sep 2024 18:43:30 +0300 Subject: [PATCH 3/7] [3.10] gh-67693: Fix urlunparse() and urlunsplit() for URIs with path starting with multiple slashes and no authority (GH-113563) (#119026) (cherry picked from commit e237b25a4fa5626fcd1b1848aa03f725f892e40e) --- Lib/test/test_urlparse.py | 70 ++++++++++++++++++- Lib/urllib/parse.py | 2 +- ...9-08-27-01-16-50.gh-issue-67693.4NIAiy.rst | 2 + 3 files changed, 70 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index b0aed37de7dcae..5e08aa1bbad89d 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -70,7 +70,9 @@ class UrlParseTestCase(unittest.TestCase): - def checkRoundtrips(self, url, parsed, split): + def checkRoundtrips(self, url, parsed, split, url2=None): + if url2 is None: + url2 = url result = urllib.parse.urlparse(url) self.assertEqual(result, parsed) t = (result.scheme, result.netloc, result.path, @@ -78,7 +80,7 @@ def checkRoundtrips(self, url, parsed, split): self.assertEqual(t, parsed) # put it back together and it should be the same result2 = urllib.parse.urlunparse(result) - self.assertEqual(result2, url) + self.assertEqual(result2, url2) self.assertEqual(result2, result.geturl()) # the result of geturl() is a fixpoint; we can always parse it @@ -104,7 +106,7 @@ def checkRoundtrips(self, url, parsed, split): result.query, result.fragment) self.assertEqual(t, split) result2 = urllib.parse.urlunsplit(result) - self.assertEqual(result2, url) + self.assertEqual(result2, url2) self.assertEqual(result2, result.geturl()) # check the fixpoint property of re-parsing the result of geturl() @@ -142,9 +144,39 @@ def test_qs(self): def test_roundtrips(self): str_cases = [ + ('path/to/file', + 
('', '', 'path/to/file', '', '', ''), + ('', '', 'path/to/file', '', '')), + ('/path/to/file', + ('', '', '/path/to/file', '', '', ''), + ('', '', '/path/to/file', '', '')), + ('//path/to/file', + ('', 'path', '/to/file', '', '', ''), + ('', 'path', '/to/file', '', '')), + ('////path/to/file', + ('', '', '//path/to/file', '', '', ''), + ('', '', '//path/to/file', '', '')), + ('scheme:path/to/file', + ('scheme', '', 'path/to/file', '', '', ''), + ('scheme', '', 'path/to/file', '', '')), + ('scheme:/path/to/file', + ('scheme', '', '/path/to/file', '', '', ''), + ('scheme', '', '/path/to/file', '', '')), + ('scheme://path/to/file', + ('scheme', 'path', '/to/file', '', '', ''), + ('scheme', 'path', '/to/file', '', '')), + ('scheme:////path/to/file', + ('scheme', '', '//path/to/file', '', '', ''), + ('scheme', '', '//path/to/file', '', '')), ('file:///tmp/junk.txt', ('file', '', '/tmp/junk.txt', '', '', ''), ('file', '', '/tmp/junk.txt', '', '')), + ('file:////tmp/junk.txt', + ('file', '', '//tmp/junk.txt', '', '', ''), + ('file', '', '//tmp/junk.txt', '', '')), + ('file://///tmp/junk.txt', + ('file', '', '///tmp/junk.txt', '', '', ''), + ('file', '', '///tmp/junk.txt', '', '')), ('imap://mail.python.org/mbox1', ('imap', 'mail.python.org', '/mbox1', '', '', ''), ('imap', 'mail.python.org', '/mbox1', '', '')), @@ -175,6 +207,38 @@ def _encode(t): for url, parsed, split in str_cases + bytes_cases: self.checkRoundtrips(url, parsed, split) + def test_roundtrips_normalization(self): + str_cases = [ + ('///path/to/file', + '/path/to/file', + ('', '', '/path/to/file', '', '', ''), + ('', '', '/path/to/file', '', '')), + ('scheme:///path/to/file', + 'scheme:/path/to/file', + ('scheme', '', '/path/to/file', '', '', ''), + ('scheme', '', '/path/to/file', '', '')), + ('file:/tmp/junk.txt', + 'file:///tmp/junk.txt', + ('file', '', '/tmp/junk.txt', '', '', ''), + ('file', '', '/tmp/junk.txt', '', '')), + ('http:/tmp/junk.txt', + 'http:///tmp/junk.txt', + ('http', '', 
'/tmp/junk.txt', '', '', ''), + ('http', '', '/tmp/junk.txt', '', '')), + ('https:/tmp/junk.txt', + 'https:///tmp/junk.txt', + ('https', '', '/tmp/junk.txt', '', '', ''), + ('https', '', '/tmp/junk.txt', '', '')), + ] + def _encode(t): + return (t[0].encode('ascii'), + t[1].encode('ascii'), + tuple(x.encode('ascii') for x in t[2]), + tuple(x.encode('ascii') for x in t[3])) + bytes_cases = [_encode(x) for x in str_cases] + for url, url2, parsed, split in str_cases + bytes_cases: + self.checkRoundtrips(url, parsed, split, url2) + def test_http_roundtrips(self): # urllib.parse.urlsplit treats 'http:' as an optimized special case, # so we test both 'http:' and 'https:' in all the following. diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 0ab2023843f6b5..44806e67a8fc01 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -521,7 +521,7 @@ def urlunsplit(components): empty query; the RFC states that these are equivalent).""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) - if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): + if netloc or (scheme and scheme in uses_netloc) or url[:2] == '//': if url and url[:1] != '/': url = '/' + url url = '//' + (netloc or '') + url if scheme: diff --git a/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst b/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst new file mode 100644 index 00000000000000..22457df03e65c9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst @@ -0,0 +1,2 @@ +Fix :func:`urllib.parse.urlunparse` and :func:`urllib.parse.urlunsplit` for URIs with path starting with multiple slashes and no authority. +Based on patch by Ashwin Ramaswami. 
From b2f11ca7667e4d57c71c1c88b255115f16042d9a Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:50:36 +0200 Subject: [PATCH 4/7] [3.10] gh-123067: Fix quadratic complexity in parsing "-quoted cookie values with backslashes (GH-123075) (#123106) This fixes CVE-2024-7592. (cherry picked from commit 44e458357fca05ca0ae2658d62c8c595b048b5ef) Co-authored-by: Serhiy Storchaka --- Lib/http/cookies.py | 34 ++++------------- Lib/test/test_http_cookies.py | 38 +++++++++++++++++++ ...-08-16-19-13-21.gh-issue-123067.Nx9O4R.rst | 1 + 3 files changed, 47 insertions(+), 26 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-08-16-19-13-21.gh-issue-123067.Nx9O4R.rst diff --git a/Lib/http/cookies.py b/Lib/http/cookies.py index 35ac2dc6ae280c..2c1f021d0abede 100644 --- a/Lib/http/cookies.py +++ b/Lib/http/cookies.py @@ -184,8 +184,13 @@ def _quote(str): return '"' + str.translate(_Translator) + '"' -_OctalPatt = re.compile(r"\\[0-3][0-7][0-7]") -_QuotePatt = re.compile(r"[\\].") +_unquote_sub = re.compile(r'\\(?:([0-3][0-7][0-7])|(.))').sub + +def _unquote_replace(m): + if m[1]: + return chr(int(m[1], 8)) + else: + return m[2] def _unquote(str): # If there aren't any doublequotes, @@ -205,30 +210,7 @@ def _unquote(str): # \012 --> \n # \" --> " # - i = 0 - n = len(str) - res = [] - while 0 <= i < n: - o_match = _OctalPatt.search(str, i) - q_match = _QuotePatt.search(str, i) - if not o_match and not q_match: # Neither matched - res.append(str[i:]) - break - # else: - j = k = -1 - if o_match: - j = o_match.start(0) - if q_match: - k = q_match.start(0) - if q_match and (not o_match or k < j): # QuotePatt matched - res.append(str[i:k]) - res.append(str[k+1]) - i = k + 2 - else: # OctalPatt matched - res.append(str[i:j]) - res.append(chr(int(str[j+1:j+4], 8))) - i = j + 4 - return _nulljoin(res) + return _unquote_sub(_unquote_replace, str) # The _getdate() routine is used to set the expiration time 
in the cookie's HTTP # header. By default, _getdate() returns the current time in the appropriate diff --git a/Lib/test/test_http_cookies.py b/Lib/test/test_http_cookies.py index 6072c7e15e92be..644e75cd5b742e 100644 --- a/Lib/test/test_http_cookies.py +++ b/Lib/test/test_http_cookies.py @@ -5,6 +5,7 @@ import unittest from http import cookies import pickle +from test import support class CookieTests(unittest.TestCase): @@ -58,6 +59,43 @@ def test_basic(self): for k, v in sorted(case['dict'].items()): self.assertEqual(C[k].value, v) + def test_unquote(self): + cases = [ + (r'a="b=\""', 'b="'), + (r'a="b=\\"', 'b=\\'), + (r'a="b=\="', 'b=='), + (r'a="b=\n"', 'b=n'), + (r'a="b=\042"', 'b="'), + (r'a="b=\134"', 'b=\\'), + (r'a="b=\377"', 'b=\xff'), + (r'a="b=\400"', 'b=400'), + (r'a="b=\42"', 'b=42'), + (r'a="b=\\042"', 'b=\\042'), + (r'a="b=\\134"', 'b=\\134'), + (r'a="b=\\\""', 'b=\\"'), + (r'a="b=\\\042"', 'b=\\"'), + (r'a="b=\134\""', 'b=\\"'), + (r'a="b=\134\042"', 'b=\\"'), + ] + for encoded, decoded in cases: + with self.subTest(encoded): + C = cookies.SimpleCookie() + C.load(encoded) + self.assertEqual(C['a'].value, decoded) + + @support.requires_resource('cpu') + def test_unquote_large(self): + n = 10**6 + for encoded in r'\\', r'\134': + with self.subTest(encoded): + data = 'a="b=' + encoded*n + ';"' + C = cookies.SimpleCookie() + C.load(data) + value = C['a'].value + self.assertEqual(value[:3], 'b=\\') + self.assertEqual(value[-2:], '\\;') + self.assertEqual(len(value), n + 3) + def test_load(self): C = cookies.SimpleCookie() C.load('Customer="WILE_E_COYOTE"; Version=1; Path=/acme') diff --git a/Misc/NEWS.d/next/Library/2024-08-16-19-13-21.gh-issue-123067.Nx9O4R.rst b/Misc/NEWS.d/next/Library/2024-08-16-19-13-21.gh-issue-123067.Nx9O4R.rst new file mode 100644 index 00000000000000..6a234561fe31a3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-08-16-19-13-21.gh-issue-123067.Nx9O4R.rst @@ -0,0 +1 @@ +Fix quadratic complexity in parsing ``"``-quoted cookie 
values with backslashes by :mod:`http.cookies`. From 2e861ac1cd4359463f6a13efd3d3578fce71e5ab Mon Sep 17 00:00:00 2001 From: Steve Dower Date: Wed, 4 Sep 2024 16:57:40 +0100 Subject: [PATCH 5/7] [3.10] gh-119690: Fixes buffer type confusion in _winapi.CreateFile and _winapi.CreateNamedPipe audit events (#119735) gh-119690: Fixes buffer type confusion in _winapi.CreateFile and _winapi.CreateNamedPipe audit events --- .../Windows/2024-05-29-17-05-28.gh-issue-119690.U6RMtm.rst | 2 ++ Modules/_winapi.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Windows/2024-05-29-17-05-28.gh-issue-119690.U6RMtm.rst diff --git a/Misc/NEWS.d/next/Windows/2024-05-29-17-05-28.gh-issue-119690.U6RMtm.rst b/Misc/NEWS.d/next/Windows/2024-05-29-17-05-28.gh-issue-119690.U6RMtm.rst new file mode 100644 index 00000000000000..44889794d9a465 --- /dev/null +++ b/Misc/NEWS.d/next/Windows/2024-05-29-17-05-28.gh-issue-119690.U6RMtm.rst @@ -0,0 +1,2 @@ +Fixes data type confusion in audit events raised by ``_winapi.CreateFile`` +and ``_winapi.CreateNamedPipe``. 
diff --git a/Modules/_winapi.c b/Modules/_winapi.c index f6bb07fd8b06ef..5cc138188ffb9d 100644 --- a/Modules/_winapi.c +++ b/Modules/_winapi.c @@ -470,7 +470,7 @@ _winapi_CreateFile_impl(PyObject *module, LPCTSTR file_name, { HANDLE handle; - if (PySys_Audit("_winapi.CreateFile", "uIIII", + if (PySys_Audit("_winapi.CreateFile", "sIIII", file_name, desired_access, share_mode, creation_disposition, flags_and_attributes) < 0) { return INVALID_HANDLE_VALUE; @@ -690,7 +690,7 @@ _winapi_CreateNamedPipe_impl(PyObject *module, LPCTSTR name, DWORD open_mode, { HANDLE handle; - if (PySys_Audit("_winapi.CreateNamedPipe", "uII", + if (PySys_Audit("_winapi.CreateNamedPipe", "sII", name, open_mode, pipe_mode) < 0) { return INVALID_HANDLE_VALUE; } From d3f39cefe721db643957d48316ac98c5e279f0c3 Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Wed, 4 Sep 2024 19:26:29 +0200 Subject: [PATCH 6/7] [3.10] gh-112275: Fix HEAD_LOCK deadlock in child process after fork (GH-112336) (#123687) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HEAD_LOCK is called from _PyEval_ReInitThreads->_PyThreadState_DeleteExcept before _PyRuntimeState_ReInitThreads reinit runtime->interpreters.mutex which might be locked before fork. 
(cherry picked from commit 522799a05e3e820339718151ac055af6d864d463) Co-authored-by: ChuBoning <102216855+ChuBoning@users.noreply.github.com> Co-authored-by: Łukasz Langa --- .../2024-09-04-18-20-11.gh-issue-112275.W_iMiB.rst | 3 +++ Modules/posixmodule.c | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-09-04-18-20-11.gh-issue-112275.W_iMiB.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-09-04-18-20-11.gh-issue-112275.W_iMiB.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-04-18-20-11.gh-issue-112275.W_iMiB.rst new file mode 100644 index 00000000000000..d663be1867ed3d --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-04-18-20-11.gh-issue-112275.W_iMiB.rst @@ -0,0 +1,3 @@ +A deadlock involving ``pystate.c``'s ``HEAD_LOCK`` in ``posixmodule.c`` +at fork is now fixed. Patch by ChuBoning based on previous Python 3.12 +fix by Victor Stinner. diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index 43e69fc322b595..feffa43cfd0de4 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -598,6 +598,11 @@ PyOS_AfterFork_Child(void) goto fatal_error; } + status = _PyRuntimeState_ReInitThreads(runtime); + if (_PyStatus_EXCEPTION(status)) { + goto fatal_error; + } + PyThreadState *tstate = _PyThreadState_GET(); _Py_EnsureTstateNotNULL(tstate); @@ -613,11 +618,6 @@ PyOS_AfterFork_Child(void) _PySignal_AfterFork(); - status = _PyRuntimeState_ReInitThreads(runtime); - if (_PyStatus_EXCEPTION(status)) { - goto fatal_error; - } - status = _PyInterpreterState_DeleteExceptMain(runtime); if (_PyStatus_EXCEPTION(status)) { goto fatal_error; From 0aa1ee22ab6e204e9d3d0e9dd63ea648ed691ef1 Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Wed, 4 Sep 2024 16:41:39 -0400 Subject: [PATCH 7/7] [3.10] gh-123270: Replaced SanitizedNames with a more surgical fix.
(GH-123354) (#123426) Applies changes from zipp 3.20.1 and jaraco/zippGH-124 (cherry picked from commit 2231286d78d328c2f575e0b05b16fe447d1656d6) (cherry picked from commit 17b77bb) Co-authored-by: Jason R. Coombs --- Lib/test/test_zipfile.py | 76 +++++++++++++++++-- Lib/zipfile.py | 69 ++--------------- ...-08-26-13-45-20.gh-issue-123270.gXHvNJ.rst | 3 + 3 files changed, 81 insertions(+), 67 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-08-26-13-45-20.gh-issue-123270.gXHvNJ.rst diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index a60dc11688d20b..33e5dfc61c5e73 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -5,6 +5,7 @@ import itertools import os import pathlib +import platform import posixpath import string import struct @@ -3282,7 +3283,11 @@ def test_extract_orig_with_implied_dirs(self, alpharep): def test_malformed_paths(self): """ - Path should handle malformed paths. + Path should handle malformed paths gracefully. + + Paths with leading slashes are not visible. + + Paths with dots are treated like regular files. """ data = io.BytesIO() zf = zipfile.ZipFile(data, "w") @@ -3291,11 +3296,70 @@ def test_malformed_paths(self): zf.writestr("../parent.txt", b"content") zf.filename = '' root = zipfile.Path(zf) - assert list(map(str, root.iterdir())) == [ - 'one-slash.txt', - 'two-slash.txt', - 'parent.txt', - ] + assert list(map(str, root.iterdir())) == ['../'] + assert root.joinpath('..').joinpath('parent.txt').read_bytes() == b'content' + + @unittest.skipIf(platform.system() == "Windows", "GH-123693") + def test_unsupported_names(self): + """ + Path segments with special characters are readable. + + On some platforms or file systems, characters like + ``:`` and ``?`` are not allowed, but they are valid + in the zip file. 
+ """ + data = io.BytesIO() + zf = zipfile.ZipFile(data, "w") + zf.writestr("path?", b"content") + zf.writestr("V: NMS.flac", b"fLaC...") + zf.filename = '' + root = zipfile.Path(zf) + contents = root.iterdir() + assert next(contents).name == 'path?' + item = next(contents) + assert item.name == 'V: NMS.flac', item.name + assert root.joinpath('V: NMS.flac').read_bytes() == b"fLaC..." + + @unittest.skipIf(platform.system() == "Windows", "GH-123693") + def test_backslash_not_separator(self): + """ + In a zip file, backslashes are not separators. + """ + data = io.BytesIO() + zf = zipfile.ZipFile(data, "w") + zf.writestr(DirtyZipInfo.for_name("foo\\bar", zf), b"content") + zf.filename = '' + root = zipfile.Path(zf) + (first,) = root.iterdir() + assert not first.is_dir() + assert first.name == 'foo\\bar', first.name + + +class DirtyZipInfo(zipfile.ZipInfo): + """ + Bypass name sanitization. + """ + + def __init__(self, filename, *args, **kwargs): + super().__init__(filename, *args, **kwargs) + self.filename = filename + + @classmethod + def for_name(cls, name, archive): + """ + Construct the same way that ZipFile.writestr does. + + TODO: extract this functionality and re-use + """ + self = cls(filename=name, date_time=time.localtime(time.time())[:6]) + self.compress_type = archive.compression + self.compress_level = archive.compresslevel + if self.filename.endswith('/'): # pragma: no cover + self.external_attr = 0o40775 << 16 # drwxrwxr-x + self.external_attr |= 0x10 # MS-DOS directory flag + else: + self.external_attr = 0o600 << 16 # ?rw------- + return self class StripExtraTests(unittest.TestCase): diff --git a/Lib/zipfile.py b/Lib/zipfile.py index cbac8d9160e72b..9b66a9f054dc6b 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -2152,7 +2152,7 @@ def _parents(path): def _ancestry(path): """ Given a path with elements separated by - posixpath.sep, generate all elements of that path + posixpath.sep, generate all elements of that path. 
>>> list(_ancestry('b/d')) ['b/d', 'b'] @@ -2164,9 +2164,14 @@ def _ancestry(path): ['b'] >>> list(_ancestry('')) [] + + Multiple separators are treated like a single. + + >>> list(_ancestry('//b//d///f//')) + ['//b//d///f', '//b//d', '//b'] """ path = path.rstrip(posixpath.sep) - while path and path != posixpath.sep: + while path.rstrip(posixpath.sep): yield path path, tail = posixpath.split(path) @@ -2183,65 +2188,7 @@ def _difference(minuend, subtrahend): return itertools.filterfalse(set(subtrahend).__contains__, minuend) -class SanitizedNames: - """ - ZipFile mix-in to ensure names are sanitized. - """ - - def namelist(self): - return list(map(self._sanitize, super().namelist())) - - @staticmethod - def _sanitize(name): - r""" - Ensure a relative path with posix separators and no dot names. - Modeled after - https://github.com/python/cpython/blob/bcc1be39cb1d04ad9fc0bd1b9193d3972835a57c/Lib/zipfile/__init__.py#L1799-L1813 - but provides consistent cross-platform behavior. - >>> san = SanitizedNames._sanitize - >>> san('/foo/bar') - 'foo/bar' - >>> san('//foo.txt') - 'foo.txt' - >>> san('foo/.././bar.txt') - 'foo/bar.txt' - >>> san('foo../.bar.txt') - 'foo../.bar.txt' - >>> san('\\foo\\bar.txt') - 'foo/bar.txt' - >>> san('D:\\foo.txt') - 'D/foo.txt' - >>> san('\\\\server\\share\\file.txt') - 'server/share/file.txt' - >>> san('\\\\?\\GLOBALROOT\\Volume3') - '?/GLOBALROOT/Volume3' - >>> san('\\\\.\\PhysicalDrive1\\root') - 'PhysicalDrive1/root' - Retain any trailing slash. - >>> san('abc/') - 'abc/' - Raises a ValueError if the result is empty. - >>> san('../..') - Traceback (most recent call last): - ... - ValueError: Empty filename - """ - - def allowed(part): - return part and part not in {'..', '.'} - - # Remove the drive letter. 
- # Don't use ntpath.splitdrive, because that also strips UNC paths - bare = re.sub('^([A-Z]):', r'\1', name, flags=re.IGNORECASE) - clean = bare.replace('\\', '/') - parts = clean.split('/') - joined = '/'.join(filter(allowed, parts)) - if not joined: - raise ValueError("Empty filename") - return joined + '/' * name.endswith('/') - - -class CompleteDirs(SanitizedNames, ZipFile): +class CompleteDirs(ZipFile): """ A ZipFile subclass that ensures that implied directories are always included in the namelist. diff --git a/Misc/NEWS.d/next/Library/2024-08-26-13-45-20.gh-issue-123270.gXHvNJ.rst b/Misc/NEWS.d/next/Library/2024-08-26-13-45-20.gh-issue-123270.gXHvNJ.rst new file mode 100644 index 00000000000000..ee9fde6a9ed87a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-08-26-13-45-20.gh-issue-123270.gXHvNJ.rst @@ -0,0 +1,3 @@ +Applied a more surgical fix for malformed payloads in :class:`zipfile.Path` +causing infinite loops (gh-122905) without breaking contents using +legitimate characters.