Skip to content

Commit

Permalink
Merge branch '3.10' into gh-123693/platform-agnostic-name
Browse files Browse the repository at this point in the history
  • Loading branch information
jaraco authored Sep 4, 2024
2 parents 9f1dac0 + 0aa1ee2 commit d7d5cad
Show file tree
Hide file tree
Showing 24 changed files with 403 additions and 79 deletions.
6 changes: 6 additions & 0 deletions Doc/library/email.errors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ The following exception classes are defined in the :mod:`email.errors` module:
:class:`~email.mime.image.MIMEImage`).


.. exception:: HeaderWriteError()

Raised when an error occurs while the :mod:`~email.generator` outputs
headers.


Here is the list of the defects that the :class:`~email.parser.FeedParser`
can find while parsing messages. Note that the defects are added to the message
where the problem was found, so for example, if a message nested inside a
Expand Down
18 changes: 18 additions & 0 deletions Doc/library/email.policy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,24 @@ added matters. To illustrate::

.. versionadded:: 3.6


.. attribute:: verify_generated_headers

If ``True`` (the default), the generator will raise
:exc:`~email.errors.HeaderWriteError` instead of writing a header
that is improperly folded or delimited, such that it would
be parsed as multiple headers or joined with adjacent data.
Such headers can be generated by custom header classes or bugs
in the ``email`` module.

As it's a security feature, this defaults to ``True`` even in the
:class:`~email.policy.Compat32` policy.
For backwards compatible, but unsafe, behavior, it must be set to
``False`` explicitly.

.. versionadded:: 3.10.15


The following :class:`Policy` method is intended to be called by code using
the email library to create policy instances with custom settings:

Expand Down
12 changes: 12 additions & 0 deletions Doc/whatsnew/3.10.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2372,3 +2372,15 @@ ipaddress
* Fixed ``is_global`` and ``is_private`` behavior in ``IPv4Address``,
``IPv6Address``, ``IPv4Network`` and ``IPv6Network``.
email
-----
* Headers with embedded newlines are now quoted on output.
The :mod:`~email.generator` will now refuse to serialize (write) headers
that are improperly folded or delimited, such that they would be parsed as
multiple headers or joined with adjacent data.
If you need to turn this safety feature off,
set :attr:`~email.policy.Policy.verify_generated_headers` to ``False``.
(Contributed by Bas Bloemsaat and Petr Viktorin in :gh:`121650`.)
12 changes: 9 additions & 3 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
NLSET = {'\n', '\r'}
SPECIALSNL = SPECIALS | NLSET

def quote_string(value):
    """Return str(value) enclosed in double quotes.

    Backslashes and double quotes inside the value are escaped with a
    backslash.
    """
    # Double the backslashes first so that the backslashes added while
    # escaping '"' are not themselves escaped a second time.
    escaped = str(value).replace('\\', '\\\\').replace('"', r'\"')
    return '"' + escaped + '"'
Expand Down Expand Up @@ -2778,9 +2780,13 @@ def _refold_parse_tree(parse_tree, *, policy):
wrap_as_ew_blocked -= 1
continue
tstr = str(part)
if part.token_type == 'ptext' and set(tstr) & SPECIALS:
# Encode if tstr contains special characters.
want_encoding = True
if not want_encoding:
if part.token_type == 'ptext':
# Encode if tstr contains special characters.
want_encoding = not SPECIALSNL.isdisjoint(tstr)
else:
# Encode if tstr contains newlines.
want_encoding = not NLSET.isdisjoint(tstr)
try:
tstr.encode(encoding)
charset = encoding
Expand Down
8 changes: 8 additions & 0 deletions Lib/email/_policybase.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,13 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta):
message_factory -- the class to use to create new message objects.
If the value is None, the default is Message.
verify_generated_headers
-- if true, the generator verifies that each header
is properly folded, so that a parser won't
treat it as multiple headers, start-of-body, or
part of another header.
This is a check against custom Header & fold()
implementations.
"""

raise_on_defect = False
Expand All @@ -165,6 +172,7 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta):
max_line_length = 78
mangle_from_ = False
message_factory = None
verify_generated_headers = True

def handle_defect(self, obj, defect):
"""Based on policy, either raise defect or call register_defect.
Expand Down
4 changes: 4 additions & 0 deletions Lib/email/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class CharsetError(MessageError):
"""An illegal charset was given."""


class HeaderWriteError(MessageError):
    """Error while writing headers.

    Raised by the generator when a folded header fails verification.
    """


# These are parsing defects which the parser was able to work around.
class MessageDefect(ValueError):
"""Base class for a message defect."""
Expand Down
13 changes: 12 additions & 1 deletion Lib/email/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@
from copy import deepcopy
from io import StringIO, BytesIO
from email.utils import _has_surrogates
from email.errors import HeaderWriteError

UNDERSCORE = '_'
NL = '\n' # XXX: no longer used by the code below.

NLCRE = re.compile(r'\r\n|\r|\n')
fcre = re.compile(r'^From ', re.MULTILINE)
NEWLINE_WITHOUT_FWSP = re.compile(r'\r\n[^ \t]|\r[^ \n\t]|\n[^ \t]')



Expand Down Expand Up @@ -223,7 +225,16 @@ def _dispatch(self, msg):

def _write_headers(self, msg):
for h, v in msg.raw_items():
self.write(self.policy.fold(h, v))
folded = self.policy.fold(h, v)
if self.policy.verify_generated_headers:
linesep = self.policy.linesep
if not folded.endswith(self.policy.linesep):
raise HeaderWriteError(
f'folded header does not end with {linesep!r}: {folded!r}')
if NEWLINE_WITHOUT_FWSP.search(folded.removesuffix(linesep)):
raise HeaderWriteError(
f'folded header contains newline: {folded!r}')
self.write(folded)
# A blank line always separates headers from body
self.write(self._NL)

Expand Down
34 changes: 8 additions & 26 deletions Lib/http/cookies.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,13 @@ def _quote(str):
return '"' + str.translate(_Translator) + '"'


_OctalPatt = re.compile(r"\\[0-3][0-7][0-7]")
_QuotePatt = re.compile(r"[\\].")
_unquote_sub = re.compile(r'\\(?:([0-3][0-7][0-7])|(.))').sub

def _unquote_replace(m):
if m[1]:
return chr(int(m[1], 8))
else:
return m[2]

def _unquote(str):
# If there aren't any doublequotes,
Expand All @@ -205,30 +210,7 @@ def _unquote(str):
# \012 --> \n
# \" --> "
#
i = 0
n = len(str)
res = []
while 0 <= i < n:
o_match = _OctalPatt.search(str, i)
q_match = _QuotePatt.search(str, i)
if not o_match and not q_match: # Neither matched
res.append(str[i:])
break
# else:
j = k = -1
if o_match:
j = o_match.start(0)
if q_match:
k = q_match.start(0)
if q_match and (not o_match or k < j): # QuotePatt matched
res.append(str[i:k])
res.append(str[k+1])
i = k + 2
else: # OctalPatt matched
res.append(str[i:j])
res.append(chr(int(str[j+1:j+4], 8)))
i = j + 4
return _nulljoin(res)
return _unquote_sub(_unquote_replace, str)

# The _getdate() routine is used to set the expiration time in the cookie's HTTP
# header. By default, _getdate() returns the current time in the appropriate
Expand Down
105 changes: 67 additions & 38 deletions Lib/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,9 @@ def data_filter(member, dest_path):
# Sentinel for replace() defaults, meaning "don't change the attribute"
_KEEP = object()

# Header length is digits followed by a space.
_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")

class TarInfo(object):
"""Informational class which holds the details about an
archive member given by a tar header block.
Expand Down Expand Up @@ -1410,59 +1413,76 @@ def _proc_pax(self, tarfile):
else:
pax_headers = tarfile.pax_headers.copy()

# Check if the pax header contains a hdrcharset field. This tells us
# the encoding of the path, linkpath, uname and gname fields. Normally,
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
# implementations are allowed to store them as raw binary strings if
# the translation to UTF-8 fails.
match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
if match is not None:
pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

# For the time being, we don't care about anything other than "BINARY".
# The only other value that is currently allowed by the standard is
# "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
hdrcharset = pax_headers.get("hdrcharset")
if hdrcharset == "BINARY":
encoding = tarfile.encoding
else:
encoding = "utf-8"

# Parse pax header information. A record looks like this:
# "%d %s=%s\n" % (length, keyword, value). length is the size
# of the complete record including the length field itself and
# the newline. keyword and value are both UTF-8 encoded strings.
regex = re.compile(br"(\d+) ([^=]+)=")
# the newline.
pos = 0
while True:
match = regex.match(buf, pos)
if not match:
break
encoding = None
raw_headers = []
while len(buf) > pos and buf[pos] != 0x00:
if not (match := _header_length_prefix_re.match(buf, pos)):
raise InvalidHeaderError("invalid header")
try:
length = int(match.group(1))
except ValueError:
raise InvalidHeaderError("invalid header")
# Headers must be at least 5 bytes, shortest being '5 x=\n'.
# Value is allowed to be empty.
if length < 5:
raise InvalidHeaderError("invalid header")
if pos + length > len(buf):
raise InvalidHeaderError("invalid header")

length, keyword = match.groups()
length = int(length)
if length == 0:
header_value_end_offset = match.start(1) + length - 1 # Last byte of the header
keyword_and_value = buf[match.end(1) + 1:header_value_end_offset]
raw_keyword, equals, raw_value = keyword_and_value.partition(b"=")

# Check the framing of the header. The last character must be '\n' (0x0A)
if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A:
raise InvalidHeaderError("invalid header")
value = buf[match.end(2) + 1:match.start(1) + length - 1]
raw_headers.append((length, raw_keyword, raw_value))

# Check if the pax header contains a hdrcharset field. This tells us
# the encoding of the path, linkpath, uname and gname fields. Normally,
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
# implementations are allowed to store them as raw binary strings if
# the translation to UTF-8 fails. For the time being, we don't care about
# anything other than "BINARY". The only other value that is currently
# allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
# Note that we only follow the initial 'hdrcharset' setting to preserve
# the initial behavior of the 'tarfile' module.
if raw_keyword == b"hdrcharset" and encoding is None:
if raw_value == b"BINARY":
encoding = tarfile.encoding
else: # This branch ensures only the first 'hdrcharset' header is used.
encoding = "utf-8"

pos += length

# If no explicit hdrcharset is set, we use UTF-8 as a default.
if encoding is None:
encoding = "utf-8"

# After parsing the raw headers we can decode them to text.
for length, raw_keyword, raw_value in raw_headers:
# Normally, we could just use "utf-8" as the encoding and "strict"
# as the error handler, but we better not take the risk. For
# example, GNU tar <= 1.23 is known to store filenames it cannot
# translate to UTF-8 as raw strings (unfortunately without a
# hdrcharset=BINARY header).
# We first try the strict standard encoding, and if that fails we
# fall back on the user's encoding and error handler.
keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8",
tarfile.errors)
if keyword in PAX_NAME_FIELDS:
value = self._decode_pax_field(value, encoding, tarfile.encoding,
value = self._decode_pax_field(raw_value, encoding, tarfile.encoding,
tarfile.errors)
else:
value = self._decode_pax_field(value, "utf-8", "utf-8",
value = self._decode_pax_field(raw_value, "utf-8", "utf-8",
tarfile.errors)

pax_headers[keyword] = value
pos += length

# Fetch the next header.
try:
Expand All @@ -1477,7 +1497,7 @@ def _proc_pax(self, tarfile):

elif "GNU.sparse.size" in pax_headers:
# GNU extended sparse format version 0.0.
self._proc_gnusparse_00(next, pax_headers, buf)
self._proc_gnusparse_00(next, raw_headers)

elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
# GNU extended sparse format version 1.0.
Expand All @@ -1499,15 +1519,24 @@ def _proc_pax(self, tarfile):

return next

def _proc_gnusparse_00(self, next, pax_headers, buf):
def _proc_gnusparse_00(self, next, raw_headers):
"""Process a GNU tar extended sparse header, version 0.0.
"""
offsets = []
for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
offsets.append(int(match.group(1)))
numbytes = []
for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
numbytes.append(int(match.group(1)))
for _, keyword, value in raw_headers:
if keyword == b"GNU.sparse.offset":
try:
offsets.append(int(value.decode()))
except ValueError:
raise InvalidHeaderError("invalid header")

elif keyword == b"GNU.sparse.numbytes":
try:
numbytes.append(int(value.decode()))
except ValueError:
raise InvalidHeaderError("invalid header")

next.sparse = list(zip(offsets, numbytes))

def _proc_gnusparse_01(self, next, pax_headers):
Expand Down
Loading

0 comments on commit d7d5cad

Please sign in to comment.