Skip to content

Commit

Permalink
gh-52551: Fix encoding issues in strftime() (GH-125193)
Browse files Browse the repository at this point in the history
Fix time.strftime(), the strftime() method and formatting of the
datetime classes datetime, date and time.

* Characters not encodable in the current locale are now acceptable in
  the format string.
* Surrogate pairs and sequences of surrogateescape-encoded bytes are no
  longer recombined.
* An embedded null character no longer terminates the format string.

This also fixes gh-78662 and gh-124531.
  • Loading branch information
serhiy-storchaka authored Oct 17, 2024
1 parent 0cb20f2 commit ad3eac1
Show file tree
Hide file tree
Showing 5 changed files with 307 additions and 232 deletions.
63 changes: 55 additions & 8 deletions Lib/test/datetimetester.py
Original file line number Diff line number Diff line change
Expand Up @@ -2949,11 +2949,32 @@ def test_more_strftime(self):
self.assertEqual(t.strftime("%z"), "-0200" + z)
self.assertEqual(t.strftime("%:z"), "-02:00:" + z)

# bpo-34482: Check that surrogates don't cause a crash.
try:
t.strftime('%y\ud800%m %H\ud800%M')
except UnicodeEncodeError:
pass
def test_strftime_special(self):
    # Formats containing non-BMP characters, surrogates or null characters
    # must come back from strftime() unchanged around the expanded
    # directives.  The directive expansions are computed once up front and
    # spliced into the expected strings.
    value = self.theclass(2004, 12, 31, 6, 22, 33, 47)
    stamp = value.strftime('%c')
    month = value.strftime('%B')
    cases = [
        # gh-52551, gh-78662: Unicode strings should pass through strftime,
        # independently from locale.
        ('\U0001f40d', '\U0001f40d'),
        ('\U0001f4bb%c\U0001f40d%B', f'\U0001f4bb{stamp}\U0001f40d{month}'),
        ('%c\U0001f4bb%B\U0001f40d', f'{stamp}\U0001f4bb{month}\U0001f40d'),
        # Lone surrogates should pass through.
        ('\ud83d', '\ud83d'),
        ('\udc0d', '\udc0d'),
        ('\ud83d%c\udc0d%B', f'\ud83d{stamp}\udc0d{month}'),
        ('%c\ud83d%B\udc0d', f'{stamp}\ud83d{month}\udc0d'),
        ('%c\udc0d%B\ud83d', f'{stamp}\udc0d{month}\ud83d'),
        # Surrogate pairs should not recombine.
        ('\ud83d\udc0d', '\ud83d\udc0d'),
        ('%c\ud83d\udc0d%B', f'{stamp}\ud83d\udc0d{month}'),
        # Surrogate-escaped bytes should not recombine.
        ('\udcf0\udc9f\udc90\udc8d', '\udcf0\udc9f\udc90\udc8d'),
        ('%c\udcf0\udc9f\udc90\udc8d%B',
         f'{stamp}\udcf0\udc9f\udc90\udc8d{month}'),
        # gh-124531: The null character should not terminate the format
        # string.
        ('\0', '\0'),
        ('\0'*1000, '\0'*1000),
        ('\0%c\0%B', f'\0{stamp}\0{month}'),
        ('%c\0%B\0', f'{stamp}\0{month}\0'),
    ]
    # Checked in order; the first mismatch aborts the test.
    for fmt, expected in cases:
        self.assertEqual(value.strftime(fmt), expected)

def test_extract(self):
dt = self.theclass(2002, 3, 4, 18, 45, 3, 1234)
Expand Down Expand Up @@ -3736,6 +3757,33 @@ def test_strftime(self):
# gh-85432: The parameter was named "fmt" in the pure-Python impl.
t.strftime(format="%f")

def test_strftime_special(self):
    # Formats containing non-BMP characters, surrogates or null characters
    # must come back from strftime() unchanged around the expanded
    # directives.  The directive expansions are computed once up front and
    # spliced into the expected strings.
    value = self.theclass(1, 2, 3, 4)
    hour_zone = value.strftime('%I%p%Z')
    clock = value.strftime('%X')
    cases = [
        # gh-52551, gh-78662: Unicode strings should pass through strftime,
        # independently from locale.
        ('\U0001f40d', '\U0001f40d'),
        ('\U0001f4bb%I%p%Z\U0001f40d%X',
         f'\U0001f4bb{hour_zone}\U0001f40d{clock}'),
        ('%I%p%Z\U0001f4bb%X\U0001f40d',
         f'{hour_zone}\U0001f4bb{clock}\U0001f40d'),
        # Lone surrogates should pass through.
        ('\ud83d', '\ud83d'),
        ('\udc0d', '\udc0d'),
        ('\ud83d%I%p%Z\udc0d%X', f'\ud83d{hour_zone}\udc0d{clock}'),
        ('%I%p%Z\ud83d%X\udc0d', f'{hour_zone}\ud83d{clock}\udc0d'),
        ('%I%p%Z\udc0d%X\ud83d', f'{hour_zone}\udc0d{clock}\ud83d'),
        # Surrogate pairs should not recombine.
        ('\ud83d\udc0d', '\ud83d\udc0d'),
        ('%I%p%Z\ud83d\udc0d%X', f'{hour_zone}\ud83d\udc0d{clock}'),
        # Surrogate-escaped bytes should not recombine.
        ('\udcf0\udc9f\udc90\udc8d', '\udcf0\udc9f\udc90\udc8d'),
        ('%I%p%Z\udcf0\udc9f\udc90\udc8d%X',
         f'{hour_zone}\udcf0\udc9f\udc90\udc8d{clock}'),
        # gh-124531: The null character should not terminate the format
        # string.
        ('\0', '\0'),
        ('\0'*1000, '\0'*1000),
        ('\0%I%p%Z\0%X', f'\0{hour_zone}\0{clock}'),
        ('%I%p%Z\0%X\0', f'{hour_zone}\0{clock}\0'),
    ]
    # Checked in order; the first mismatch aborts the test.
    for fmt, expected in cases:
        self.assertEqual(value.strftime(fmt), expected)

def test_format(self):
t = self.theclass(1, 2, 3, 4)
self.assertEqual(t.__format__(''), str(t))
Expand Down Expand Up @@ -4259,9 +4307,8 @@ def tzname(self, dt): return self.tz
self.assertRaises(TypeError, t.strftime, "%Z")

# Issue #6697:
if '_Fast' in self.__class__.__name__:
Badtzname.tz = '\ud800'
self.assertRaises(ValueError, t.strftime, "%Z")
Badtzname.tz = '\ud800'
self.assertEqual(t.strftime("%Z"), '\ud800')

def test_hash_edge_cases(self):
# Offsets that overflow a basic time.
Expand Down
29 changes: 27 additions & 2 deletions Lib/test/test_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,33 @@ def test_strftime(self):
self.fail('conversion specifier: %r failed.' % format)

self.assertRaises(TypeError, time.strftime, b'%S', tt)
# embedded null character
self.assertRaises(ValueError, time.strftime, '%S\0', tt)

def test_strftime_special(self):
    # Formats containing non-BMP characters, surrogates or null characters
    # must come back from time.strftime() unchanged around the expanded
    # directives.  The directive expansions are computed once up front and
    # spliced into the expected strings.
    when = time.gmtime(self.t)
    stamp = time.strftime('%c', when)
    month = time.strftime('%B', when)
    cases = [
        # gh-52551, gh-78662: Unicode strings should pass through strftime,
        # independently from locale.
        ('\U0001f40d', '\U0001f40d'),
        ('\U0001f4bb%c\U0001f40d%B', f'\U0001f4bb{stamp}\U0001f40d{month}'),
        ('%c\U0001f4bb%B\U0001f40d', f'{stamp}\U0001f4bb{month}\U0001f40d'),
        # Lone surrogates should pass through.
        ('\ud83d', '\ud83d'),
        ('\udc0d', '\udc0d'),
        ('\ud83d%c\udc0d%B', f'\ud83d{stamp}\udc0d{month}'),
        ('%c\ud83d%B\udc0d', f'{stamp}\ud83d{month}\udc0d'),
        ('%c\udc0d%B\ud83d', f'{stamp}\udc0d{month}\ud83d'),
        # Surrogate pairs should not recombine.
        ('\ud83d\udc0d', '\ud83d\udc0d'),
        ('%c\ud83d\udc0d%B', f'{stamp}\ud83d\udc0d{month}'),
        # Surrogate-escaped bytes should not recombine.
        ('\udcf0\udc9f\udc90\udc8d', '\udcf0\udc9f\udc90\udc8d'),
        ('%c\udcf0\udc9f\udc90\udc8d%B',
         f'{stamp}\udcf0\udc9f\udc90\udc8d{month}'),
        # gh-124531: The null character should not terminate the format
        # string.
        ('\0', '\0'),
        ('\0'*1000, '\0'*1000),
        ('\0%c\0%B', f'\0{stamp}\0{month}'),
        ('%c\0%B\0', f'{stamp}\0{month}\0'),
    ]
    # Checked in order; the first mismatch aborts the test.
    for fmt, expected in cases:
        self.assertEqual(time.strftime(fmt, when), expected)

def _bounds_checking(self, func):
# Make sure that strftime() checks the bounds of the various parts
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Fix encoding issues in :func:`time.strftime`, the
:meth:`~datetime.datetime.strftime` method of the :mod:`datetime` classes
:class:`~datetime.datetime`, :class:`~datetime.date` and
:class:`~datetime.time` and formatting of these classes. Characters not
encodable in the current locale are now acceptable in the format string.
Surrogate pairs and sequences of surrogateescape-encoded bytes are no longer
recombined. An embedded null character no longer terminates the format
string.
Loading

0 comments on commit ad3eac1

Please sign in to comment.