email: Fix RFC 2047 header decoding with line folding

srinivasreddy · Dec 20, 2024 · a4e1f04 · a4e1f04
1 parent 39e69a7
commit a4e1f04
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 0 deletions.
diff --git a/Lib/email/header.py b/Lib/email/header.py
@@ -102,6 +102,8 @@ def decode_header(header):
     for n, w in enumerate(words):
         if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
             droplist.append(n-1)
+            if n < len(words):
+                words[n] = (words[n][0].lstrip(), words[n][1], words[n][2])
     for d in reversed(droplist):
         del words[d]
 

diff --git a/Lib/test/test_email/test_message.py b/Lib/test/test_email/test_message.py
@@ -1055,5 +1055,80 @@ def test_string_payload_with_multipart_content_type(self):
         self.assertEqual(list(attachments), [])
 
 
+class TestHeaderDecoding(unittest.TestCase):
+    def test_encoded_word_splitting(self):
+        # Round-trip a From header whose accented display name is meant to force line splitting when serialized
+        address = "Bérénice-Amélie Rosemonde Dûbois-Bénard <[email protected]>"
+        message = EmailMessage()
+        message["From"] = address
+        message_bytes = message.as_bytes()
+
+        # Parse back with the default policy; NOTE(review): confirm this path actually reaches email.header.decode_header
+        parsed = message_from_bytes(message_bytes, policy=policy.default)
+        self.assertEqual(str(parsed["From"].addresses[0]), address)
+        self.assertEqual(parsed["From"].addresses[0].display_name,
+                        "Bérénice-Amélie Rosemonde Dûbois-Bénard")
+
+    def test_multiple_encoded_words(self):
+        # From, To and Subject each carry non-ASCII text; all must survive serialize -> parse unchanged
+        headers = [
+            ("From", "André von Müller <[email protected]>"),
+            ("To", "José García López <[email protected]>"),
+            ("Subject", "Re: études à l'université"),
+        ]
+
+        message = EmailMessage()
+        for header, value in headers:
+            message[header] = value
+        message_bytes = message.as_bytes()
+
+        parsed = message_from_bytes(message_bytes, policy=policy.default)
+        for header, value in headers:
+            with self.subTest(header=header):
+                self.assertEqual(str(parsed[header]), value)
+
+    def test_long_encoded_words(self):
+        # Long display name with non-ASCII characters, intended to force multiple encoded-word splits
+        long_name = "Maximilian-Friedrich von Württemberg-Höchstadt III"
+        address = f"{long_name} <[email protected]>"
+
+        message = EmailMessage()
+        message["From"] = address
+        message_bytes = message.as_bytes()
+
+        parsed = message_from_bytes(message_bytes, policy=policy.default)
+        self.assertEqual(str(parsed["From"].addresses[0]), address)
+        self.assertEqual(parsed["From"].addresses[0].display_name, long_name)
+
+    def test_mixed_ascii_and_encoded(self):
+        # Display name mixing plain ASCII words with a parenthesised non-ASCII part
+        address = 'ACME Corp (アクメ) <[email protected]>'
+        message = EmailMessage()
+        message["From"] = address
+        message_bytes = message.as_bytes()
+
+        parsed = message_from_bytes(message_bytes, policy=policy.default)
+        self.assertEqual(str(parsed["From"].addresses[0]), address)
+        self.assertEqual(parsed["From"].addresses[0].display_name, 'ACME Corp (アクメ)')  # NOTE(review): RFC 5322 treats "(...)" as a comment; confirm it is kept in display_name
+
+    def test_whitespace_handling(self):
+        # Round-trip display names containing different kinds of whitespace between non-ASCII words
+        headers = [
+            ("From", "María  José <[email protected]>"),  # Two consecutive spaces
+            ("To", "André\tvon\tMüller <[email protected]>"),  # Tab characters as separators
+            ("Cc", "José\n García <[email protected]>"),  # Linefeed + space; NOTE(review): confirm the policy accepts a linefeed in an assigned header value
+        ]
+
+        message = EmailMessage()
+        for header, value in headers:
+            message[header] = value  # header values are assigned verbatim, including the whitespace variants above
+        message_bytes = message.as_bytes()
+
+        parsed = message_from_bytes(message_bytes, policy=policy.default)
+        for header, value in headers:
+            with self.subTest(header=header):
+                self.assertEqual(str(parsed[header]), value)
+
+
 if __name__ == '__main__':
     unittest.main()