bpo-27240 Rewrite the email header folding algorithm. (#3488) The original algorithm tried to delegate the folding to the tokens so that those tokens whose folding rules differed could specify the differences. However, this resulted in a lot of duplicated code because most of the rules were the same. The new algorithm moves all folding logic into a set of functions external to the token classes, but puts the information about which tokens can be folded in which ways on the tokens...with the exception of mime-parameters, which are a special case (which was not even implemented in the old folder). This algorithm can still probably be improved and hopefully simplified somewhat. Note that some of the test expectations are changed. I believe the changes are toward more desirable and consistent behavior: in general when (re) folding a line the canonical version of the tokens is generated, rather than preserving errors or extra whitespace.

commit: 85d5c18c9d83a1d54eecc4c2ad4dce63194107c6 [log] [tgz]
author: R. David Murray <rdmurray@bitdance.com> Sun Dec 03 18:51:41 2017 -0500
committer: GitHub <noreply@github.com> Sun Dec 03 18:51:41 2017 -0500
tree: 39edcca989e60b95a64e317440057ee45aea8fc0
parent: 29ba688034fc4eef0693b86002cf7bee55d692af [diff] [blame]
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index e0ec87d..1667617 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py

@@ -14,18 +14,7 @@
         self.assertEqual(x, ' \t')
         self.assertEqual(str(x), '')
         self.assertEqual(x.value, '')
-        self.assertEqual(x.encoded, ' \t')
-
-    # UnstructuredTokenList
-
-    def test_undecodable_bytes_error_preserved(self):
-        badstr = b"le pouf c\xaflebre".decode('ascii', 'surrogateescape')
-        unst = parser.get_unstructured(badstr)
-        self.assertDefectsEqual(unst.all_defects, [errors.UndecodableBytesDefect])
-        parts = list(unst.parts)
-        self.assertDefectsEqual(parts[0].all_defects, [])
-        self.assertDefectsEqual(parts[1].all_defects, [])
-        self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect])
+        self.assertEqual(x.token_type, 'fws')
 
 
 class TestParserMixin:
@@ -139,7 +128,6 @@
                          'first second',
                          [],
                          '')
-        self.assertEqual(ew.encoded, '=?us-ascii*jive?q?first_second?=')
         self.assertEqual(ew.charset, 'us-ascii')
         self.assertEqual(ew.lang, 'jive')
 
@@ -150,7 +138,6 @@
                          'first second',
                          [],
                          '')
-        self.assertEqual(ew.encoded, '=?us-ascii?q?first_second?=')
         self.assertEqual(ew.charset, 'us-ascii')
         self.assertEqual(ew.lang, '')
 
@@ -2700,28 +2687,37 @@
     # and with unicode tokens in the comments.  Spaces inside the quotes
     # currently don't do the right thing.
 
-    def test_initial_whitespace_splitting(self):
+    def test_split_at_whitespace_after_header_before_long_token(self):
         body = parser.get_unstructured('   ' + 'x'*77)
         header = parser.Header([
             parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]),
             parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body])
         self._test(header, 'test:   \n ' + 'x'*77 + '\n')
 
-    def test_whitespace_splitting(self):
+    def test_split_at_whitespace_before_long_token(self):
         self._test(parser.get_unstructured('xxx   ' + 'y'*77),
                    'xxx  \n ' + 'y'*77 + '\n')
 
+    def test_overlong_encodeable_is_wrapped(self):
+        first_token_with_whitespace = 'xxx   '
+        chrome_leader = '=?utf-8?q?'
+        len_chrome = len(chrome_leader) + 2
+        len_non_y = len_chrome + len(first_token_with_whitespace)
+        self._test(parser.get_unstructured(first_token_with_whitespace +
+                                           'y'*80),
+                   first_token_with_whitespace + chrome_leader +
+                       'y'*(78-len_non_y) + '?=\n' +
+                       ' ' + chrome_leader + 'y'*(80-(78-len_non_y)) + '?=\n')
+
     def test_long_filename_attachment(self):
-        folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"')
-        self.assertEqual(
-            'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"\n',
-            folded
-        )
-        folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"')
-        self.assertEqual(
-            'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"\n',
-            folded
-        )
+        self._test(parser.parse_content_disposition_header(
+            'attachment; filename="TEST_TEST_TEST_TEST'
+                '_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"'),
+            "attachment;\n"
+            " filename*0*=us-ascii''TEST_TEST_TEST_TEST_TEST_TEST"
+                "_TEST_TEST_TEST_TEST_TEST;\n"
+            " filename*1*=_TEST_TES.txt\n",
+            )
 
 if __name__ == '__main__':
     unittest.main()
commit	85d5c18c9d83a1d54eecc4c2ad4dce63194107c6	[log] [tgz]
author	R. David Murray <rdmurray@bitdance.com>	Sun Dec 03 18:51:41 2017 -0500
committer	GitHub <noreply@github.com>	Sun Dec 03 18:51:41 2017 -0500
tree	39edcca989e60b95a64e317440057ee45aea8fc0
parent	29ba688034fc4eef0693b86002cf7bee55d692af [diff] [blame]