#1079: Fix parsing of encoded words. This is a behavior change: previously, leading and trailing spaces were stripped from ASCII parts; now they are preserved. Without this fix we didn't parse the examples in the RFC correctly, so I think breaking backward compatibility here is justified. Patch by Ralf Schlatterbeck.

commit: 07ea53cb218812404cdbde820647ce6e4b2d0f8e [log] [tgz]
author: R David Murray <rdmurray@bitdance.com> Sat Jun 02 17:56:49 2012 -0400
committer: R David Murray <rdmurray@bitdance.com> Sat Jun 02 17:56:49 2012 -0400
tree: 153fbb31a5056379715475ed55a5c91a0fcbd8a9
parent: e11eb0f21b8107d7cf61efd37ff3555258577d51 [diff]
diff --git a/Lib/email/header.py b/Lib/email/header.py
index 3250d36..a89219d 100644
--- a/Lib/email/header.py
+++ b/Lib/email/header.py

@@ -40,7 +40,6 @@
   \?                    # literal ?
   (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
   \?=                   # literal ?=
-  (?=[ \t]|$)           # whitespace or the end of the string
   ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
 
 # Field name regexp, including trailing colon, but not separating whitespace,
@@ -86,8 +85,12 @@
     words = []
     for line in header.splitlines():
         parts = ecre.split(line)
+        first = True
         while parts:
-            unencoded = parts.pop(0).strip()
+            unencoded = parts.pop(0)
+            if first:
+                unencoded = unencoded.lstrip()
+                first = False
             if unencoded:
                 words.append((unencoded, None, None))
             if parts:
@@ -95,6 +98,16 @@
                 encoding = parts.pop(0).lower()
                 encoded = parts.pop(0)
                 words.append((encoded, encoding, charset))
+    # Now loop over words and remove words that consist of whitespace
+    # between two encoded strings.
+    import sys
+    droplist = []
+    for n, w in enumerate(words):
+        if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
+            droplist.append(n-1)
+    for d in reversed(droplist):
+        del words[d]
+
     # The next step is to decode each encoded word by applying the reverse
     # base64 or quopri transformation.  decoded_words is now a list of the
     # form (decoded_word, charset).
@@ -217,22 +230,27 @@
         self._normalize()
         uchunks = []
         lastcs = None
+        lastspace = None
         for string, charset in self._chunks:
             # We must preserve spaces between encoded and non-encoded word
             # boundaries, which means for us we need to add a space when we go
             # from a charset to None/us-ascii, or from None/us-ascii to a
             # charset.  Only do this for the second and subsequent chunks.
+            # Don't add a space if the None/us-ascii string already has
+            # a space (trailing or leading depending on transition)
             nextcs = charset
             if nextcs == _charset.UNKNOWN8BIT:
                 original_bytes = string.encode('ascii', 'surrogateescape')
                 string = original_bytes.decode('ascii', 'replace')
             if uchunks:
+                hasspace = string and self._nonctext(string[0])
                 if lastcs not in (None, 'us-ascii'):
-                    if nextcs in (None, 'us-ascii'):
+                    if nextcs in (None, 'us-ascii') and not hasspace:
                         uchunks.append(SPACE)
                         nextcs = None
-                elif nextcs not in (None, 'us-ascii'):
+                elif nextcs not in (None, 'us-ascii') and not lastspace:
                     uchunks.append(SPACE)
+            lastspace = string and self._nonctext(string[-1])
             lastcs = nextcs
             uchunks.append(string)
         return EMPTYSTRING.join(uchunks)
@@ -291,6 +309,11 @@
                 charset = UTF8
         self._chunks.append((s, charset))
 
+    def _nonctext(self, s):
+        """True if string s is not a ctext character of RFC822.
+        """
+        return s.isspace() or s in ('(', ')', '\\')
+
     def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
         r"""Encode a message header into an RFC-compliant format.
 
@@ -334,7 +357,20 @@
             maxlinelen = 1000000
         formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                     self._continuation_ws, splitchars)
+        lastcs = None
+        hasspace = lastspace = None
         for string, charset in self._chunks:
+            if hasspace is not None:
+                hasspace = string and self._nonctext(string[0])
+                import sys
+                if lastcs not in (None, 'us-ascii'):
+                    if not hasspace or charset not in (None, 'us-ascii'):
+                        formatter.add_transition()
+                elif charset not in (None, 'us-ascii') and not lastspace:
+                    formatter.add_transition()
+            lastspace = string and self._nonctext(string[-1])
+            lastcs = charset
+            hasspace = False
             lines = string.splitlines()
             if lines:
                 formatter.feed('', lines[0], charset)
@@ -351,6 +387,7 @@
                     formatter.feed(fws, sline, charset)
             if len(lines) > 1:
                 formatter.newline()
+        if self._chunks:
             formatter.add_transition()
         value = formatter._str(linesep)
         if _embeded_header.search(value):
commit	07ea53cb218812404cdbde820647ce6e4b2d0f8e	[log] [tgz]
author	R David Murray <rdmurray@bitdance.com>	Sat Jun 02 17:56:49 2012 -0400
committer	R David Murray <rdmurray@bitdance.com>	Sat Jun 02 17:56:49 2012 -0400
tree	153fbb31a5056379715475ed55a5c91a0fcbd8a9
parent	e11eb0f21b8107d7cf61efd37ff3555258577d51 [diff]