#4661: add bytes parsing and generation to email (email version bump to 5.1.0). The work on this is not 100% complete, but everything is present to allow real-world testing of the code. The only remaining major todo item is to (hopefully!) enhance the handling of non-ASCII bytes in headers that are converted to unicode, by RFC 2047 encoding those bytes rather than replacing them with '?'s.

commit: 96fd54eaec700cc50e5960f45ee79bc25c2c48c5 [log] [tgz]
author: R. David Murray <rdmurray@bitdance.com> Fri Oct 08 15:55:28 2010 +0000
committer: R. David Murray <rdmurray@bitdance.com> Fri Oct 08 15:55:28 2010 +0000
tree: 4e4fc3f48d8957b6b0fccc372410e8374ce4fb70
parent: 59fdd6736bbf1ba14083a4bb777abaefc364f876 [diff] [blame]
diff --git a/Lib/email/message.py b/Lib/email/message.py
index 923b26c..a835ce2 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py

@@ -24,8 +24,26 @@
 # existence of which force quoting of the parameter value.
 tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')
 
+# How to figure out if we are processing strings that come from a byte
+# source with undecodable characters.
+_has_surrogates = re.compile(
+    '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+
 
 # Helper functions
+def _sanitize_surrogates(value):
+    # If the value contains surrogates, re-decode and replace the original
+    # non-ascii bytes with '?'s.  Used to sanitize header values before letting
+    # them escape as strings.
+    if not isinstance(value, str):
+        # Header object
+        return value
+    if _has_surrogates(value):
+        original_bytes = value.encode('ascii', 'surrogateescape')
+        return original_bytes.decode('ascii', 'replace').replace('\ufffd', '?')
+    else:
+        return value
+
 def _splitparam(param):
     # Split header parameters.  BAW: this may be too simple.  It isn't
     # strictly RFC 2045 (section 5.1) compliant, but it catches most headers
@@ -184,44 +202,72 @@
         If the message is a multipart and the decode flag is True, then None
         is returned.
         """
-        if i is None:
-            payload = self._payload
-        elif not isinstance(self._payload, list):
+        # Here is the logic table for this code, based on the email5.0.0 code:
+        #   i     decode  is_multipart  result
+        # ------  ------  ------------  ------------------------------
+        #  None   True    True          None
+        #   i     True    True          None
+        #  None   False   True          _payload (a list)
+        #   i     False   True          _payload element i (a Message)
+        #   i     False   False         error (not a list)
+        #   i     True    False         error (not a list)
+        #  None   False   False         _payload
+        #  None   True    False         _payload decoded (bytes)
+        # Note that Barry planned to factor out the 'decode' case, but that
+        # isn't so easy now that we handle the 8 bit data, which needs to be
+        # converted in both the decode and non-decode path.
+        if self.is_multipart():
+            if decode:
+                return None
+            if i is None:
+                return self._payload
+            else:
+                return self._payload[i]
+        # For backward compatibility, Use isinstance and this error message
+        # instead of the more logical is_multipart test.
+        if i is not None and not isinstance(self._payload, list):
             raise TypeError('Expected list, got %s' % type(self._payload))
-        else:
-            payload = self._payload[i]
+        payload = self._payload
+        cte = self.get('content-transfer-encoding', '').lower()
+        # payload can be bytes here, (I wonder if that is actually a bug?)
+        if isinstance(payload, str):
+            if _has_surrogates(payload):
+                bpayload = payload.encode('ascii', 'surrogateescape')
+                if not decode:
+                    try:
+                        payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
+                    except LookupError:
+                        payload = bpayload.decode('ascii', 'replace')
+            elif decode:
+                try:
+                    bpayload = payload.encode('ascii')
+                except UnicodeError:
+                    # This won't happen for RFC compliant messages (messages
+                    # containing only ASCII codepoints in the unicode input).
+                    # If it does happen, turn the string into bytes in a way
+                    # guaranteed not to fail.
+                    bpayload = payload.encode('raw-unicode-escape')
         if not decode:
             return payload
-        # Decoded payloads always return bytes.  XXX split this part out into
-        # a new method called .get_decoded_payload().
-        if self.is_multipart():
-            return None
-        cte = self.get('content-transfer-encoding', '').lower()
         if cte == 'quoted-printable':
-            if isinstance(payload, str):
-                payload = payload.encode('ascii')
-            return utils._qdecode(payload)
+            return utils._qdecode(bpayload)
         elif cte == 'base64':
             try:
-                if isinstance(payload, str):
-                    payload = payload.encode('ascii')
-                return base64.b64decode(payload)
+                return base64.b64decode(bpayload)
             except binascii.Error:
                 # Incorrect padding
-                pass
+                return bpayload
         elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
-            in_file = BytesIO(payload.encode('ascii'))
+            in_file = BytesIO(bpayload)
             out_file = BytesIO()
             try:
                 uu.decode(in_file, out_file, quiet=True)
                 return out_file.getvalue()
             except uu.Error:
                 # Some decoding problem
-                pass
-        # Is there a better way to do this?  We can't use the bytes
-        # constructor.
+                return bpayload
         if isinstance(payload, str):
-            return payload.encode('raw-unicode-escape')
+            return bpayload
         return payload
 
     def set_payload(self, payload, charset=None):
@@ -340,7 +386,7 @@
         Any fields deleted and re-inserted are always appended to the header
         list.
         """
-        return [v for k, v in self._headers]
+        return [_sanitize_surrogates(v) for k, v in self._headers]
 
     def items(self):
         """Get all the message's header fields and values.
@@ -350,7 +396,7 @@
         Any fields deleted and re-inserted are always appended to the header
         list.
         """
-        return self._headers[:]
+        return [(k, _sanitize_surrogates(v)) for k, v in self._headers]
 
     def get(self, name, failobj=None):
         """Get a header value.
@@ -361,7 +407,7 @@
         name = name.lower()
         for k, v in self._headers:
             if k.lower() == name:
-                return v
+                return _sanitize_surrogates(v)
         return failobj
 
     #
@@ -381,7 +427,7 @@
         name = name.lower()
         for k, v in self._headers:
             if k.lower() == name:
-                values.append(v)
+                values.append(_sanitize_surrogates(v))
         if not values:
             return failobj
         return values
commit	96fd54eaec700cc50e5960f45ee79bc25c2c48c5	[log] [tgz]
author	R. David Murray <rdmurray@bitdance.com>	Fri Oct 08 15:55:28 2010 +0000
committer	R. David Murray <rdmurray@bitdance.com>	Fri Oct 08 15:55:28 2010 +0000
tree	4e4fc3f48d8957b6b0fccc372410e8374ce4fb70
parent	59fdd6736bbf1ba14083a4bb777abaefc364f876 [diff] [blame]