#10686: recode non-ASCII headers to 'unknown-8bit' instead of ?s. This applies only when generating strings from non-RFC-compliant binary input; it makes the existing recoding behavior more consistent (i.e., no data is lost when recoding).

commit: 9253214fd9fe22b8b2b4ca5bb28952df8cab3e8c [log] [tgz]
author: R. David Murray <rdmurray@bitdance.com> Fri Jan 07 23:25:30 2011 +0000
committer: R. David Murray <rdmurray@bitdance.com> Fri Jan 07 23:25:30 2011 +0000
tree: 30d925a75c0b3bd542c00d6dbd667e72178056a7
parent: 6f0022d84af15d51ffa1606991f2b6e9e56448ed [diff] [blame]
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 898beed..8591527 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py

@@ -28,6 +28,7 @@
 RFC2047_CHROME_LEN = 7
 
 DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
 EMPTYSTRING = ''
 
 
@@ -153,6 +154,16 @@
 
 
 
+# Convenience function for encoding strings, taking into account
+# that they might be unknown-8bit (ie: have surrogate-escaped bytes)
+def _encode(string, codec):
+    if codec == UNKNOWN8BIT:
+        return string.encode('ascii', 'surrogateescape')
+    else:
+        return string.encode(codec)
+
+
+
 class Charset:
     """Map character sets to their email properties.
 
@@ -282,8 +293,7 @@
         :return: The encoded string, with RFC 2047 chrome.
         """
         codec = self.output_codec or 'us-ascii'
-        charset = self.get_output_charset()
-        header_bytes = string.encode(codec)
+        header_bytes = _encode(string, codec)
         # 7bit/8bit encodings return the string unchanged (modulo conversions)
         encoder_module = self._get_encoder(header_bytes)
         if encoder_module is None:
@@ -309,7 +319,7 @@
         """
         # See which encoding we should use.
         codec = self.output_codec or 'us-ascii'
-        header_bytes = string.encode(codec)
+        header_bytes = _encode(string, codec)
         encoder_module = self._get_encoder(header_bytes)
         encoder = partial(encoder_module.header_encode, charset=str(self))
         # Calculate the number of characters that the RFC 2047 chrome will
@@ -333,7 +343,7 @@
         for character in string:
             current_line.append(character)
             this_line = EMPTYSTRING.join(current_line)
-            length = encoder_module.header_length(this_line.encode(charset))
+            length = encoder_module.header_length(_encode(this_line, charset))
             if length > maxlen:
                 # This last character doesn't fit so pop it off.
                 current_line.pop()
@@ -343,12 +353,12 @@
                 else:
                     separator = (' ' if lines else '')
                     joined_line = EMPTYSTRING.join(current_line)
-                    header_bytes = joined_line.encode(codec)
+                    header_bytes = _encode(joined_line, codec)
                     lines.append(encoder(header_bytes))
                 current_line = [character]
                 maxlen = next(maxlengths) - extra
         joined_line = EMPTYSTRING.join(current_line)
-        header_bytes = joined_line.encode(codec)
+        header_bytes = _encode(joined_line, codec)
         lines.append(encoder(header_bytes))
         return lines
commit	9253214fd9fe22b8b2b4ca5bb28952df8cab3e8c	[log] [tgz]
author	R. David Murray <rdmurray@bitdance.com>	Fri Jan 07 23:25:30 2011 +0000
committer	R. David Murray <rdmurray@bitdance.com>	Fri Jan 07 23:25:30 2011 +0000
tree	30d925a75c0b3bd542c00d6dbd667e72178056a7
parent	6f0022d84af15d51ffa1606991f2b6e9e56448ed [diff] [blame]