#10686: recode non-ASCII headers to 'unknown-8bit' instead of ?s.
This applies only when generating strings from non-RFC-compliant binary
input; it makes the existing recoding behavior more consistent (i.e.
no data is now lost when recoding).
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index 898beed..8591527 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -28,6 +28,7 @@
RFC2047_CHROME_LEN = 7
DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''
@@ -153,6 +154,16 @@
+# Convenience function for encoding strings, taking into account
+# that they might be unknown-8bit (i.e. have surrogate-escaped bytes)
+def _encode(string, codec):
+ if codec == UNKNOWN8BIT:
+ return string.encode('ascii', 'surrogateescape')
+ else:
+ return string.encode(codec)
+
+
+
class Charset:
"""Map character sets to their email properties.
@@ -282,8 +293,7 @@
:return: The encoded string, with RFC 2047 chrome.
"""
codec = self.output_codec or 'us-ascii'
- charset = self.get_output_charset()
- header_bytes = string.encode(codec)
+ header_bytes = _encode(string, codec)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
encoder_module = self._get_encoder(header_bytes)
if encoder_module is None:
@@ -309,7 +319,7 @@
"""
# See which encoding we should use.
codec = self.output_codec or 'us-ascii'
- header_bytes = string.encode(codec)
+ header_bytes = _encode(string, codec)
encoder_module = self._get_encoder(header_bytes)
encoder = partial(encoder_module.header_encode, charset=str(self))
# Calculate the number of characters that the RFC 2047 chrome will
@@ -333,7 +343,7 @@
for character in string:
current_line.append(character)
this_line = EMPTYSTRING.join(current_line)
- length = encoder_module.header_length(this_line.encode(charset))
+ length = encoder_module.header_length(_encode(this_line, charset))
if length > maxlen:
# This last character doesn't fit so pop it off.
current_line.pop()
@@ -343,12 +353,12 @@
else:
separator = (' ' if lines else '')
joined_line = EMPTYSTRING.join(current_line)
- header_bytes = joined_line.encode(codec)
+ header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
current_line = [character]
maxlen = next(maxlengths) - extra
joined_line = EMPTYSTRING.join(current_line)
- header_bytes = joined_line.encode(codec)
+ header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
return lines