Forward port some fixes that were in email 2.5 but for some reason didn't make
it into email 4.0. Specifically, in Message.get_content_charset(), handle RFC
2231 headers that contain an encoding not known to Python, or a character in
the data that isn't in the charset encoding. Also forward port the
appropriate unit tests.
diff --git a/Lib/email/message.py b/Lib/email/message.py
index 50d90b4..79c5c4c 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -747,7 +747,18 @@
if isinstance(charset, tuple):
# RFC 2231 encoded, so decode it, and it better end up as ascii.
pcharset = charset[0] or 'us-ascii'
- charset = unicode(charset[2], pcharset).encode('us-ascii')
+ try:
+ # LookupError will be raised if the charset isn't known to
+ # Python. UnicodeError will be raised if the encoded text
+ # contains a character not in the charset.
+ charset = unicode(charset[2], pcharset).encode('us-ascii')
+ except (LookupError, UnicodeError):
+ charset = charset[2]
+ # charset character must be in us-ascii range
+ try:
+ charset = unicode(charset, 'us-ascii').encode('us-ascii')
+ except UnicodeError:
+ return failobj
# RFC 2046, $4.1.2 says charsets are not case sensitive
return charset.lower()