Issue #19619: Blacklist non-text codecs in method API str.encode, bytes.decode and bytearray.decode now use an internal API to throw LookupError for known non-text encodings, rather than attempting the encoding or decoding operation and then throwing a TypeError for an unexpected output type. The latter mechanism remains in place for third party non-text encodings. Backported changeset d68df99d7a57.

commit: 94ee389308ec9e0e07b3f7a944d5179aba540c5e [log] [tgz]
author: Serhiy Storchaka <storchaka@gmail.com> Mon Feb 24 14:43:03 2014 +0200
committer: Serhiy Storchaka <storchaka@gmail.com> Mon Feb 24 14:43:03 2014 +0200
tree: 80bc231aff27723119beacbcfa2654b90f793060
parent: 20f8728bf0cce877c1908b15ddc59e2d1011ad0f [diff]
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 01ae0f3..c2065da 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py

@@ -73,9 +73,19 @@
 ### Codec base classes (defining the API)
 
 class CodecInfo(tuple):
+    """Codec details when looking up the codec registry"""
+
+    # Private API to allow Python 3.4 to blacklist the known non-Unicode
+    # codecs in the standard library. A more general mechanism to
+    # reliably distinguish test encodings from other codecs will hopefully
+    # be defined for Python 3.5
+    #
+    # See http://bugs.python.org/issue19619
+    _is_text_encoding = True # Assume codecs are text encodings by default
 
     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
-        incrementalencoder=None, incrementaldecoder=None, name=None):
+        incrementalencoder=None, incrementaldecoder=None, name=None,
+        *, _is_text_encoding=None):
         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
         self.name = name
         self.encode = encode
@@ -84,6 +94,8 @@
         self.incrementaldecoder = incrementaldecoder
         self.streamwriter = streamwriter
         self.streamreader = streamreader
+        if _is_text_encoding is not None:
+            self._is_text_encoding = _is_text_encoding
         return self
 
     def __repr__(self):

diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py
index 321a961..881d1ba 100644
--- a/Lib/encodings/base64_codec.py
+++ b/Lib/encodings/base64_codec.py

@@ -52,4 +52,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py
index e65d226..fd9495e 100644
--- a/Lib/encodings/bz2_codec.py
+++ b/Lib/encodings/bz2_codec.py

@@ -74,4 +74,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py
index e003fc3..f2ed0a7 100644
--- a/Lib/encodings/hex_codec.py
+++ b/Lib/encodings/hex_codec.py

@@ -52,4 +52,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py
index 9243fc4..70f7083 100644
--- a/Lib/encodings/quopri_codec.py
+++ b/Lib/encodings/quopri_codec.py

@@ -53,4 +53,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py
index 3140c14..fff9153 100755
--- a/Lib/encodings/rot_13.py
+++ b/Lib/encodings/rot_13.py

@@ -43,6 +43,7 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
 
 ### Map

diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py
index 69c6f17..e3269e4 100644
--- a/Lib/encodings/uu_codec.py
+++ b/Lib/encodings/uu_codec.py

@@ -96,4 +96,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py
index e0b9cda..4c81ca1 100644
--- a/Lib/encodings/zlib_codec.py
+++ b/Lib/encodings/zlib_codec.py

@@ -74,4 +74,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_text_encoding=False,
     )

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 1a199f7..a8b3da0 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py

@@ -4,6 +4,7 @@
 import sys
 import unittest
 import warnings
+import encodings
 
 from test import support
 
@@ -2408,6 +2409,47 @@
             sout = reader.readline()
             self.assertEqual(sout, b"\x80")
 
+    def test_text_to_binary_blacklists_binary_transforms(self):
+        # Check binary -> binary codecs give a good error for str input
+        bad_input = "bad input type"
+        for encoding in bytes_transform_encodings:
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.encode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.encode(encoding)
+            self.assertIsNone(failure.exception.__cause__)
+
+    def test_text_to_binary_blacklists_text_transforms(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = (r"^'rot_13' is not a text encoding; "
+               r"use codecs.encode\(\) to handle arbitrary codecs")
+        with self.assertRaisesRegex(LookupError, msg):
+            "just an example message".encode("rot_13")
+
+    def test_binary_to_text_blacklists_binary_transforms(self):
+        # Check bytes.decode and bytearray.decode give a good error
+        # message for binary -> binary codecs
+        data = b"encode first to ensure we meet any format restrictions"
+        for encoding in bytes_transform_encodings:
+            encoded_data = codecs.encode(data, encoding)
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                encoded_data.decode(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                bytearray(encoded_data).decode(encoding)
+
+    def test_binary_to_text_blacklists_text_transforms(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            msg = (r"^'rot_13' is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.decode("rot_13")
+            self.assertIsNone(failure.exception.__cause__)
+
 
 @unittest.skipUnless(sys.platform == 'win32',
                      'code pages are specific to Windows')
commit	94ee389308ec9e0e07b3f7a944d5179aba540c5e	[log] [tgz]
author	Serhiy Storchaka <storchaka@gmail.com>	Mon Feb 24 14:43:03 2014 +0200
committer	Serhiy Storchaka <storchaka@gmail.com>	Mon Feb 24 14:43:03 2014 +0200
tree	80bc231aff27723119beacbcfa2654b90f793060
parent	20f8728bf0cce877c1908b15ddc59e2d1011ad0f [diff]