Issue #19543: Emit deprecation warning for known non-text encodings. Backported issue #19619: encode() and decode() methods and constructors of str, unicode and bytearray classes now emit a deprecation warning for known non-text encodings when Python is run with the -3 option. Backported issue #20404: io.TextIOWrapper (and hence io.open()) now uses the internal codec marking system added to emit a deprecation warning for known non-text encodings at stream construction time when Python is run with the -3 option.

commit: c7797dc7482035ee166ca2e941b623382b92e1fc [log] [tgz]
author: Serhiy Storchaka <storchaka@gmail.com> Sun May 31 20:21:00 2015 +0300
committer: Serhiy Storchaka <storchaka@gmail.com> Sun May 31 20:21:00 2015 +0300
tree: 526e26fa4dac506f02859fdbe946d33ed4165f5e
parent: cfb7028df4bdf12325786e48ebef3b4982efa119 [diff]
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index a7f4301..694b778 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py

@@ -7,6 +7,7 @@
 import os
 import abc
 import codecs
+import sys
 import warnings
 import errno
 # Import thread instead of threading to reduce startup cost
@@ -1497,6 +1498,11 @@
         if not isinstance(encoding, basestring):
             raise ValueError("invalid encoding: %r" % encoding)
 
+        if sys.py3kwarning and not codecs.lookup(encoding)._is_text_encoding:
+            msg = ("%r is not a text encoding; "
+                   "use codecs.open() to handle arbitrary codecs")
+            warnings.warnpy3k(msg % encoding, stacklevel=2)
+
         if errors is None:
             errors = "strict"
         else:

diff --git a/Lib/codecs.py b/Lib/codecs.py
index 049a3f0..12213e2 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py

@@ -79,9 +79,19 @@
 ### Codec base classes (defining the API)
 
 class CodecInfo(tuple):
+    """Codec details when looking up the codec registry"""
+
+    # Private API to allow Python to blacklist the known non-Unicode
+    # codecs in the standard library. A more general mechanism to
+    # reliably distinguish text encodings from other codecs will hopefully
+    # be defined for Python 3.5
+    #
+    # See http://bugs.python.org/issue19619
+    _is_text_encoding = True # Assume codecs are text encodings by default
 
     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
-        incrementalencoder=None, incrementaldecoder=None, name=None):
+        incrementalencoder=None, incrementaldecoder=None, name=None,
+        _is_text_encoding=None):
         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
         self.name = name
         self.encode = encode
@@ -90,6 +100,8 @@
         self.incrementaldecoder = incrementaldecoder
         self.streamwriter = streamwriter
         self.streamreader = streamreader
+        if _is_text_encoding is not None:
+            self._is_text_encoding = _is_text_encoding
         return self
 
     def __repr__(self):

diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py
index f84e780..34ac555 100644
--- a/Lib/encodings/base64_codec.py
+++ b/Lib/encodings/base64_codec.py

@@ -76,4 +76,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py
index 054b36b..136503a 100644
--- a/Lib/encodings/bz2_codec.py
+++ b/Lib/encodings/bz2_codec.py

@@ -99,4 +99,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py
index 91b38d9..154488c 100644
--- a/Lib/encodings/hex_codec.py
+++ b/Lib/encodings/hex_codec.py

@@ -76,4 +76,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py
index d8683fd..f259149 100644
--- a/Lib/encodings/quopri_codec.py
+++ b/Lib/encodings/quopri_codec.py

@@ -72,4 +72,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py
index 52b6431..4eaf433 100755
--- a/Lib/encodings/rot_13.py
+++ b/Lib/encodings/rot_13.py

@@ -44,6 +44,7 @@
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
 
 ### Decoding Map

diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py
index 4b137a5..5cb0d2b 100644
--- a/Lib/encodings/uu_codec.py
+++ b/Lib/encodings/uu_codec.py

@@ -126,4 +126,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_text_encoding=False,
     )

diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py
index 3419f9f..0c2599d 100644
--- a/Lib/encodings/zlib_codec.py
+++ b/Lib/encodings/zlib_codec.py

@@ -99,4 +99,5 @@
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_text_encoding=False,
     )

diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index 1b43238..5141f87 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py

@@ -15,10 +15,8 @@
 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
 
 def _floatconstants():
-    _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
-    if sys.byteorder != 'big':
-        _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
-    nan, inf = struct.unpack('dd', _BYTES)
+    nan, = struct.unpack('>d', b'\x7f\xf8\x00\x00\x00\x00\x00\x00')
+    inf, = struct.unpack('>d', b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
     return nan, inf, -inf
 
 NaN, PosInf, NegInf = _floatconstants()

diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
index 6d87eb6..b2f837b 100644
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py

@@ -1295,8 +1295,10 @@
                   ('hex', '68656c6c6f20776f726c64'),
                   ('uu', 'begin 666 <data>\n+:&5L;&\\@=V]R;&0 \n \nend\n')]
         for encoding, data in codecs:
-            self.checkequal(data, 'hello world', 'encode', encoding)
-            self.checkequal('hello world', data, 'decode', encoding)
+            with test_support.check_py3k_warnings():
+                self.checkequal(data, 'hello world', 'encode', encoding)
+            with test_support.check_py3k_warnings():
+                self.checkequal('hello world', data, 'decode', encoding)
         # zlib is optional, so we make the test optional too...
         try:
             import zlib
@@ -1304,8 +1306,10 @@
             pass
         else:
             data = 'x\x9c\xcbH\xcd\xc9\xc9W(\xcf/\xcaI\x01\x00\x1a\x0b\x04]'
-            self.checkequal(data, 'hello world', 'encode', 'zlib')
-            self.checkequal('hello world', data, 'decode', 'zlib')
+            with test_support.check_py3k_warnings():
+                self.checkequal(data, 'hello world', 'encode', 'zlib')
+            with test_support.check_py3k_warnings():
+                self.checkequal('hello world', data, 'decode', 'zlib')
 
         self.checkraises(TypeError, 'xyz', 'decode', 42)
         self.checkraises(TypeError, 'xyz', 'encode', 42)

diff --git a/Lib/test/test_calendar.py b/Lib/test/test_calendar.py
index 5692642..46c4a6f 100644
--- a/Lib/test/test_calendar.py
+++ b/Lib/test/test_calendar.py

@@ -513,8 +513,8 @@
     def test_option_encoding(self):
         self.assertFailure('-e')
         self.assertFailure('--encoding')
-        stdout = self.run_ok('--encoding', 'rot-13', '2004')
-        self.assertEqual(stdout.strip(), conv(result_2004_text.encode('rot-13')).strip())
+        stdout = self.run_ok('--encoding', 'utf-16-le', '2004')
+        self.assertEqual(stdout.strip(), conv(result_2004_text.encode('utf-16-le')).strip())
 
     def test_option_locale(self):
         self.assertFailure('-L')

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index de80b07..c7072a6 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py

@@ -1395,14 +1395,14 @@
 class Str2StrTest(unittest.TestCase):
 
     def test_read(self):
-        sin = "\x80".encode("base64_codec")
+        sin = codecs.encode("\x80", "base64_codec")
         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
         sout = reader.read()
         self.assertEqual(sout, "\x80")
         self.assertIsInstance(sout, str)
 
     def test_readline(self):
-        sin = "\x80".encode("base64_codec")
+        sin = codecs.encode("\x80", "base64_codec")
         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
         sout = reader.readline()
         self.assertEqual(sout, "\x80")
@@ -1536,6 +1536,9 @@
 ]
 broken_incremental_coders = broken_unicode_with_streams[:]
 
+if sys.flags.py3k_warning:
+    broken_unicode_with_streams.append("rot_13")
+
 # The following encodings only support "strict" mode
 only_strict_mode = [
     "idna",
@@ -2135,6 +2138,47 @@
         # Missing "begin" line
         self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
 
+    def test_text_to_binary_blacklists_binary_transforms(self):
+        # Check binary -> binary codecs give a good error for str input
+        bad_input = "bad input type"
+        for encoding in bytes_transform_encodings:
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.encode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.encode(encoding)
+            self.assertIsNone(failure.exception.__cause__)
+
+    def test_text_to_binary_blacklists_text_transforms(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = (r"^'rot_13' is not a text encoding; "
+               r"use codecs.encode\(\) to handle arbitrary codecs")
+        with self.assertRaisesRegex(LookupError, msg):
+            "just an example message".encode("rot_13")
+
+    def test_binary_to_text_blacklists_binary_transforms(self):
+        # Check bytes.decode and bytearray.decode give a good error
+        # message for binary -> binary codecs
+        data = b"encode first to ensure we meet any format restrictions"
+        for encoding in bytes_transform_encodings:
+            encoded_data = codecs.encode(data, encoding)
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                encoded_data.decode(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                bytearray(encoded_data).decode(encoding)
+
+    def test_binary_to_text_blacklists_text_transforms(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            msg = (r"^'rot_13' is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.decode("rot_13")
+            self.assertIsNone(failure.exception.__cause__)
+
 
 if __name__ == "__main__":
     test_main()

diff --git a/Lib/test/test_fileinput.py b/Lib/test/test_fileinput.py
index c15ad84..facc56e 100644
--- a/Lib/test/test_fileinput.py
+++ b/Lib/test/test_fileinput.py

@@ -211,10 +211,11 @@
         except ValueError:
             pass
         try:
-            t1 = writeTmp(1, ["A\nB"], mode="wb")
-            fi = FileInput(files=t1, openhook=hook_encoded("rot13"))
+            # UTF-7 is a convenient, seldom used encoding
+            t1 = writeTmp(1, ['+AEE-\n+AEI-'], mode="wb")
+            fi = FileInput(files=t1, openhook=hook_encoded("utf-7"))
             lines = list(fi)
-            self.assertEqual(lines, ["N\n", "O"])
+            self.assertEqual(lines, [u'A\n', u'B'])
         finally:
             remove_tempfiles(t1)
 

diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index bbc804b..1a17d81 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py

@@ -2001,6 +2001,15 @@
         t.__init__(self.MockRawIO())
         self.assertEqual(t.read(0), u'')
 
+    def test_non_text_encoding_codecs_are_rejected(self):
+        # Ensure the constructor complains if passed a codec that isn't
+        # marked as a text encoding
+        # http://bugs.python.org/issue20404
+        r = self.BytesIO()
+        b = self.BufferedWriter(r)
+        with support.check_py3k_warnings():
+            self.TextIOWrapper(b, encoding="hex_codec")
+
     def test_detach(self):
         r = self.BytesIO()
         b = self.BufferedWriter(r)
@@ -2617,19 +2626,39 @@
 
     def test_illegal_decoder(self):
         # Issue #17106
+        # Bypass the early encoding check added in issue 20404
+        def _make_illegal_wrapper():
+            quopri = codecs.lookup("quopri_codec")
+            quopri._is_text_encoding = True
+            try:
+                t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'),
+                                       newline='\n', encoding="quopri_codec")
+            finally:
+                quopri._is_text_encoding = False
+            return t
         # Crash when decoder returns non-string
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        with support.check_py3k_warnings():
+            t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+                                   encoding='quopri_codec')
         with self.maybeRaises(TypeError):
             t.read(1)
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        with support.check_py3k_warnings():
+            t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+                                   encoding='quopri_codec')
         with self.maybeRaises(TypeError):
             t.readline()
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        with support.check_py3k_warnings():
+            t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+                                   encoding='quopri_codec')
         with self.maybeRaises(TypeError):
             t.read()
+        #else:
+            #t = _make_illegal_wrapper()
+            #self.assertRaises(TypeError, t.read, 1)
+            #t = _make_illegal_wrapper()
+            #self.assertRaises(TypeError, t.readline)
+            #t = _make_illegal_wrapper()
+            #self.assertRaises(TypeError, t.read)
 
 
 class CTextIOWrapperTest(TextIOWrapperTest):
@@ -3002,9 +3031,11 @@
 
 class CMiscIOTest(MiscIOTest):
     io = io
+    shutdown_error = "RuntimeError: could not find io module state"
 
 class PyMiscIOTest(MiscIOTest):
     io = pyio
+    shutdown_error = "LookupError: unknown encoding: ascii"
 
 
 @unittest.skipIf(os.name == 'nt', 'POSIX signals required for this test.')
commit	c7797dc7482035ee166ca2e941b623382b92e1fc	[log] [tgz]
author	Serhiy Storchaka <storchaka@gmail.com>	Sun May 31 20:21:00 2015 +0300
committer	Serhiy Storchaka <storchaka@gmail.com>	Sun May 31 20:21:00 2015 +0300
tree	526e26fa4dac506f02859fdbe946d33ed4165f5e
parent	cfb7028df4bdf12325786e48ebef3b4982efa119 [diff]