Close #17828: better handling of codec errors

- output type errors now redirect users to the type-neutral
  convenience functions in the codecs module
- stateless errors that occur during encoding and decoding
  will now be automatically wrapped in exceptions that give
  the name of the codec involved
diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst
index dd992ed..a04aee8 100644
--- a/Doc/whatsnew/3.4.rst
+++ b/Doc/whatsnew/3.4.rst
@@ -102,6 +102,7 @@
 * :ref:`PEP 446: Make newly created file descriptors non-inheritable <pep-446>`.
 * command line option for :ref:`isolated mode <using-on-misc-options>`,
   (:issue:`16499`).
+* improvements to handling of non-Unicode codecs
 
 Significantly Improved Library Modules:
 
@@ -170,6 +171,70 @@
       PEP written and implemented by Victor Stinner.
 
 
+Improvements to handling of non-Unicode codecs
+==============================================
+
+Since it was first introduced, the :mod:`codecs` module has always been
+intended to operate as a type-neutral dynamic encoding and decoding
+system. However, its close coupling with the Python text model, especially
+the type restricted convenience methods on the builtin :class:`str`,
+:class:`bytes` and :class:`bytearray` types, has historically obscured that
+fact.
+
+As a key step in clarifying the situation, the :meth:`codecs.encode` and
+:meth:`codecs.decode` convenience functions are now properly documented in
+Python 2.7, 3.3 and 3.4. These functions have existed in the :mod:`codecs`
+module and have been covered by the regression test suite since Python 2.4,
+but were previously only discoverable through runtime introspection.
+
+Unlike the convenience methods on :class:`str`, :class:`bytes` and
+:class:`bytearray`, these convenience functions support arbitrary codecs
+in both Python 2 and Python 3, rather than being limited to Unicode text
+encodings (in Python 3) or ``basestring`` <-> ``basestring`` conversions
+(in Python 2).
+
+In Python 3.4, the errors raised by the convenience methods when a codec
+produces the incorrect output type have also been updated to direct users
+towards these general purpose convenience functions::
+
+    >>> import codecs
+
+    >>> codecs.encode(b"hello", "bz2_codec").decode("bz2_codec")
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: 'bz2_codec' decoder returned 'bytes' instead of 'str'; use codecs.decode() to decode to arbitrary types
+
+    >>> "hello".encode("rot_13")
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: 'rot_13' encoder returned 'str' instead of 'bytes'; use codecs.encode() to encode to arbitrary types
+
+In a related change, whenever it is feasible without breaking backwards
+compatibility, exceptions raised during encoding and decoding operations
+will be wrapped in a chained exception of the same type that mentions the
+name of the codec responsible for producing the error::
+
+    >>> b"hello".decode("uu_codec")
+    ValueError: Missing "begin" line in input data
+
+    The above exception was the direct cause of the following exception:
+
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    ValueError: decoding with 'uu_codec' codec failed (ValueError: Missing "begin" line in input data)
+
+    >>> "hello".encode("bz2_codec")
+    TypeError: 'str' does not support the buffer interface
+
+    The above exception was the direct cause of the following exception:
+
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: encoding with 'bz2_codec' codec failed (TypeError: 'str' does not support the buffer interface)
+
+(Contributed by Nick Coghlan in :issue:`17827` and :issue:`17828`)
+
+
 Other Language Changes
 ======================
 
@@ -262,19 +327,6 @@
 Added support for 24-bit samples (:issue:`12866`).
 
 
-codecs
-------
-
-The :meth:`codecs.encode` and :meth:`codecs.decode` convenience functions are
-now properly documented. These functions have existed in the :mod:`codecs`
-module since ~2004, but were previously only discoverable through runtime
-introspection.
-
-Unlike the convenience methods on :class:`str`, :class:`bytes` and
-:class:`bytearray`, these convenience functions support arbitrary codecs,
-rather than being limited to Unicode text encodings.
-
-
 colorsys
 --------
 
diff --git a/Include/pyerrors.h b/Include/pyerrors.h
index 224567b..6a8e0e8 100644
--- a/Include/pyerrors.h
+++ b/Include/pyerrors.h
@@ -285,6 +285,28 @@
     const char *name, const char *doc, PyObject *base, PyObject *dict);
 PyAPI_FUNC(void) PyErr_WriteUnraisable(PyObject *);
 
+/* In exceptions.c */
+#ifndef Py_LIMITED_API
+/* Helper that attempts to replace the current exception with one of the
+ * same type but with a prefix added to the exception text. The resulting
+ * exception description looks like:
+ *
+ *     prefix (exc_type: original_exc_str)
+ *
+ * Only some exceptions can be safely replaced. If the function determines
+ * it isn't safe to perform the replacement, it will leave the original
+ * unmodified exception in place.
+ *
+ * Returns a borrowed reference to the new exception (if any), NULL if the
+ * existing exception was left in place.
+ */
+PyAPI_FUNC(PyObject *) _PyErr_TrySetFromCause(
+    const char *prefix_format,   /* ASCII-encoded string  */
+    ...
+    );
+#endif
+
+
 /* In sigcheck.c or signalmodule.c */
 PyAPI_FUNC(int) PyErr_CheckSignals(void);
 PyAPI_FUNC(void) PyErr_SetInterrupt(void);
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 5cef4da..f43ac3a 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1,5 +1,6 @@
 import _testcapi
 import codecs
+import contextlib
 import io
 import locale
 import sys
@@ -2292,28 +2293,31 @@
     def test_basics(self):
         binput = bytes(range(256))
         for encoding in bytes_transform_encodings:
-            # generic codecs interface
-            (o, size) = codecs.getencoder(encoding)(binput)
-            self.assertEqual(size, len(binput))
-            (i, size) = codecs.getdecoder(encoding)(o)
-            self.assertEqual(size, len(o))
-            self.assertEqual(i, binput)
+            with self.subTest(encoding=encoding):
+                # generic codecs interface
+                (o, size) = codecs.getencoder(encoding)(binput)
+                self.assertEqual(size, len(binput))
+                (i, size) = codecs.getdecoder(encoding)(o)
+                self.assertEqual(size, len(o))
+                self.assertEqual(i, binput)
 
     def test_read(self):
         for encoding in bytes_transform_encodings:
-            sin = codecs.encode(b"\x80", encoding)
-            reader = codecs.getreader(encoding)(io.BytesIO(sin))
-            sout = reader.read()
-            self.assertEqual(sout, b"\x80")
+            with self.subTest(encoding=encoding):
+                sin = codecs.encode(b"\x80", encoding)
+                reader = codecs.getreader(encoding)(io.BytesIO(sin))
+                sout = reader.read()
+                self.assertEqual(sout, b"\x80")
 
     def test_readline(self):
         for encoding in bytes_transform_encodings:
             if encoding in ['uu_codec', 'zlib_codec']:
                 continue
-            sin = codecs.encode(b"\x80", encoding)
-            reader = codecs.getreader(encoding)(io.BytesIO(sin))
-            sout = reader.readline()
-            self.assertEqual(sout, b"\x80")
+            with self.subTest(encoding=encoding):
+                sin = codecs.encode(b"\x80", encoding)
+                reader = codecs.getreader(encoding)(io.BytesIO(sin))
+                sout = reader.readline()
+                self.assertEqual(sout, b"\x80")
 
     def test_buffer_api_usage(self):
         # We check all the transform codecs accept memoryview input
@@ -2321,17 +2325,158 @@
         # and also that they roundtrip correctly
         original = b"12345\x80"
         for encoding in bytes_transform_encodings:
-            data = original
-            view = memoryview(data)
-            data = codecs.encode(data, encoding)
-            view_encoded = codecs.encode(view, encoding)
-            self.assertEqual(view_encoded, data)
-            view = memoryview(data)
-            data = codecs.decode(data, encoding)
-            self.assertEqual(data, original)
-            view_decoded = codecs.decode(view, encoding)
-            self.assertEqual(view_decoded, data)
+            with self.subTest(encoding=encoding):
+                data = original
+                view = memoryview(data)
+                data = codecs.encode(data, encoding)
+                view_encoded = codecs.encode(view, encoding)
+                self.assertEqual(view_encoded, data)
+                view = memoryview(data)
+                data = codecs.decode(data, encoding)
+                self.assertEqual(data, original)
+                view_decoded = codecs.decode(view, encoding)
+                self.assertEqual(view_decoded, data)
 
+    def test_type_error_for_text_input(self):
+        # Check binary -> binary codecs give a good error for str input
+        bad_input = "bad input type"
+        for encoding in bytes_transform_encodings:
+            with self.subTest(encoding=encoding):
+                msg = "^encoding with '{}' codec failed".format(encoding)
+                with self.assertRaisesRegex(TypeError, msg) as failure:
+                    bad_input.encode(encoding)
+                self.assertTrue(isinstance(failure.exception.__cause__,
+                                           TypeError))
+
+    def test_type_error_for_binary_input(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            with self.subTest(bad_input=bad_input):
+                msg = "^decoding with 'rot_13' codec failed"
+                with self.assertRaisesRegex(AttributeError, msg) as failure:
+                    bad_input.decode("rot_13")
+                self.assertTrue(isinstance(failure.exception.__cause__,
+                                           AttributeError))
+
+    def test_bad_decoding_output_type(self):
+        # Check bytes.decode and bytearray.decode give a good error
+        # message for binary -> binary codecs
+        data = b"encode first to ensure we meet any format restrictions"
+        for encoding in bytes_transform_encodings:
+            with self.subTest(encoding=encoding):
+                encoded_data = codecs.encode(data, encoding)
+                fmt = ("'{}' decoder returned 'bytes' instead of 'str'; "
+                       "use codecs.decode\(\) to decode to arbitrary types")
+                msg = fmt.format(encoding)
+                with self.assertRaisesRegex(TypeError, msg):
+                    encoded_data.decode(encoding)
+                with self.assertRaisesRegex(TypeError, msg):
+                    bytearray(encoded_data).decode(encoding)
+
+    def test_bad_encoding_output_type(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; "
+               "use codecs.encode\(\) to encode to arbitrary types")
+        with self.assertRaisesRegex(TypeError, msg):
+            "just an example message".encode("rot_13")
+
+
+# The codec system tries to wrap exceptions in order to ensure the error
+# mentions the operation being performed and the codec involved. We
+# currently *only* want this to happen for relatively stateless
+# exceptions, where the only significant information they contain is their
+# type and a single str argument.
+class ExceptionChainingTest(unittest.TestCase):
+
+    def setUp(self):
+        # There's no way to unregister a codec search function, so we just
+        # ensure we render this one fairly harmless after the test
+        # case finishes by using the test case repr as the codec name
+        # The codecs module normalizes codec names, although this doesn't
+        # appear to be formally documented...
+        self.codec_name = repr(self).lower().replace(" ", "-")
+        self.codec_info = None
+        codecs.register(self.get_codec)
+
+    def get_codec(self, codec_name):
+        if codec_name != self.codec_name:
+            return None
+        return self.codec_info
+
+    def set_codec(self, obj_to_raise):
+        def raise_obj(*args, **kwds):
+            raise obj_to_raise
+        self.codec_info = codecs.CodecInfo(raise_obj, raise_obj,
+                                           name=self.codec_name)
+
+    @contextlib.contextmanager
+    def assertWrapped(self, operation, exc_type, msg):
+        full_msg = "{} with '{}' codec failed \({}: {}\)".format(
+                  operation, self.codec_name, exc_type.__name__, msg)
+        with self.assertRaisesRegex(exc_type, full_msg) as caught:
+            yield caught
+
+    def check_wrapped(self, obj_to_raise, msg):
+        self.set_codec(obj_to_raise)
+        with self.assertWrapped("encoding", RuntimeError, msg):
+            "str_input".encode(self.codec_name)
+        with self.assertWrapped("encoding", RuntimeError, msg):
+            codecs.encode("str_input", self.codec_name)
+        with self.assertWrapped("decoding", RuntimeError, msg):
+            b"bytes input".decode(self.codec_name)
+        with self.assertWrapped("decoding", RuntimeError, msg):
+            codecs.decode(b"bytes input", self.codec_name)
+
+    def test_raise_by_type(self):
+        self.check_wrapped(RuntimeError, "")
+
+    def test_raise_by_value(self):
+        msg = "This should be wrapped"
+        self.check_wrapped(RuntimeError(msg), msg)
+
+    @contextlib.contextmanager
+    def assertNotWrapped(self, operation, exc_type, msg):
+        with self.assertRaisesRegex(exc_type, msg) as caught:
+            yield caught
+        actual_msg = str(caught.exception)
+        self.assertNotIn(operation, actual_msg)
+        self.assertNotIn(self.codec_name, actual_msg)
+
+    def check_not_wrapped(self, obj_to_raise, msg):
+        self.set_codec(obj_to_raise)
+        with self.assertNotWrapped("encoding", RuntimeError, msg):
+            "str input".encode(self.codec_name)
+        with self.assertNotWrapped("encoding", RuntimeError, msg):
+            codecs.encode("str input", self.codec_name)
+        with self.assertNotWrapped("decoding", RuntimeError, msg):
+            b"bytes input".decode(self.codec_name)
+        with self.assertNotWrapped("decoding", RuntimeError, msg):
+            codecs.decode(b"bytes input", self.codec_name)
+
+    def test_init_override_is_not_wrapped(self):
+        class CustomInit(RuntimeError):
+            def __init__(self):
+                pass
+        self.check_not_wrapped(CustomInit, "")
+
+    def test_new_override_is_not_wrapped(self):
+        class CustomNew(RuntimeError):
+            def __new__(cls):
+                return super().__new__(cls)
+        self.check_not_wrapped(CustomNew, "")
+
+    def test_instance_attribute_is_not_wrapped(self):
+        msg = "This should NOT be wrapped"
+        exc = RuntimeError(msg)
+        exc.attr = 1
+        self.check_not_wrapped(exc, msg)
+
+    def test_non_str_arg_is_not_wrapped(self):
+        self.check_not_wrapped(RuntimeError(1), "1")
+
+    def test_multiple_args_is_not_wrapped(self):
+        msg = "\('a', 'b', 'c'\)"
+        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg)
 
 
 @unittest.skipUnless(sys.platform == 'win32',
diff --git a/Misc/NEWS b/Misc/NEWS
index 16c0475..0083fe0 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,15 @@
 Core and Builtins
 -----------------
 
+- Issue #17828: Output type errors in str.encode(), bytes.decode() and
+  bytearray.decode() now direct users to codecs.encode() or codecs.decode()
+  as appropriate.
+
+- Issue #17828: The interpreter now attempts to chain errors that occur in
+  codec processing with a replacement exception of the same type that
+  includes the codec name in the error message. It ensures it only does this
+  when the creation of the replacement exception won't lose any information.
+
 - Issue #19466: Clear the frames of daemon threads earlier during the
   Python shutdown to call objects destructors. So "unclosed file" resource
   warnings are now corretly emitted for daemon threads.
diff --git a/Objects/exceptions.c b/Objects/exceptions.c
index de5d746..53d8b66 100644
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@@ -2591,3 +2591,116 @@
     free_preallocated_memerrors();
     Py_CLEAR(errnomap);
 }
+
+/* Helper to do the equivalent of "raise X from Y" in C, but always using
+ * the current exception rather than passing one in.
+ *
+ * We currently limit this to *only* exceptions that use the BaseException
+ * tp_init and tp_new methods, since we can be reasonably sure we can wrap
+ * those correctly without losing data and without losing backwards
+ * compatibility.
+ *
+ * We also aim to rule out *all* exceptions that might be storing additional
+ * state, whether by having a size difference relative to BaseException,
+ * additional arguments passed in during construction or by having a
+ * non-empty instance dict.
+ *
+ * We need to be very careful with what we wrap, since changing types to
+ * a broader exception type would be backwards incompatible for
+ * existing codecs, and with different init or new method implementations
+ * may either not support instantiation with PyErr_Format or lose
+ * information when instantiated that way.
+ *
+ * XXX (ncoghlan): This could be made more comprehensive by exploiting the
+ * fact that exceptions are expected to support pickling. If more builtin
+ * exceptions (e.g. AttributeError) start to be converted to rich
+ * exceptions with additional attributes, that's probably a better approach
+ * to pursue over adding special cases for particular stateful subclasses.
+ *
+ * Returns a borrowed reference to the new exception (if any), NULL if the
+ * existing exception was left in place.
+ */
+PyObject *
+_PyErr_TrySetFromCause(const char *format, ...)
+{
+    PyObject* msg_prefix;
+    PyObject *exc, *val, *tb;
+    PyTypeObject *caught_type;
+    PyObject *instance_dict;
+    PyObject *instance_args;
+    Py_ssize_t num_args;
+    PyObject *new_exc, *new_val, *new_tb;
+    va_list vargs;
+
+#ifdef HAVE_STDARG_PROTOTYPES
+    va_start(vargs, format);
+#else
+    va_start(vargs);
+#endif
+
+    PyErr_Fetch(&exc, &val, &tb);
+    caught_type = (PyTypeObject *) exc;
+    /* Ensure type info indicates no extra state is stored at the C level */
+    if (caught_type->tp_init != (initproc) BaseException_init ||
+        caught_type->tp_new != BaseException_new ||
+        caught_type->tp_basicsize != _PyExc_BaseException.tp_basicsize ||
+        caught_type->tp_itemsize != _PyExc_BaseException.tp_itemsize
+       ) {
+        /* We can't be sure we can wrap this safely, since it may contain
+         * more state than just the exception type. Accordingly, we just
+         * leave it alone.
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* Check the args are empty or contain a single string */
+    PyErr_NormalizeException(&exc, &val, &tb);
+    instance_args = ((PyBaseExceptionObject *) val)->args;
+    num_args = PyTuple_GET_SIZE(instance_args);
+    if ((num_args > 1) ||
+        (num_args == 1 &&
+         !PyUnicode_CheckExact(PyTuple_GET_ITEM(instance_args, 0))
+        )
+       ) {
+        /* More than 1 arg, or the one arg we do have isn't a string
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* Ensure the instance dict is also empty */
+    instance_dict = *_PyObject_GetDictPtr(val);
+    if (instance_dict != NULL && PyObject_Length(instance_dict) > 0) {
+        /* While we could potentially copy a non-empty instance dictionary
+         * to the replacement exception, for now we take the more
+         * conservative path of leaving exceptions with attributes set
+         * alone.
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* For exceptions that we can wrap safely, we chain the original
+     * exception to a new one of the exact same type with an
+     * error message that mentions the additional details and the
+     * original exception.
+     *
+     * It would be nice to wrap OSError and various other exception
+     * types as well, but that's quite a bit trickier due to the extra
+     * state potentially stored on OSError instances.
+     */
+    msg_prefix = PyUnicode_FromFormatV(format, vargs);
+    if (msg_prefix == NULL)
+        return NULL;
+
+    PyErr_Format(exc, "%U (%s: %S)",
+                 msg_prefix, Py_TYPE(val)->tp_name, val);
+    Py_DECREF(exc);
+    Py_XDECREF(tb);
+    PyErr_Fetch(&new_exc, &new_val, &new_tb);
+    PyErr_NormalizeException(&new_exc, &new_val, &new_tb);
+    PyException_SetCause(new_val, val);
+    PyErr_Restore(new_exc, new_val, new_tb);
+    return new_val;
+}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 224a80b..7789816 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3054,8 +3054,10 @@
         goto onError;
     if (!PyUnicode_Check(unicode)) {
         PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(unicode)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
         Py_DECREF(unicode);
         goto onError;
     }
@@ -3113,8 +3115,10 @@
         goto onError;
     if (!PyUnicode_Check(v)) {
         PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
         Py_DECREF(v);
         goto onError;
     }
@@ -3425,7 +3429,8 @@
         PyObject *b;
 
         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
-            "encoder %s returned bytearray instead of bytes",
+            "encoder %s returned bytearray instead of bytes; "
+            "use codecs.encode() to encode to arbitrary types",
             encoding);
         if (error) {
             Py_DECREF(v);
@@ -3438,8 +3443,10 @@
     }
 
     PyErr_Format(PyExc_TypeError,
-                 "encoder did not return a bytes object (type=%.400s)",
-                 Py_TYPE(v)->tp_name);
+                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
+                 "use codecs.encode() to encode to arbitrary types",
+                 encoding,
+                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
     Py_DECREF(v);
     return NULL;
 }
@@ -3465,8 +3472,10 @@
         goto onError;
     if (!PyUnicode_Check(v)) {
         PyErr_Format(PyExc_TypeError,
-                     "encoder did not return an str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
+                     "use codecs.encode() to encode to arbitrary types",
+                     encoding,
+                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
         Py_DECREF(v);
         goto onError;
     }
diff --git a/Python/codecs.c b/Python/codecs.c
index c541ba0..e2edc26 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -332,6 +332,22 @@
     return codec_getstreamcodec(encoding, stream, errors, 3);
 }
 
+/* Helper that tries to ensure the reported exception chain indicates the
+ * codec that was invoked to trigger the failure without changing the type
+ * of the exception raised.
+ */
+static void
+wrap_codec_error(const char *operation,
+                 const char *encoding)
+{
+    /* TrySetFromCause will replace the active exception with a suitably
+     * updated clone if it can, otherwise it will leave the original
+     * exception alone.
+     */
+    _PyErr_TrySetFromCause("%s with '%s' codec failed",
+                           operation, encoding);
+}
+
 /* Encode an object (e.g. an Unicode object) using the given encoding
    and return the resulting encoded object (usually a Python string).
 
@@ -376,6 +392,7 @@
     Py_XDECREF(result);
     Py_XDECREF(args);
     Py_XDECREF(encoder);
+    wrap_codec_error("encoding", encoding);
     return NULL;
 }
 
@@ -422,6 +439,7 @@
     Py_XDECREF(args);
     Py_XDECREF(decoder);
     Py_XDECREF(result);
+    wrap_codec_error("decoding", encoding);
     return NULL;
 }