Close #17828: better handling of codec errors

- output type errors now redirect users to the type-neutral
  convenience functions in the codecs module
- stateless errors that occur during encoding and decoding
  will now be automatically wrapped in exceptions that give
  the name of the codec involved
diff --git a/Objects/exceptions.c b/Objects/exceptions.c
index de5d746..53d8b66 100644
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@@ -2591,3 +2591,116 @@
     free_preallocated_memerrors();
     Py_CLEAR(errnomap);
 }
+
+/* Helper to do the equivalent of "raise X from Y" in C, but always using
+ * the current exception rather than passing one in.
+ *
+ * We currently limit this to *only* exceptions that use the BaseException
+ * tp_init and tp_new methods, since we can be reasonably sure we can wrap
+ * those correctly without losing data and without losing backwards
+ * compatibility.
+ *
+ * We also aim to rule out *all* exceptions that might be storing additional
+ * state, whether by having a size difference relative to BaseException,
+ * additional arguments passed in during construction or by having a
+ * non-empty instance dict.
+ *
+ * We need to be very careful with what we wrap, since changing types to
+ * a broader exception type would be backwards incompatible for
+ * existing codecs, and with different init or new method implementations
+ * may either not support instantiation with PyErr_Format or lose
+ * information when instantiated that way.
+ *
+ * XXX (ncoghlan): This could be made more comprehensive by exploiting the
+ * fact that exceptions are expected to support pickling. If more builtin
+ * exceptions (e.g. AttributeError) start to be converted to rich
+ * exceptions with additional attributes, that's probably a better approach
+ * to pursue over adding special cases for particular stateful subclasses.
+ *
+ * Returns a borrowed reference to the new exception (if any), NULL if the
+ * existing exception was left in place.
+ */
+PyObject *
+_PyErr_TrySetFromCause(const char *format, ...)
+{
+    PyObject* msg_prefix;
+    PyObject *exc, *val, *tb;
+    PyTypeObject *caught_type;
+    PyObject *instance_dict;
+    PyObject *instance_args;
+    Py_ssize_t num_args;
+    PyObject *new_exc, *new_val, *new_tb;
+    va_list vargs;
+
+#ifdef HAVE_STDARG_PROTOTYPES
+    va_start(vargs, format);
+#else
+    va_start(vargs);
+#endif
+
+    PyErr_Fetch(&exc, &val, &tb);
+    caught_type = (PyTypeObject *) exc;
+    /* Ensure type info indicates no extra state is stored at the C level */
+    if (caught_type->tp_init != (initproc) BaseException_init ||
+        caught_type->tp_new != BaseException_new ||
+        caught_type->tp_basicsize != _PyExc_BaseException.tp_basicsize ||
+        caught_type->tp_itemsize != _PyExc_BaseException.tp_itemsize
+       ) {
+        /* We can't be sure we can wrap this safely, since it may contain
+         * more state than just the exception type. Accordingly, we just
+         * leave it alone.
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* Check the args are empty or contain a single string */
+    PyErr_NormalizeException(&exc, &val, &tb);
+    instance_args = ((PyBaseExceptionObject *) val)->args;
+    num_args = PyTuple_GET_SIZE(instance_args);
+    if ((num_args > 1) ||
+        (num_args == 1 &&
+         !PyUnicode_CheckExact(PyTuple_GET_ITEM(instance_args, 0))
+        )
+       ) {
+        /* More than 1 arg, or the one arg we do have isn't a string
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* Ensure the instance dict is also empty */
+    instance_dict = *_PyObject_GetDictPtr(val);
+    if (instance_dict != NULL && PyObject_Length(instance_dict) > 0) {
+        /* While we could potentially copy a non-empty instance dictionary
+         * to the replacement exception, for now we take the more
+         * conservative path of leaving exceptions with attributes set
+         * alone.
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* For exceptions that we can wrap safely, we chain the original
+     * exception to a new one of the exact same type with an
+     * error message that mentions the additional details and the
+     * original exception.
+     *
+     * It would be nice to wrap OSError and various other exception
+     * types as well, but that's quite a bit trickier due to the extra
+     * state potentially stored on OSError instances.
+     */
+    msg_prefix = PyUnicode_FromFormatV(format, vargs);
+    if (msg_prefix == NULL)
+        return NULL;
+
+    PyErr_Format(exc, "%U (%s: %S)",
+                 msg_prefix, Py_TYPE(val)->tp_name, val);
+    Py_DECREF(exc);
+    Py_XDECREF(tb);
+    PyErr_Fetch(&new_exc, &new_val, &new_tb);
+    PyErr_NormalizeException(&new_exc, &new_val, &new_tb);
+    PyException_SetCause(new_val, val);
+    PyErr_Restore(new_exc, new_val, new_tb);
+    return new_val;
+}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 224a80b..7789816 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3054,8 +3054,10 @@
         goto onError;
     if (!PyUnicode_Check(unicode)) {
         PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(unicode)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
         Py_DECREF(unicode);
         goto onError;
     }
@@ -3113,8 +3115,10 @@
         goto onError;
     if (!PyUnicode_Check(v)) {
         PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
         Py_DECREF(v);
         goto onError;
     }
@@ -3425,7 +3429,8 @@
         PyObject *b;
 
         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
-            "encoder %s returned bytearray instead of bytes",
+            "encoder %s returned bytearray instead of bytes; "
+            "use codecs.encode() to encode to arbitrary types",
             encoding);
         if (error) {
             Py_DECREF(v);
@@ -3438,8 +3443,10 @@
     }
 
     PyErr_Format(PyExc_TypeError,
-                 "encoder did not return a bytes object (type=%.400s)",
-                 Py_TYPE(v)->tp_name);
+                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
+                 "use codecs.encode() to encode to arbitrary types",
+                 encoding,
+                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
     Py_DECREF(v);
     return NULL;
 }
@@ -3465,8 +3472,10 @@
         goto onError;
     if (!PyUnicode_Check(v)) {
         PyErr_Format(PyExc_TypeError,
-                     "encoder did not return an str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
+                     "use codecs.encode() to encode to arbitrary types",
+                     encoding,
+                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
         Py_DECREF(v);
         goto onError;
     }