Issue #13417: speed up UTF-8 decoding by around 2x for the non-fully-ASCII case.
This almost catches up with pre-PEP 393 performance, when decoding needed
only one pass.
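
The decoder now makes two passes: a scan pass that computes the target
character width and length, then a write pass into a buffer of exactly
that width.  As a rough sketch of what the scan pass computes
(illustrative only -- the real code below uses goto-chained loops and a
word-at-a-time ASCII fast path instead of a single loop with a running
maximum):

    static Py_UCS4
    scan_utf8(const unsigned char *p, const unsigned char *end,
              Py_ssize_t *count)
    {
        Py_UCS4 maxchar = 127;
        Py_ssize_t n = 0;
        for (; p < end; ++p) {
            if ((*p & 0xc0) == 0x80)
                continue;                 /* continuation byte */
            ++n;                          /* every other byte starts a char */
            if (*p >= 0xf0)               /* 4-byte lead: above U+FFFF */
                maxchar = Py_MAX(maxchar, 65537);
            else if (*p >= 0xc4)          /* lead for U+0100..U+FFFF */
                maxchar = Py_MAX(maxchar, 65535);
            else if (*p >= 0x80)          /* 0xc2/0xc3 lead: U+0080..U+00FF */
                maxchar = Py_MAX(maxchar, 255);
        }
        *count = n;
        return maxchar;
    }

The real scan unrolls this into four chained loops so that each loop
body tests a single, almost-always-false condition, which keeps branch
prediction nearly perfect on homogeneous text.
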
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 6307a98..9dedf0b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -523,6 +523,7 @@
 #include "stringlib/fastsearch.h"
 #include "stringlib/count.h"
 #include "stringlib/find.h"
+#include "stringlib/undef.h"
 
 /* --- Unicode Object ----------------------------------------------------- */
 
@@ -4190,6 +4191,18 @@
     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 
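+/* Each ucsNlib.h + codecs.h inclusion instantiates the stringlib UTF-8
+   helpers (e.g. utf8_try_decode) for one of the three PEP 393 character
+   widths; stringlib/undef.h resets the template macros in between. */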
+#include "stringlib/ucs1lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
+#include "stringlib/ucs2lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
+#include "stringlib/ucs4lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
 /* Mask to check or force alignment of a pointer to C 'long' boundaries */
 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
 
@@ -4203,33 +4216,41 @@
 # error C 'long' size should be either 4 or 8!
 #endif
 
-/* Scans a UTF-8 string and returns the maximum character to be expected,
-   the size of the decoded unicode string and if any major errors were
-   encountered.
+/* Scans a UTF-8 string and returns the maximum character to be expected
+   and the size of the decoded unicode string.
 
-   This function does check basic UTF-8 sanity, it does however NOT CHECK
-   if the string contains surrogates, and if all continuation bytes are
-   within the correct ranges, these checks are performed in
+   This function doesn't check for errors; those checks are performed in
    PyUnicode_DecodeUTF8Stateful.
-
-   If it sets has_errors to 1, it means the value of unicode_size and max_char
-   will be bogus and you should not rely on useful information in them.
    */
 static Py_UCS4
-utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
-                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
-                                  int *has_errors)
+utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
+                                  Py_ssize_t *unicode_size)
 {
-    Py_ssize_t n;
     Py_ssize_t char_count = 0;
-    Py_UCS4 max_char = 127, new_max;
-    Py_UCS4 upper_bound;
     const unsigned char *p = (const unsigned char *)s;
     const unsigned char *end = p + string_size;
     const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
-    int err = 0;
 
-    for (; p < end && !err; ++p, ++char_count) {
+    assert(unicode_size != NULL);
+
+    /* By having a cascade of independent loops which fall back onto each
+       other, we minimize the amount of work done in the average loop
+       iteration, and we also maximize the CPU's ability to predict
+       branches correctly (because a given condition will always have the
+       same boolean outcome except perhaps in the last iteration of the
+       corresponding loop).
+       In the general case this brings us rather close to decoding
+       performance pre-PEP 393, despite the two-pass decoding.
+
+       Note that the pure ASCII loop is not duplicated once a non-ASCII
+       character has been encountered: using the ASCII fast path on text
+       with many non-ASCII characters is actually a pessimization (by a
+       significant factor), and it is important to avoid bad performance
+       on valid utf-8 data (invalid utf-8 being a different can of worms).
+    */
+
+    /* ASCII */
+    for (; p < end; ++p) {
         /* Only check value if it's not an ASCII char... */
         if (*p < 0x80) {
             /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
@@ -4249,76 +4270,59 @@
                     break;
             }
         }
-        if (*p >= 0x80) {
-            n = utf8_code_length[*p];
-            new_max = max_char;
-            switch (n) {
-            /* invalid start byte */
-            case 0:
-                err = 1;
-                break;
-            case 2:
-                /* Code points between 0x00FF and 0x07FF inclusive.
-                   Approximate the upper bound of the code point,
-                   if this flips over 255 we can be sure it will be more
-                   than 255 and the string will need 2 bytes per code coint,
-                   if it stays under or equal to 255, we can be sure 1 byte
-                   is enough.
-                   ((*p & 0b00011111) << 6) | 0b00111111 */
-                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
-                if (max_char < upper_bound)
-                    new_max = upper_bound;
-                /* Ensure we track at least that we left ASCII space. */
-                if (new_max < 128)
-                    new_max = 128;
-                break;
-            case 3:
-                /* Between 0x0FFF and 0xFFFF inclusive, so values are
-                   always > 255 and <= 65535 and will always need 2 bytes. */
-                if (max_char < 65535)
-                    new_max = 65535;
-                break;
-            case 4:
-                /* Code point will be above 0xFFFF for sure in this case. */
-                new_max = 65537;
-                break;
-            /* Internal error, this should be caught by the first if */
-            case 1:
-            default:
-                assert(0 && "Impossible case in utf8_max_char_and_size");
-                err = 1;
-            }
-            /* Instead of number of overall bytes for this code point,
-               n contains the number of following bytes: */
-            --n;
-            /* Check if the follow up chars are all valid continuation bytes */
-            if (n >= 1) {
-                const unsigned char *cont;
-                if ((p + n) >= end) {
-                    if (consumed == 0)
-                        /* incomplete data, non-incremental decoding */
-                        err = 1;
-                    break;
-                }
-                for (cont = p + 1; cont <= (p + n); ++cont) {
-                    if ((*cont & 0xc0) != 0x80) {
-                        err = 1;
-                        break;
-                    }
-                }
-                p += n;
-            }
-            else
-                err = 1;
-            max_char = new_max;
-        }
+        if (*p < 0x80)
+            ++char_count;
+        else
+            goto _ucs1loop;
     }
+    *unicode_size = char_count;
+    return 127;
 
-    if (unicode_size)
-        *unicode_size = char_count;
-    if (has_errors)
-        *has_errors = err;
-    return max_char;
+_ucs1loop:
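+    /* 0xc2 and 0xc3 are the only lead bytes whose sequences decode to
+       code points <= U+00FF; any byte >= 0xc4 therefore needs at least
+       a UCS2 buffer. */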
+    for (; p < end; ++p) {
+        if (*p < 0xc4)
+            char_count += ((*p & 0xc0) != 0x80);
+        else
+            goto _ucs2loop;
+    }
+    *unicode_size = char_count;
+    return 255;
+
+_ucs2loop:
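+    /* Lead bytes up to 0xef start sequences decoding to at most U+FFFF;
+       0xf0 and above start 4-byte sequences for non-BMP code points. */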
+    for (; p < end; ++p) {
+        if (*p < 0xf0)
+            char_count += ((*p & 0xc0) != 0x80);
+        else
+            goto _ucs4loop;
+    }
+    *unicode_size = char_count;
+    return 65535;
+
+_ucs4loop:
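+    /* Each non-continuation byte (anything not matching 10xxxxxx)
+       starts exactly one code point. */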
+    for (; p < end; ++p) {
+        char_count += ((*p & 0xc0) != 0x80);
+    }
+    *unicode_size = char_count;
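+    /* 65537 is simply a value above 0xFFFF: it makes PyUnicode_New pick
+       the UCS4 representation (valid input reaching this loop is
+       guaranteed to contain a non-BMP character). */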
+    return 65537;
+}
+
+/* Called when we encounter some error that wasn't detected in the
+   original scan, e.g. an encoded surrogate character. The original
+   maxchar computation may have been incorrect, so redo it. */
+static int
+refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
+{
+    PyObject *tmp;
+    Py_ssize_t k;
+    Py_UCS4 maxchar;
+    for (k = 0, maxchar = 0; k < n; k++)
+        maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
+    tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
+    if (tmp == NULL)
+        return -1;
+    PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
+    Py_DECREF(*unicode);
+    *unicode = tmp;
+    return 0;
 }
 
 /* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
@@ -4361,35 +4365,56 @@
     Py_ssize_t i;
     int kind;
     void *data;
-    int has_errors;
+    int has_errors = 0;
 
     if (size == 0) {
         if (consumed)
             *consumed = 0;
         return (PyObject *)PyUnicode_New(0, 0);
     }
-    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
-                                                consumed, &has_errors);
-    if (has_errors)
-        /* maxchar and size computation might be incorrect;
-           code below widens and resizes as necessary. */
-        unicode = PyUnicode_New(size, 127);
-    else
-        unicode = PyUnicode_New(unicode_size, maxchar);
+    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
+    /* In case of errors, maxchar and size computation might be incorrect;
+       code below refits and resizes as necessary. */
+    unicode = PyUnicode_New(unicode_size, maxchar);
     if (!unicode)
         return NULL;
     /* When the string is ASCII only, just use memcpy and return.
        unicode_size may be != size if there is an incomplete UTF-8
        sequence at the end of the ASCII block.  */
-    if (!has_errors && maxchar < 128 && size == unicode_size) {
+    if (maxchar < 128 && size == unicode_size) {
         Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
         return unicode;
     }
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
+
     /* Unpack UTF-8 encoded data */
     i = 0;
     e = s + size;
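+    /* Fast path: decode assuming the input is valid UTF-8 that fits the
+       predicted kind; on error, s and i are left at the first offending
+       sequence and we fall back to the generic loop below. */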
+    switch (kind) {
+    case PyUnicode_1BYTE_KIND:
+        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
+        break;
+    case PyUnicode_2BYTE_KIND:
+        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
+        break;
+    case PyUnicode_4BYTE_KIND:
+        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
+        break;
+    }
+    if (!has_errors) {
+        /* Ensure the unicode size calculation was correct */
+        assert(i == unicode_size);
+        assert(s == e);
+        if (consumed)
+            *consumed = s-starts;
+        return unicode;
+    }
+    /* An error was found: refit the string decoded so far, then fall
+       through to the generic decoding loop for the rest of the input */
+    if (refit_partial_string(&unicode, kind, data, i) < 0)
+        goto onError;
+
     aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
 
     while (s < e) {
@@ -4541,19 +4566,8 @@
 
       utf8Error:
         if (!has_errors) {
-            PyObject *tmp;
-            Py_ssize_t k;
-            /* We encountered some error that wasn't detected in the original scan,
-               e.g. an encoded surrogate character. The original maxchar computation may
-               have been incorrect, so redo it now. */
-            for (k = 0, maxchar = 0; k < i; k++)
-                maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
-            tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
-            if (tmp == NULL)
+            if (refit_partial_string(&unicode, kind, data, i) < 0)
                 goto onError;
-            PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
-            Py_DECREF(unicode);
-            unicode = tmp;
             has_errors = 1;
         }
         if (unicode_decode_call_errorhandler(