Issue #14744: Use the new _PyUnicodeWriter internal API to speed up str%args and str.format(args)

 * Formatting string, int, float and complex use the _PyUnicodeWriter API. It
   avoids a temporary buffer in most cases.
 * Add _PyUnicodeWriter_WriteStr() to restore the PyAccu optimization: just
   keep a reference to the string if the output is only composed of one string
 * Disable overallocation when formatting the last argument of str%args and
   str.format(args)
 * Overallocation allocates at least 100 characters: add min_length attribute
   to the _PyUnicodeWriter structure
 * Add new private functions: _PyUnicode_FastCopyCharacters(),
   _PyUnicode_FastFill() and _PyUnicode_FromASCII()

The speed up is around 20% in average.
diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h
index ab5bae7..f62813d 100644
--- a/Objects/stringlib/asciilib.h
+++ b/Objects/stringlib/asciilib.h
@@ -18,7 +18,7 @@
 #define STRINGLIB_TODECIMAL      Py_UNICODE_TODECIMAL
 #define STRINGLIB_STR            PyUnicode_1BYTE_DATA
 #define STRINGLIB_LEN            PyUnicode_GET_LENGTH
-#define STRINGLIB_NEW            unicode_fromascii
+#define STRINGLIB_NEW(STR,LEN)   _PyUnicode_FromASCII((char*)(STR),(LEN))
 #define STRINGLIB_RESIZE         not_supported
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
index 9c0b0cf..d71cf44 100644
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -499,26 +499,26 @@
     int ok = 0;
     PyObject *result = NULL;
     PyObject *format_spec_object = NULL;
-    PyObject *(*formatter)(PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
-    Py_ssize_t len;
+    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
+    int err;
 
     /* If we know the type exactly, skip the lookup of __format__ and just
        call the formatter directly. */
     if (PyUnicode_CheckExact(fieldobj))
-        formatter = _PyUnicode_FormatAdvanced;
+        formatter = _PyUnicode_FormatAdvancedWriter;
     else if (PyLong_CheckExact(fieldobj))
-        formatter =_PyLong_FormatAdvanced;
+        formatter = _PyLong_FormatAdvancedWriter;
     else if (PyFloat_CheckExact(fieldobj))
-        formatter = _PyFloat_FormatAdvanced;
-
-    /* XXX: for 2.6, convert format_spec to the appropriate type
-       (unicode, str) */
+        formatter = _PyFloat_FormatAdvancedWriter;
+    else if (PyComplex_CheckExact(fieldobj))
+        formatter = _PyComplex_FormatAdvancedWriter;
 
     if (formatter) {
         /* we know exactly which formatter will be called when __format__ is
            looked up, so call it directly, instead. */
-        result = formatter(fieldobj, format_spec->str,
-                           format_spec->start, format_spec->end);
+        err = formatter(writer, fieldobj, format_spec->str,
+                        format_spec->start, format_spec->end);
+        return (err == 0);
     }
     else {
         /* We need to create an object out of the pointers we have, because
@@ -536,17 +536,11 @@
     }
     if (result == NULL)
         goto done;
-    if (PyUnicode_READY(result) == -1)
-        goto done;
 
-    len = PyUnicode_GET_LENGTH(result);
-    if (_PyUnicodeWriter_Prepare(writer,
-                               len, PyUnicode_MAX_CHAR_VALUE(result)) == -1)
+    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
         goto done;
-    copy_characters(writer->buffer, writer->pos,
-                    result, 0, len);
-    writer->pos += len;
     ok = 1;
+
 done:
     Py_XDECREF(format_spec_object);
     Py_XDECREF(result);
@@ -897,16 +891,19 @@
             err = _PyUnicodeWriter_Prepare(writer, sublen, maxchar);
             if (err == -1)
                 return 0;
-            copy_characters(writer->buffer, writer->pos,
-                            literal.str, literal.start, sublen);
+            _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
+                                          literal.str, literal.start, sublen);
             writer->pos += sublen;
         }
 
-        if (field_present)
+        if (field_present) {
+            if (iter.str.start == iter.str.end)
+                writer->flags.overallocate = 0;
             if (!output_markup(&field_name, &format_spec,
                                format_spec_needs_expanding, conversion, writer,
                                args, kwargs, recursion_depth, auto_number))
                 return 0;
+        }
     }
     return result;
 }
@@ -921,7 +918,7 @@
              int recursion_depth, AutoNumber *auto_number)
 {
     _PyUnicodeWriter writer;
-    Py_ssize_t initlen;
+    Py_ssize_t minlen;
 
     /* check the recursion level */
     if (recursion_depth <= 0) {
@@ -930,9 +927,8 @@
         return NULL;
     }
 
-    initlen = PyUnicode_GET_LENGTH(input->str) + 100;
-    if (_PyUnicodeWriter_Init(&writer, initlen, 127) == -1)
-        return NULL;
+    minlen = PyUnicode_GET_LENGTH(input->str) + 100;
+    _PyUnicodeWriter_Init(&writer, minlen);
 
     if (!do_markup(input, args, kwargs, &writer, recursion_depth,
                    auto_number)) {