Issue #14744: Use the new _PyUnicodeWriter internal API to speed up str%args and str.format(args)

 * Formatting string, int, float and complex use the _PyUnicodeWriter API. It
   avoids a temporary buffer in most cases.
 * Add _PyUnicodeWriter_WriteStr() to restore the PyAccu optimization: just
   keep a reference to the string if the output is only composed of one string
 * Disable overallocation when formatting the last argument of str%args and
   str.format(args)
 * Overallocation allocates at least 100 characters: add min_length attribute
   to the _PyUnicodeWriter structure
 * Add new private functions: _PyUnicode_FastCopyCharacters(),
   _PyUnicode_FastFill() and _PyUnicode_FromASCII()

The speed up is around 20% in average.
diff --git a/Include/complexobject.h b/Include/complexobject.h
index 3e4ecff..1934f3b 100644
--- a/Include/complexobject.h
+++ b/Include/complexobject.h
@@ -63,10 +63,12 @@
 /* Format the object based on the format_spec, as defined in PEP 3101
    (Advanced String Formatting). */
 #ifndef Py_LIMITED_API
-PyAPI_FUNC(PyObject *) _PyComplex_FormatAdvanced(PyObject *obj,
-                                                 PyObject *format_spec,
-                                                 Py_ssize_t start,
-                                                 Py_ssize_t end);
+PyAPI_FUNC(int) _PyComplex_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
 #endif
 
 #ifdef __cplusplus
diff --git a/Include/floatobject.h b/Include/floatobject.h
index 0ca4881..46ef6e6 100644
--- a/Include/floatobject.h
+++ b/Include/floatobject.h
@@ -112,10 +112,12 @@
 
 /* Format the object based on the format_spec, as defined in PEP 3101
    (Advanced String Formatting). */
-PyAPI_FUNC(PyObject *) _PyFloat_FormatAdvanced(PyObject *obj,
-                                               PyObject *format_spec,
-                                               Py_ssize_t start,
-                                               Py_ssize_t end);
+PyAPI_FUNC(int) _PyFloat_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
 #endif /* Py_LIMITED_API */
 
 #ifdef __cplusplus
diff --git a/Include/longobject.h b/Include/longobject.h
index c58ddf4..d741f1b 100644
--- a/Include/longobject.h
+++ b/Include/longobject.h
@@ -151,14 +151,22 @@
 
 /* _PyLong_Format: Convert the long to a string object with given base,
    appending a base prefix of 0[box] if base is 2, 8 or 16. */
-PyAPI_FUNC(PyObject *) _PyLong_Format(PyObject *aa, int base);
+PyAPI_FUNC(PyObject *) _PyLong_Format(PyObject *obj, int base);
+
+PyAPI_FUNC(int) _PyLong_FormatWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    int base,
+    int alternate);
 
 /* Format the object based on the format_spec, as defined in PEP 3101
    (Advanced String Formatting). */
-PyAPI_FUNC(PyObject *) _PyLong_FormatAdvanced(PyObject *obj,
-                                              PyObject *format_spec,
-                                              Py_ssize_t start,
-                                              Py_ssize_t end);
+PyAPI_FUNC(int) _PyLong_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
 #endif /* Py_LIMITED_API */
 
 /* These aren't really part of the long object, but they're handy. The
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 486d4fa..99ea48b 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -648,8 +648,20 @@
     Py_ssize_t from_start,
     Py_ssize_t how_many
     );
+
+/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
+   may crash if parameters are invalid (e.g. if the output string
+   is too short). */
+PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
+    PyObject *to,
+    Py_ssize_t to_start,
+    PyObject *from,
+    Py_ssize_t from_start,
+    Py_ssize_t how_many
+    );
 #endif
 
+#ifndef Py_LIMITED_API
 /* Fill a string with a character: write fill_char into
    unicode[start:start+length].
 
@@ -658,13 +670,21 @@
 
    Return the number of written character, or return -1 and raise an exception
    on error. */
-#ifndef Py_LIMITED_API
 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
     PyObject *unicode,
     Py_ssize_t start,
     Py_ssize_t length,
     Py_UCS4 fill_char
     );
+
+/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
+   if parameters are invalid (e.g. if length is longer than the string). */
+PyAPI_FUNC(void) _PyUnicode_FastFill(
+    PyObject *unicode,
+    Py_ssize_t start,
+    Py_ssize_t length,
+    Py_UCS4 fill_char
+    );
 #endif
 
 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
@@ -696,13 +716,19 @@
     const char *u              /* UTF-8 encoded string */
     );
 
+#ifndef Py_LIMITED_API
 /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
    Scan the string to find the maximum character. */
-#ifndef Py_LIMITED_API
 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
     int kind,
     const void *buffer,
     Py_ssize_t size);
+
+/* Create a new string from a buffer of ASCII characters.
+   WARNING: Don't check if the string contains any non-ASCII character. */
+PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
+    const char *buffer,
+    Py_ssize_t size);
 #endif
 
 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
@@ -865,12 +891,69 @@
     );
 
 #ifndef Py_LIMITED_API
+typedef struct {
+    PyObject *buffer;
+    void *data;
+    enum PyUnicode_Kind kind;
+    Py_UCS4 maxchar;
+    Py_ssize_t size;
+    Py_ssize_t pos;
+    /* minimum length of the buffer when overallocation is enabled,
+       see _PyUnicodeWriter_Init() */
+    Py_ssize_t min_length;
+    struct {
+        unsigned char overallocate:1;
+        /* If readonly is 1, buffer is a shared string (cannot be modified)
+           and size is set to 0. */
+        unsigned char readonly:1;
+    } flags;
+} _PyUnicodeWriter ;
+
+/* Initialize a Unicode writer.
+
+   If min_length is greater than zero, _PyUnicodeWriter_Prepare()
+   overallocates the buffer and min_length is the minimum length in characters
+   of the buffer. */
+PyAPI_FUNC(void)
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length);
+
+/* Prepare the buffer to write 'length' characters
+   with the specified maximum character.
+
+   Return 0 on success, raise an exception and return -1 on error. */
+#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
+    (((MAXCHAR) <= (WRITER)->maxchar                                  \
+      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
+     ? 0                                                              \
+     : (((LENGTH) == 0)                                               \
+        ? 0                                                           \
+        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
+
+/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
+   instead. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
+                                 Py_ssize_t length, Py_UCS4 maxchar);
+
+PyAPI_FUNC(int)
+_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str);
+
+PyAPI_FUNC(PyObject *)
+_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
+
+PyAPI_FUNC(void)
+_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
+#endif
+
+#ifndef Py_LIMITED_API
 /* Format the object based on the format_spec, as defined in PEP 3101
    (Advanced String Formatting). */
-PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
-                                                 PyObject *format_spec,
-                                                 Py_ssize_t start,
-                                                 Py_ssize_t end);
+PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
 #endif
 
 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);