bpo-34523: Add _PyCoreConfig.filesystem_encoding (GH-8963)

_PyCoreConfig_Read() is now responsible for choosing the filesystem
encoding and error handler. When using Py_Main(), the encoding is now
chosen even before Py_Initialize() is called.

_PyCoreConfig.filesystem_encoding is now the reference, instead of
Py_FileSystemDefaultEncoding, for the Python filesystem encoding.

Changes:

* Add filesystem_encoding and filesystem_errors to _PyCoreConfig
* _PyCoreConfig_Read() now reads the locale encoding for the file
  system encoding.
* PyUnicode_EncodeFSDefault() and PyUnicode_DecodeFSDefaultAndSize()
  now use the interpreter configuration rather than
  Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
  global configuration variables.
* Add _Py_SetFileSystemEncoding() and _Py_ClearFileSystemEncoding()
  private functions so that Py_FileSystemDefaultEncoding and
  Py_FileSystemDefaultEncodeErrors are only modified in coreconfig.c.
* _Py_CoerceLegacyLocale() now takes an int rather than
  _PyCoreConfig for the warning.
diff --git a/Python/coreconfig.c b/Python/coreconfig.c
index 00037d9..0ec4640 100644
--- a/Python/coreconfig.c
+++ b/Python/coreconfig.c
@@ -5,6 +5,11 @@
 #  include <langinfo.h>
 #endif
 
+#include <locale.h>     /* setlocale() */
+#ifdef HAVE_LANGINFO_H
+#include <langinfo.h>   /* nl_langinfo(CODESET) */
+#endif
+
 
 #define DECODE_LOCALE_ERR(NAME, LEN) \
     (((LEN) == -2) \
@@ -32,6 +37,8 @@
 int Py_HasFileSystemDefaultEncoding = 0;
 #endif
 const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape";
+static int _Py_HasFileSystemDefaultEncodeErrors = 1;  /* 0 if heap-allocated */
+
 /* UTF-8 mode (PEP 540): if equals to 1, use the UTF-8 encoding, and change
    stdin and stdout error handler to "surrogateescape". It is equal to
    -1 by default: unknown, will be set by Py_Main() */
@@ -88,6 +95,60 @@
 }
 
 
+/* Free the filesystem encoding and error handler globals.
+
+   A string is released only if its "Has" flag is zero, the state in which
+   _Py_SetFileSystemEncoding() leaves a heap-allocated copy. */
+void
+_Py_ClearFileSystemEncoding(void)
+{
+    if (Py_FileSystemDefaultEncoding != NULL
+        && !Py_HasFileSystemDefaultEncoding)
+    {
+        PyMem_RawFree((char*)Py_FileSystemDefaultEncoding);
+        Py_FileSystemDefaultEncoding = NULL;
+    }
+
+    if (Py_FileSystemDefaultEncodeErrors != NULL
+        && !_Py_HasFileSystemDefaultEncodeErrors)
+    {
+        PyMem_RawFree((char*)Py_FileSystemDefaultEncodeErrors);
+        Py_FileSystemDefaultEncodeErrors = NULL;
+    }
+}
+
+
+/* Point Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
+   at private copies of encoding and errors.
+
+   Return 0 on success. Return -1 on memory allocation failure, in which
+   case the previous values are left untouched. */
+int
+_Py_SetFileSystemEncoding(const char *encoding, const char *errors)
+{
+    char *encoding_copy = _PyMem_RawStrdup(encoding);
+    if (encoding_copy == NULL) {
+        return -1;
+    }
+
+    char *errors_copy = _PyMem_RawStrdup(errors);
+    if (errors_copy == NULL) {
+        PyMem_RawFree(encoding_copy);
+        return -1;
+    }
+
+    /* Nothing can fail anymore: drop the old strings, install the copies. */
+    _Py_ClearFileSystemEncoding();
+    Py_FileSystemDefaultEncoding = encoding_copy;
+    Py_FileSystemDefaultEncodeErrors = errors_copy;
+
+    /* Flags at zero let _Py_ClearFileSystemEncoding() free the copies. */
+    Py_HasFileSystemDefaultEncoding = 0;
+    _Py_HasFileSystemDefaultEncodeErrors = 0;
+    return 0;
+}
+
+
 /* Helper to allow an embedding application to override the normal
  * mechanism that attempts to figure out an appropriate IO encoding
  */
@@ -209,6 +257,8 @@
 #endif
     CLEAR(config->base_exec_prefix);
 
+    CLEAR(config->filesystem_encoding);
+    CLEAR(config->filesystem_errors);
     CLEAR(config->stdio_encoding);
     CLEAR(config->stdio_errors);
 #undef CLEAR
@@ -302,6 +352,8 @@
     COPY_ATTR(quiet);
     COPY_ATTR(user_site_directory);
     COPY_ATTR(buffered_stdio);
+    COPY_STR_ATTR(filesystem_encoding);
+    COPY_STR_ATTR(filesystem_errors);
     COPY_STR_ATTR(stdio_encoding);
     COPY_STR_ATTR(stdio_errors);
 #ifdef MS_WINDOWS
@@ -312,6 +364,7 @@
     COPY_ATTR(_frozen);
 
 #undef COPY_ATTR
+#undef COPY_STR_ATTR
 #undef COPY_WSTR_ATTR
 #undef COPY_WSTRLIST
     return 0;
@@ -976,8 +1029,8 @@
 }
 
 
-_PyInitError
-_Py_get_locale_encoding(char **locale_encoding)
+static _PyInitError
+get_locale_encoding(char **locale_encoding)
 {
 #ifdef MS_WINDOWS
     char encoding[20];
@@ -1087,7 +1140,7 @@
 
     /* Choose the default error handler based on the current locale. */
     if (config->stdio_encoding == NULL) {
-        _PyInitError err = _Py_get_locale_encoding(&config->stdio_encoding);
+        _PyInitError err = get_locale_encoding(&config->stdio_encoding);
         if (_Py_INIT_FAILED(err)) {
             return err;
         }
@@ -1104,6 +1157,93 @@
 }
 
 
+/* Fill config->filesystem_encoding and config->filesystem_errors.
+
+   Only fields that are still NULL are set, so values already present in
+   the configuration are kept. Defaults:
+
+   - Windows: mbcs/replace if legacy_windows_fs_encoding is set,
+     utf-8/surrogatepass otherwise (PEP 529);
+   - other platforms: utf-8 in UTF-8 Mode, UTF-8 on macOS and Android,
+     the locale encoding elsewhere; surrogateescape error handler.
+
+   Return _Py_INIT_NO_MEMORY() on memory allocation failure, or the error
+   of get_locale_encoding(). */
+static _PyInitError
+config_init_fs_encoding(_PyCoreConfig *config)
+{
+#ifdef MS_WINDOWS
+    if (config->legacy_windows_fs_encoding) {
+        /* Legacy Windows filesystem encoding: mbcs/replace */
+        if (config->filesystem_encoding == NULL) {
+            config->filesystem_encoding = _PyMem_RawStrdup("mbcs");
+            if (config->filesystem_encoding == NULL) {
+                return _Py_INIT_NO_MEMORY();
+            }
+        }
+        if (config->filesystem_errors == NULL) {
+            config->filesystem_errors = _PyMem_RawStrdup("replace");
+            if (config->filesystem_errors == NULL) {
+                return _Py_INIT_NO_MEMORY();
+            }
+        }
+    }
+
+    /* Windows defaults to utf-8/surrogatepass (PEP 529) */
+    if (config->filesystem_encoding == NULL) {
+        config->filesystem_encoding = _PyMem_RawStrdup("utf-8");
+        if (config->filesystem_encoding == NULL) {
+            return _Py_INIT_NO_MEMORY();
+        }
+    }
+    if (config->filesystem_errors == NULL) {
+        config->filesystem_errors = _PyMem_RawStrdup("surrogatepass");
+        if (config->filesystem_errors == NULL) {
+            return _Py_INIT_NO_MEMORY();
+        }
+    }
+#else
+    if (config->utf8_mode) {
+        /* UTF-8 Mode uses utf-8/surrogateescape */
+        if (config->filesystem_encoding == NULL) {
+            config->filesystem_encoding = _PyMem_RawStrdup("utf-8");
+            if (config->filesystem_encoding == NULL) {
+                return _Py_INIT_NO_MEMORY();
+            }
+        }
+        /* errors defaults to surrogateescape below */
+    }
+
+    if (config->filesystem_encoding == NULL) {
+        /* macOS and Android use UTF-8, other platforms use
+           the locale encoding. */
+#if defined(__APPLE__) || defined(__ANDROID__)
+        config->filesystem_encoding = _PyMem_RawStrdup("UTF-8");
+        if (config->filesystem_encoding == NULL) {
+            return _Py_INIT_NO_MEMORY();
+        }
+#else
+        /* get_locale_encoding() returns a newly allocated string: store it
+           directly rather than duplicating it, which leaked the original. */
+        _PyInitError err = get_locale_encoding(&config->filesystem_encoding);
+        if (_Py_INIT_FAILED(err)) {
+            return err;
+        }
+#endif
+    }
+
+    if (config->filesystem_errors == NULL) {
+        /* by default, use the "surrogateescape" error handler */
+        config->filesystem_errors = _PyMem_RawStrdup("surrogateescape");
+        if (config->filesystem_errors == NULL) {
+            return _Py_INIT_NO_MEMORY();
+        }
+    }
+#endif
+    return _Py_INIT_OK();
+}
+
+
 /* Read configuration settings from standard locations
  *
  * This function doesn't make any changes to the interpreter state - it
@@ -1216,6 +1344,18 @@
         config->argc = 0;
     }
 
+    /* config_init_fs_encoding() fills each field independently, so call it
+       as soon as either one is missing: both are asserted to be non-NULL
+       before returning. */
+    if (config->filesystem_encoding == NULL
+        || config->filesystem_errors == NULL)
+    {
+        err = config_init_fs_encoding(config);
+        if (_Py_INIT_FAILED(err)) {
+            return err;
+        }
+    }
+
     err = config_init_stdio_encoding(config);
     if (_Py_INIT_FAILED(err)) {
         return err;
@@ -1223,6 +1358,10 @@
 
     assert(config->coerce_c_locale >= 0);
     assert(config->use_environment >= 0);
+    assert(config->filesystem_encoding != NULL);
+    assert(config->filesystem_errors != NULL);
+    assert(config->stdio_encoding != NULL);
+    assert(config->stdio_errors != NULL);
 
     return _Py_INIT_OK();
 }
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 9f6757f..6d97f2f 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -339,7 +339,7 @@
     "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n";
 
 static void
-_coerce_default_locale_settings(const _PyCoreConfig *config, const _LocaleCoercionTarget *target)
+_coerce_default_locale_settings(int warn, const _LocaleCoercionTarget *target)
 {
     const char *newloc = target->locale_name;
 
@@ -352,7 +352,7 @@
                 "Error setting LC_CTYPE, skipping C locale coercion\n");
         return;
     }
-    if (config->coerce_c_locale_warn) {
+    if (warn) {
         fprintf(stderr, C_LOCALE_COERCION_WARNING, newloc);
     }
 
@@ -362,7 +362,7 @@
 #endif
 
 void
-_Py_CoerceLegacyLocale(const _PyCoreConfig *config)
+_Py_CoerceLegacyLocale(int warn)
 {
 #ifdef PY_COERCE_C_LOCALE
     const char *locale_override = getenv("LC_ALL");
@@ -385,7 +385,7 @@
                 }
 #endif
                 /* Successfully configured locale, so make it the default */
-                _coerce_default_locale_settings(config, target);
+                _coerce_default_locale_settings(warn, target);
                 return;
             }
         }
@@ -1162,11 +1162,7 @@
     /* Cleanup Unicode implementation */
     _PyUnicode_Fini();
 
-    /* reset file system default encoding */
-    if (!Py_HasFileSystemDefaultEncoding && Py_FileSystemDefaultEncoding) {
-        PyMem_RawFree((char*)Py_FileSystemDefaultEncoding);
-        Py_FileSystemDefaultEncoding = NULL;
-    }
+    _Py_ClearFileSystemEncoding();
 
     /* XXX Still allocated:
        - various static ad-hoc pointers to interned strings
@@ -1475,59 +1471,31 @@
 static _PyInitError
 initfsencoding(PyInterpreterState *interp)
 {
-    PyObject *codec;
+    _PyCoreConfig *config = &interp->core_config;
 
-#ifdef MS_WINDOWS
-    if (Py_LegacyWindowsFSEncodingFlag) {
-        Py_FileSystemDefaultEncoding = "mbcs";
-        Py_FileSystemDefaultEncodeErrors = "replace";
-    }
-    else {
-        Py_FileSystemDefaultEncoding = "utf-8";
-        Py_FileSystemDefaultEncodeErrors = "surrogatepass";
-    }
-#else
-    if (Py_FileSystemDefaultEncoding == NULL) {
-        if (interp->core_config.utf8_mode) {
-            Py_FileSystemDefaultEncoding = "utf-8";
-            Py_HasFileSystemDefaultEncoding = 1;
-        }
-        else if (_Py_GetForceASCII()) {
-            Py_FileSystemDefaultEncoding = "ascii";
-            Py_HasFileSystemDefaultEncoding = 1;
-        }
-        else {
-            extern _PyInitError _Py_get_locale_encoding(char **locale_encoding);
-
-            char *locale_encoding;
-            _PyInitError err = _Py_get_locale_encoding(&locale_encoding);
-            if (_Py_INIT_FAILED(err)) {
-                return err;
-            }
-
-            Py_FileSystemDefaultEncoding = get_codec_name(locale_encoding);
-            PyMem_RawFree(locale_encoding);
-            if (Py_FileSystemDefaultEncoding == NULL) {
-                return _Py_INIT_ERR("failed to get the Python codec "
-                                    "of the locale encoding");
-            }
-
-            Py_HasFileSystemDefaultEncoding = 0;
-            interp->fscodec_initialized = 1;
-            return _Py_INIT_OK();
-        }
-    }
-#endif
-
-    /* the encoding is mbcs, utf-8 or ascii */
-    codec = _PyCodec_Lookup(Py_FileSystemDefaultEncoding);
-    if (!codec) {
+    char *encoding = get_codec_name(config->filesystem_encoding);
+    if (encoding == NULL) {
         /* Such error can only occurs in critical situations: no more
-         * memory, import a module of the standard library failed,
-         * etc. */
-        return _Py_INIT_ERR("unable to load the file system codec");
+           memory, import a module of the standard library failed, etc. */
+        return _Py_INIT_ERR("failed to get the Python codec "
+                            "of the filesystem encoding");
     }
-    Py_DECREF(codec);
+
+    /* Update the filesystem encoding to the normalized Python codec name.
+       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
+       (Python codec name). The previous string is freed first. */
+    PyMem_RawFree(config->filesystem_encoding);
+    config->filesystem_encoding = encoding;
+
+    /* Copy the final values to the Py_FileSystemDefaultEncoding and
+       Py_FileSystemDefaultEncodeErrors global configuration variables. */
+    if (_Py_SetFileSystemEncoding(config->filesystem_encoding,
+                                  config->filesystem_errors) < 0) {
+        return _Py_INIT_NO_MEMORY();
+    }
+
+    /* From now on, PyUnicode can use the Python codec rather than the C
+       implementation for the filesystem encoding */
     interp->fscodec_initialized = 1;
     return _Py_INIT_OK();
 }
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 177b830..91df4b0 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -389,11 +389,9 @@
 static PyObject *
 sys_getfilesystemencoding(PyObject *self, PyObject *Py_UNUSED(ignored))
 {
-    if (Py_FileSystemDefaultEncoding)
-        return PyUnicode_FromString(Py_FileSystemDefaultEncoding);
-    PyErr_SetString(PyExc_RuntimeError,
-                    "filesystem encoding is not initialized");
-    return NULL;
+    /* Read the encoding from the current interpreter's configuration. */
+    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+    return PyUnicode_FromString(interp->core_config.filesystem_encoding);
 }
 
 PyDoc_STRVAR(getfilesystemencoding_doc,
@@ -406,11 +404,9 @@
 static PyObject *
 sys_getfilesystemencodeerrors(PyObject *self, PyObject *Py_UNUSED(ignored))
 {
-    if (Py_FileSystemDefaultEncodeErrors)
-        return PyUnicode_FromString(Py_FileSystemDefaultEncodeErrors);
-    PyErr_SetString(PyExc_RuntimeError,
-        "filesystem encoding is not initialized");
-    return NULL;
+    /* Read the error handler from the current interpreter's configuration. */
+    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+    return PyUnicode_FromString(interp->core_config.filesystem_errors);
 }
 
 PyDoc_STRVAR(getfilesystemencodeerrors_doc,
@@ -1150,8 +1146,30 @@
 static PyObject *
 sys_enablelegacywindowsfsencoding(PyObject *self)
 {
-    Py_FileSystemDefaultEncoding = "mbcs";
-    Py_FileSystemDefaultEncodeErrors = "replace";
+    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+    _PyCoreConfig *config = &interp->core_config;
+
+    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
+    char *encoding = _PyMem_RawStrdup("mbcs");
+    char *errors = _PyMem_RawStrdup("replace");
+    if (encoding == NULL || errors == NULL) {
+        /* Raw allocations must be released with the raw allocator */
+        PyMem_RawFree(encoding);
+        PyMem_RawFree(errors);
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    PyMem_RawFree(config->filesystem_encoding);
+    config->filesystem_encoding = encoding;
+    PyMem_RawFree(config->filesystem_errors);
+    config->filesystem_errors = errors;
+
+    if (_Py_SetFileSystemEncoding(encoding, errors) < 0) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
     Py_RETURN_NONE;
 }