bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

* Add -X utf8 command line option, PYTHONUTF8 environment variable
  and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup, automatically enable the
  UTF-8 mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls
  _winapi.GetACP() to get the ANSI code page.
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8
  mode. As a side effect, open() now uses the UTF-8 encoding by
  default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding
  in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8.
* Skip some tests relying on the current locale if the UTF-8 mode is
  enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to
  also return the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now
  always copy flag values, rather than only copying if the new value
  is greater than the old value.
diff --git a/Modules/_winapi.c b/Modules/_winapi.c
index 0a1d139..604c05d 100644
--- a/Modules/_winapi.c
+++ b/Modules/_winapi.c
@@ -1490,6 +1490,20 @@
 }
 
 
+/*[clinic input]
+_winapi.GetACP
+
+Get the current Windows ANSI code page identifier.
+[clinic start generated code]*/
+
+static PyObject *
+_winapi_GetACP_impl(PyObject *module)
+/*[clinic end generated code: output=f7ee24bf705dbb88 input=1433c96d03a05229]*/
+{
+    return PyLong_FromUnsignedLong(GetACP());
+}
+
+
 static PyMethodDef winapi_functions[] = {
     _WINAPI_CLOSEHANDLE_METHODDEF
     _WINAPI_CONNECTNAMEDPIPE_METHODDEF
@@ -1515,6 +1529,7 @@
     _WINAPI_WAITFORMULTIPLEOBJECTS_METHODDEF
     _WINAPI_WAITFORSINGLEOBJECT_METHODDEF
     _WINAPI_WRITEFILE_METHODDEF
+    _WINAPI_GETACP_METHODDEF
     {NULL, NULL}
 };
 
@@ -1595,14 +1610,14 @@
     WINAPI_CONSTANT(F_DWORD, WAIT_OBJECT_0);
     WINAPI_CONSTANT(F_DWORD, WAIT_ABANDONED_0);
     WINAPI_CONSTANT(F_DWORD, WAIT_TIMEOUT);
-    
+
     WINAPI_CONSTANT(F_DWORD, ABOVE_NORMAL_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, BELOW_NORMAL_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, HIGH_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, IDLE_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, NORMAL_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, REALTIME_PRIORITY_CLASS);
-    
+
     WINAPI_CONSTANT(F_DWORD, CREATE_NO_WINDOW);
     WINAPI_CONSTANT(F_DWORD, DETACHED_PROCESS);
     WINAPI_CONSTANT(F_DWORD, CREATE_DEFAULT_ERROR_MODE);
diff --git a/Modules/clinic/_winapi.c.h b/Modules/clinic/_winapi.c.h
index 01bba36..e5781efb 100644
--- a/Modules/clinic/_winapi.c.h
+++ b/Modules/clinic/_winapi.c.h
@@ -889,4 +889,22 @@
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=fba2ad7bf1a87e4a input=a9049054013a1b77]*/
+
+PyDoc_STRVAR(_winapi_GetACP__doc__,
+"GetACP($module, /)\n"
+"--\n"
+"\n"
+"Get the current Windows ANSI code page identifier.");
+
+#define _WINAPI_GETACP_METHODDEF    \
+    {"GetACP", (PyCFunction)_winapi_GetACP, METH_NOARGS, _winapi_GetACP__doc__},
+
+static PyObject *
+_winapi_GetACP_impl(PyObject *module);
+
+static PyObject *
+_winapi_GetACP(PyObject *module, PyObject *Py_UNUSED(ignored))
+{
+    return _winapi_GetACP_impl(module);
+}
+/*[clinic end generated code: output=fd91c1ec286f0bf3 input=a9049054013a1b77]*/
diff --git a/Modules/main.c b/Modules/main.c
index ac8a38c..9ce111c 100644
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -1114,50 +1114,32 @@
 }
 
 
-static void
-pymain_get_flag(int flag, int *value)
-{
-    if (flag) {
-        *value = flag;
-    }
-}
-
-static void
-pymain_set_flag(int *flag, int value)
-{
-    /* Helper to set flag variables from command line options
-    *   - uses the higher of the two values if they're both set
-    *   - otherwise leaves the flag unset
-    */
-    if (*flag < value) {
-        *flag = value;
-    }
-}
-
-
 /* Get Py_xxx global configuration variables */
 static void
 pymain_get_global_config(_PyMain *pymain)
 {
     _Py_CommandLineDetails *cmdline = &pymain->cmdline;
-    pymain_get_flag(Py_BytesWarningFlag, &cmdline->bytes_warning);
-    pymain_get_flag(Py_DebugFlag, &cmdline->debug);
-    pymain_get_flag(Py_InspectFlag, &cmdline->inspect);
-    pymain_get_flag(Py_InteractiveFlag, &cmdline->interactive);
-    pymain_get_flag(Py_IsolatedFlag, &cmdline->isolated);
-    pymain_get_flag(Py_OptimizeFlag, &cmdline->optimization_level);
-    pymain_get_flag(Py_DontWriteBytecodeFlag, &cmdline->dont_write_bytecode);
-    pymain_get_flag(Py_NoUserSiteDirectory, &cmdline->no_user_site_directory);
-    pymain_get_flag(Py_NoSiteFlag, &cmdline->no_site_import);
-    pymain_get_flag(Py_UnbufferedStdioFlag, &cmdline->use_unbuffered_io);
-    pymain_get_flag(Py_VerboseFlag, &cmdline->verbosity);
-    pymain_get_flag(Py_QuietFlag, &cmdline->quiet_flag);
-#ifdef MS_WINDOWS
-    pymain_get_flag(Py_LegacyWindowsFSEncodingFlag, &cmdline->legacy_windows_fs_encoding);
-    pymain_get_flag(Py_LegacyWindowsStdioFlag, &cmdline->legacy_windows_stdio);
-#endif
 
-    pymain_get_flag(Py_IgnoreEnvironmentFlag, &pymain->core_config.ignore_environment);
+    cmdline->bytes_warning = Py_BytesWarningFlag;
+    cmdline->debug = Py_DebugFlag;
+    cmdline->inspect = Py_InspectFlag;
+    cmdline->interactive = Py_InteractiveFlag;
+    cmdline->isolated = Py_IsolatedFlag;
+    cmdline->optimization_level = Py_OptimizeFlag;
+    cmdline->dont_write_bytecode = Py_DontWriteBytecodeFlag;
+    cmdline->no_user_site_directory = Py_NoUserSiteDirectory;
+    cmdline->no_site_import = Py_NoSiteFlag;
+    cmdline->use_unbuffered_io = Py_UnbufferedStdioFlag;
+    cmdline->verbosity = Py_VerboseFlag;
+    cmdline->quiet_flag = Py_QuietFlag;
+#ifdef MS_WINDOWS
+    cmdline->legacy_windows_fs_encoding = Py_LegacyWindowsFSEncodingFlag;
+    cmdline->legacy_windows_stdio = Py_LegacyWindowsStdioFlag;
+#endif
+    cmdline->check_hash_pycs_mode = _Py_CheckHashBasedPycsMode ;
+
+    pymain->core_config.ignore_environment = Py_IgnoreEnvironmentFlag;
+    pymain->core_config.utf8_mode = Py_UTF8Mode;
 }
 
 
@@ -1166,26 +1148,27 @@
 pymain_set_global_config(_PyMain *pymain)
 {
     _Py_CommandLineDetails *cmdline = &pymain->cmdline;
-    pymain_set_flag(&Py_BytesWarningFlag, cmdline->bytes_warning);
-    pymain_set_flag(&Py_DebugFlag, cmdline->debug);
-    pymain_set_flag(&Py_InspectFlag, cmdline->inspect);
-    pymain_set_flag(&Py_InteractiveFlag, cmdline->interactive);
-    pymain_set_flag(&Py_IsolatedFlag, cmdline->isolated);
-    pymain_set_flag(&Py_OptimizeFlag, cmdline->optimization_level);
-    pymain_set_flag(&Py_DontWriteBytecodeFlag, cmdline->dont_write_bytecode);
-    pymain_set_flag(&Py_NoUserSiteDirectory, cmdline->no_user_site_directory);
-    pymain_set_flag(&Py_NoSiteFlag, cmdline->no_site_import);
-    pymain_set_flag(&Py_UnbufferedStdioFlag, cmdline->use_unbuffered_io);
-    pymain_set_flag(&Py_VerboseFlag, cmdline->verbosity);
-    pymain_set_flag(&Py_QuietFlag, cmdline->quiet_flag);
-    if (cmdline->check_hash_pycs_mode)
-        _Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode;
+
+    Py_BytesWarningFlag = cmdline->bytes_warning;
+    Py_DebugFlag = cmdline->debug;
+    Py_InspectFlag = cmdline->inspect;
+    Py_InteractiveFlag = cmdline->interactive;
+    Py_IsolatedFlag = cmdline->isolated;
+    Py_OptimizeFlag = cmdline->optimization_level;
+    Py_DontWriteBytecodeFlag = cmdline->dont_write_bytecode;
+    Py_NoUserSiteDirectory = cmdline->no_user_site_directory;
+    Py_NoSiteFlag = cmdline->no_site_import;
+    Py_UnbufferedStdioFlag = cmdline->use_unbuffered_io;
+    Py_VerboseFlag = cmdline->verbosity;
+    Py_QuietFlag = cmdline->quiet_flag;
+    _Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode;
 #ifdef MS_WINDOWS
-    pymain_set_flag(&Py_LegacyWindowsFSEncodingFlag, cmdline->legacy_windows_fs_encoding);
-    pymain_set_flag(&Py_LegacyWindowsStdioFlag, cmdline->legacy_windows_stdio);
+    Py_LegacyWindowsFSEncodingFlag = cmdline->legacy_windows_fs_encoding;
+    Py_LegacyWindowsStdioFlag = cmdline->legacy_windows_stdio;
 #endif
 
-    pymain_set_flag(&Py_IgnoreEnvironmentFlag, pymain->core_config.ignore_environment);
+    Py_IgnoreEnvironmentFlag = pymain->core_config.ignore_environment;
+    Py_UTF8Mode = pymain->core_config.utf8_mode;
 }
 
 
@@ -1609,6 +1592,57 @@
 }
 
 
+static int
+pymain_init_utf8_mode(_PyMain *pymain)
+{
+    _PyCoreConfig *core_config = &pymain->core_config;
+
+#ifdef MS_WINDOWS
+    if (pymain->cmdline.legacy_windows_fs_encoding) {
+        core_config->utf8_mode = 0;
+        return 0;
+    }
+#endif
+
+    wchar_t *xopt = pymain_get_xoption(pymain, L"utf8");
+    if (xopt) {
+        wchar_t *sep = wcschr(xopt, L'=');
+        if (sep) {
+            xopt = sep + 1;
+            if (wcscmp(xopt, L"1") == 0) {
+                core_config->utf8_mode = 1;
+            }
+            else if (wcscmp(xopt, L"0") == 0) {
+                core_config->utf8_mode = 0;
+            }
+            else {
+                pymain->err = _Py_INIT_USER_ERR("invalid -X utf8 option value");
+                return -1;
+            }
+        }
+        else {
+            core_config->utf8_mode = 1;
+        }
+        return 0;
+    }
+
+    char *opt = pymain_get_env_var("PYTHONUTF8");
+    if (opt) {
+        if (strcmp(opt, "1") == 0) {
+            core_config->utf8_mode = 1;
+        }
+        else if (strcmp(opt, "0") == 0) {
+            core_config->utf8_mode = 0;
+        }
+        else {
+            pymain->err = _Py_INIT_USER_ERR("invalid PYTHONUTF8 environment "
+                                             "variable value");
+            return -1;
+        }
+        return 0;
+    }
+    return 0;
+}
 
 
 static int
@@ -1674,6 +1708,9 @@
         pymain->core_config.malloc_stats = 1;
     }
 
+    if (pymain_init_utf8_mode(pymain) < 0) {
+        return -1;
+    }
 
     return 0;
 }
@@ -1702,6 +1739,7 @@
     if (pymain_parse_envvars(pymain) < 0) {
         return -1;
     }
+    /* FIXME: if the utf8_mode value changed, parse the command line again */
 
     _PyInitError err = _PyMainInterpreterConfig_Read(&pymain->config);
     if (_Py_INIT_FAILED(err)) {
@@ -1730,6 +1768,7 @@
 static int
 pymain_init_python(_PyMain *pymain)
 {
+
     pymain_set_global_config(pymain);
 
     pymain_init_stdio(pymain);
@@ -1788,6 +1827,7 @@
         return -1;
     }
 
+    pymain->core_config.utf8_mode = Py_UTF8Mode;
     pymain->core_config._disable_importlib = 0;
     pymain->config.install_signal_handlers = 1;