bpo-28180: Implementation for PEP 538 (#659)

- new PYTHONCOERCECLOCALE config setting
- coerces legacy C locale to C.UTF-8, C.utf8 or UTF-8 by default
- always uses C.UTF-8 on Android
- uses `surrogateescape` on stdin and stdout in the coercion
  target locales
- configure option to disable locale coercion at build time
- configure option to disable C locale warning at build time
diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py
index 1e74647..b3ac848 100644
--- a/Lib/test/support/script_helper.py
+++ b/Lib/test/support/script_helper.py
@@ -48,8 +48,35 @@
     return __cached_interp_requires_environment
 
 
-_PythonRunResult = collections.namedtuple("_PythonRunResult",
-                                          ("rc", "out", "err"))
+class _PythonRunResult(collections.namedtuple("_PythonRunResult",
+                                          ("rc", "out", "err"))):
+    """Helper for reporting Python subprocess run results"""
+    def fail(self, cmd_line):
+        """Provide helpful details about failed subcommand runs"""
+        # Keep at most the last 8000 bytes (~100 lines of 80 ASCII chars)
+        maxlen = 80 * 100
+        out, err = self.out, self.err
+        if len(out) > maxlen:
+            out = b'(... truncated stdout ...)' + out[-maxlen:]
+        if len(err) > maxlen:
+            err = b'(... truncated stderr ...)' + err[-maxlen:]
+        out = out.decode('ascii', 'replace').rstrip()
+        err = err.decode('ascii', 'replace').rstrip()
+        raise AssertionError("Process return code is %d\n"
+                             "command line: %r\n"
+                             "\n"
+                             "stdout:\n"
+                             "---\n"
+                             "%s\n"
+                             "---\n"
+                             "\n"
+                             "stderr:\n"
+                             "---\n"
+                             "%s\n"
+                             "---"
+                             % (self.rc, cmd_line,
+                                out,
+                                err))
 
 
 # Executing the interpreter in a subprocess
@@ -107,30 +134,7 @@
 def _assert_python(expected_success, *args, **env_vars):
     res, cmd_line = run_python_until_end(*args, **env_vars)
     if (res.rc and expected_success) or (not res.rc and not expected_success):
-        # Limit to 80 lines to ASCII characters
-        maxlen = 80 * 100
-        out, err = res.out, res.err
-        if len(out) > maxlen:
-            out = b'(... truncated stdout ...)' + out[-maxlen:]
-        if len(err) > maxlen:
-            err = b'(... truncated stderr ...)' + err[-maxlen:]
-        out = out.decode('ascii', 'replace').rstrip()
-        err = err.decode('ascii', 'replace').rstrip()
-        raise AssertionError("Process return code is %d\n"
-                             "command line: %r\n"
-                             "\n"
-                             "stdout:\n"
-                             "---\n"
-                             "%s\n"
-                             "---\n"
-                             "\n"
-                             "stderr:\n"
-                             "---\n"
-                             "%s\n"
-                             "---"
-                             % (res.rc, cmd_line,
-                                out,
-                                err))
+        res.fail(cmd_line)
     return res
 
 def assert_python_ok(*args, **env_vars):
diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py
new file mode 100644
index 0000000..c14d820
--- /dev/null
+++ b/Lib/test/test_c_locale_coercion.py
@@ -0,0 +1,262 @@
+# Tests the attempted automatic coercion of the C locale to a UTF-8 locale
+
+import unittest
+import os
+import sys
+import sysconfig
+import shutil
+import subprocess
+from collections import namedtuple
+
+import test.support
+from test.support.script_helper import (
+    run_python_until_end,
+    interpreter_requires_environment,
+)
+
+# In order to get the warning messages to match up as expected, the candidate
+# order here must match the target locale order in Python/pylifecycle.c
+_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")
+
+# There's no reliable cross-platform way of checking locale alias
+# lists, so the only way of knowing which of these locales will work
+# is to try them with locale.setlocale(). We do that in a subprocess
+# to avoid altering the locale of the test runner.
+def _set_locale_in_subprocess(locale_name):
+    cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
+    cmd = cmd_fmt.format(locale_name)
+    result, py_cmd = run_python_until_end("-c", cmd, __isolated=True)
+    return result.rc == 0
+
+_EncodingDetails = namedtuple("EncodingDetails",
+                              "fsencoding stdin_info stdout_info stderr_info")
+
+class EncodingDetails(_EncodingDetails):
+    CHILD_PROCESS_SCRIPT = ";".join([
+        "import sys",
+        "print(sys.getfilesystemencoding())",
+        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
+        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
+        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
+    ])
+
+    @classmethod
+    def get_expected_details(cls, expected_fsencoding):
+        """Returns expected child process details for a given encoding"""
+        _stream = expected_fsencoding + ":{}"
+        # stdin and stdout should use surrogateescape either because the
+        # coercion triggered, or because the C locale was detected
+        stream_info = 2*[_stream.format("surrogateescape")]
+        # stderr should always use backslashreplace
+        stream_info.append(_stream.format("backslashreplace"))
+        return dict(cls(expected_fsencoding, *stream_info)._asdict())
+
+    @staticmethod
+    def _handle_output_variations(data):
+        """Adjust the output to handle platform specific idiosyncrasies
+
+        * Some platforms report ASCII as ANSI_X3.4-1968
+        * Some platforms report ASCII as US-ASCII
+        * Some platforms report UTF-8 instead of utf-8
+        """
+        data = data.replace(b"ANSI_X3.4-1968", b"ascii")
+        data = data.replace(b"US-ASCII", b"ascii")
+        data = data.lower()
+        return data
+
+    @classmethod
+    def get_child_details(cls, env_vars):
+        """Retrieves fsencoding and standard stream details from a child process
+
+        Returns (encoding_details, stderr_lines):
+
+        - encoding_details: dict of EncodingDetails fields reported by the child
+        - stderr_lines: result of calling splitlines() on the stderr output
+
+        The child is run in isolated mode if the current interpreter supports
+        that.
+        """
+        result, py_cmd = run_python_until_end(
+            "-c", cls.CHILD_PROCESS_SCRIPT,
+            __isolated=True,
+            **env_vars
+        )
+        if not result.rc == 0:
+            result.fail(py_cmd)
+        # All subprocess outputs in this test case should be pure ASCII
+        adjusted_output = cls._handle_output_variations(result.out)
+        stdout_lines = adjusted_output.decode("ascii").rstrip().splitlines()
+        child_encoding_details = dict(cls(*stdout_lines)._asdict())
+        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
+        return child_encoding_details, stderr_lines
+
+
+class _ChildProcessEncodingTestCase(unittest.TestCase):
+    # Base class to check for expected encoding details in a child process
+
+    def _check_child_encoding_details(self,
+                                      env_vars,
+                                      expected_fsencoding,
+                                      expected_warning):
+        """Check the C locale handling for the given process environment
+
+        Parameters:
+            env_vars: environment variables to set for the child process
+            expected_fsencoding: the encoding the child is expected to report
+            expected_warning: list of lines expected on the child's stderr
+              (an empty list means no warning is expected)
+        """
+        result = EncodingDetails.get_child_details(env_vars)
+        encoding_details, stderr_lines = result
+        self.assertEqual(encoding_details,
+                         EncodingDetails.get_expected_details(
+                             expected_fsencoding))
+        self.assertEqual(stderr_lines, expected_warning)
+
+# Details of the shared library warning emitted at runtime
+LIBRARY_C_LOCALE_WARNING = (
+    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
+    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
+    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
+    "locales is recommended."
+)
+
+@unittest.skipUnless(sysconfig.get_config_var("PY_WARN_ON_C_LOCALE"),
+                     "C locale runtime warning disabled at build time")
+class LocaleWarningTests(_ChildProcessEncodingTestCase):
+    # Test warning emitted when running in the C locale
+
+    def test_library_c_locale_warning(self):
+        self.maxDiff = None
+        for locale_to_set in ("C", "POSIX", "invalid.ascii"):
+            var_dict = {
+                "LC_ALL": locale_to_set
+            }
+            with self.subTest(forced_locale=locale_to_set):
+                self._check_child_encoding_details(var_dict,
+                                                   "ascii",
+                                                   [LIBRARY_C_LOCALE_WARNING])
+
+# Details of the CLI locale coercion warning emitted at runtime
+CLI_COERCION_WARNING_FMT = (
+    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
+    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
+)
+
+class _LocaleCoercionTargetsTestCase(_ChildProcessEncodingTestCase):
+    # Base class for test cases that rely on coercion targets being defined
+
+    available_targets = []
+    targets_required = True
+
+    @classmethod
+    def setUpClass(cls):
+        first_target_locale = None
+        available_targets = cls.available_targets
+        # Find the target locales available in the current system
+        for target_locale in _C_UTF8_LOCALES:
+            if _set_locale_in_subprocess(target_locale):
+                available_targets.append(target_locale)
+                if first_target_locale is None:
+                    first_target_locale = target_locale
+        if cls.targets_required and not available_targets:
+            raise unittest.SkipTest("No C-with-UTF-8 locale available")
+        # Expect coercion to use the first available locale
+        warning_msg = CLI_COERCION_WARNING_FMT.format(first_target_locale)
+        cls.EXPECTED_COERCION_WARNING = warning_msg
+
+
+class LocaleConfigurationTests(_LocaleCoercionTargetsTestCase):
+    # Test explicit external configuration via the process environment
+
+    def test_external_target_locale_configuration(self):
+        # Explicitly setting a target locale should give the same behaviour as
+        # is seen when implicitly coercing to that target locale
+        self.maxDiff = None
+
+        expected_warning = []
+        expected_fsencoding = "utf-8"
+
+        base_var_dict = {
+            "LANG": "",
+            "LC_CTYPE": "",
+            "LC_ALL": "",
+        }
+        for env_var in ("LANG", "LC_CTYPE"):
+            for locale_to_set in self.available_targets:
+                with self.subTest(env_var=env_var,
+                                  configured_locale=locale_to_set):
+                    var_dict = base_var_dict.copy()
+                    var_dict[env_var] = locale_to_set
+                    self._check_child_encoding_details(var_dict,
+                                                       expected_fsencoding,
+                                                       expected_warning)
+
+
+
+@test.support.cpython_only
+@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
+                     "C locale coercion disabled at build time")
+class LocaleCoercionTests(_LocaleCoercionTargetsTestCase):
+    # Test implicit reconfiguration of the environment during CLI startup
+
+    def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale):
+        """Check the C locale handling for various configurations
+
+        Parameters:
+            expected_fsencoding: the encoding the child is expected to report
+            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
+              None: don't set the variable at all
+              str: the value set in the child's environment
+        """
+
+        # Check for expected warning on stderr if C locale is coerced
+        self.maxDiff = None
+
+        expected_warning = []
+        if coerce_c_locale != "0":
+            expected_warning.append(self.EXPECTED_COERCION_WARNING)
+
+        base_var_dict = {
+            "LANG": "",
+            "LC_CTYPE": "",
+            "LC_ALL": "",
+        }
+        for env_var in ("LANG", "LC_CTYPE"):
+            for locale_to_set in ("", "C", "POSIX", "invalid.ascii"):
+                with self.subTest(env_var=env_var,
+                                  nominal_locale=locale_to_set,
+                                  PYTHONCOERCECLOCALE=coerce_c_locale):
+                    var_dict = base_var_dict.copy()
+                    var_dict[env_var] = locale_to_set
+                    if coerce_c_locale is not None:
+                        var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale
+                    self._check_child_encoding_details(var_dict,
+                                                       expected_fsencoding,
+                                                       expected_warning)
+
+    def test_test_PYTHONCOERCECLOCALE_not_set(self):
+        # This should coerce to the first available target locale by default
+        self._check_c_locale_coercion("utf-8", coerce_c_locale=None)
+
+    def test_PYTHONCOERCECLOCALE_not_zero(self):
+        # *Any* string other than "0" is considered "set" for our purposes
+        # and hence should result in the locale coercion being enabled
+        for setting in ("", "1", "true", "false"):
+            self._check_c_locale_coercion("utf-8", coerce_c_locale=setting)
+
+    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
+        # The setting "0" should result in the locale coercion being disabled
+        self._check_c_locale_coercion("ascii", coerce_c_locale="0")
+
+
+def test_main():
+    test.support.run_unittest(
+        LocaleConfigurationTests,
+        LocaleCoercionTests,
+        LocaleWarningTests
+    )
+    test.support.reap_children()
+
+if __name__ == "__main__":
+    test_main()
diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py
index 8ac8af9..c4a9766 100644
--- a/Lib/test/test_capi.py
+++ b/Lib/test/test_capi.py
@@ -371,14 +371,21 @@
     def tearDown(self):
         os.chdir(self.oldcwd)
 
-    def run_embedded_interpreter(self, *args):
+    def run_embedded_interpreter(self, *args, env=None):
         """Runs a test in the embedded interpreter"""
         cmd = [self.test_exe]
         cmd.extend(args)
+        if env is not None and sys.platform == 'win32':
+            # Windows requires at least the SYSTEMROOT environment variable to
+            # start Python.
+            env = env.copy()
+            env['SYSTEMROOT'] = os.environ['SYSTEMROOT']
+
         p = subprocess.Popen(cmd,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
-                             universal_newlines=True)
+                             universal_newlines=True,
+                             env=env)
         (out, err) = p.communicate()
         self.assertEqual(p.returncode, 0,
                          "bad returncode %d, stderr is %r" %
@@ -471,26 +478,16 @@
                 self.assertNotEqual(sub.tstate, main.tstate)
                 self.assertNotEqual(sub.modules, main.modules)
 
-    @staticmethod
-    def _get_default_pipe_encoding():
-        rp, wp = os.pipe()
-        try:
-            with os.fdopen(wp, 'w') as w:
-                default_pipe_encoding = w.encoding
-        finally:
-            os.close(rp)
-        return default_pipe_encoding
-
     def test_forced_io_encoding(self):
         # Checks forced configuration of embedded interpreter IO streams
-        out, err = self.run_embedded_interpreter("forced_io_encoding")
-        if support.verbose:
+        env = {"PYTHONIOENCODING": "utf-8:surrogateescape"}
+        out, err = self.run_embedded_interpreter("forced_io_encoding", env=env)
+        if support.verbose > 1:
             print()
             print(out)
             print(err)
-        expected_errors = sys.__stdout__.errors
-        expected_stdin_encoding = sys.__stdin__.encoding
-        expected_pipe_encoding = self._get_default_pipe_encoding()
+        expected_stream_encoding = "utf-8"
+        expected_errors = "surrogateescape"
         expected_output = '\n'.join([
         "--- Use defaults ---",
         "Expected encoding: default",
@@ -517,8 +514,8 @@
         "stdout: latin-1:replace",
         "stderr: latin-1:backslashreplace"])
         expected_output = expected_output.format(
-                                in_encoding=expected_stdin_encoding,
-                                out_encoding=expected_pipe_encoding,
+                                in_encoding=expected_stream_encoding,
+                                out_encoding=expected_stream_encoding,
                                 errors=expected_errors)
         # This is useful if we ever trip over odd platform behaviour
         self.maxDiff = None
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index 5f96d61..3e20427 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -8,8 +8,9 @@
 import subprocess
 import tempfile
 from test.support import script_helper, is_android
-from test.support.script_helper import (spawn_python, kill_python, assert_python_ok,
-    assert_python_failure)
+from test.support.script_helper import (
+    spawn_python, kill_python, assert_python_ok, assert_python_failure
+)
 
 
 # XXX (ncoghlan): Move to script_helper and make consistent with run_python
@@ -150,6 +151,7 @@
         env = os.environ.copy()
         # Use C locale to get ascii for the locale encoding
         env['LC_ALL'] = 'C'
+        env['PYTHONCOERCECLOCALE'] = '0'
         code = (
             b'import locale; '
             b'print(ascii("' + undecodable + b'"), '
diff --git a/Lib/test/test_subprocess.py b/Lib/test/test_subprocess.py
index 7fabe6a..52b05c1 100644
--- a/Lib/test/test_subprocess.py
+++ b/Lib/test/test_subprocess.py
@@ -642,7 +642,8 @@
             # on adding even when the environment in exec is empty.
             # Gentoo sandboxes also force LD_PRELOAD and SANDBOX_* to exist.
             return ('VERSIONER' in n or '__CF' in n or  # MacOS
-                    n == 'LD_PRELOAD' or n.startswith('SANDBOX'))  # Gentoo
+                    n == 'LD_PRELOAD' or n.startswith('SANDBOX') or # Gentoo
+                    n == 'LC_CTYPE') # Locale coercion triggered
 
         with subprocess.Popen([sys.executable, "-c",
                                'import os; print(list(os.environ.keys()))'],
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index ed78e2a..3844812 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -682,6 +682,7 @@
         # Force the POSIX locale
         env = os.environ.copy()
         env["LC_ALL"] = "C"
+        env["PYTHONCOERCECLOCALE"] = "0"
         code = '\n'.join((
             'import sys',
             'def dump(name):',