bpo-30565: Add PYTHONCOERCECLOCALE=warn runtime flag (GH-2260)

- removes PY_WARN_ON_C_LOCALE build time flag
- locale coercion and compatibility warnings are now always compiled
  in, but are off by default
- adds PYTHONCOERCECLOCALE=warn runtime option to aid in
  debugging potentially locale related compatibility problems

Due to not-yet-resolved test failures on *BSD systems (including
Mac OS X), this also temporarily disables UTF-8 as a locale coercion
target, and skips testing the interpreter's behavior in the POSIX locale.
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
index 920d5c0..5adad15 100644
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -744,6 +744,11 @@
    :data:`sys.stdin` and :data:`sys.stdout` to ``surrogateescape``. This
    behavior can be overridden using :envvar:`PYTHONIOENCODING` as usual.
 
+   For debugging purposes, setting ``PYTHONCOERCECLOCALE=warn`` will cause
+   Python to emit warning messages on ``stderr`` if either the locale coercion
+   activates, or else if a locale that *would* have triggered coercion is
+   still active when the Python runtime is initialized.
+
    Availability: \*nix
 
    .. versionadded:: 3.7
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst
index db11954..5f683eb 100644
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -96,20 +96,11 @@
 ``UTF-8``). The default error handler for ``stderr`` continues to be
 ``backslashreplace``, regardless of locale.
 
-.. note::
-
-   In the current implementation, a warning message is printed directly to
-   ``stderr`` even for successful implicit locale coercion. This gives
-   redistributors and system integrators the opportunity to determine if they
-   should be making an environmental change to avoid the need for implicit
-   coercion at the Python interpreter level.
-
-   However, it's not clear that this is going to be the best approach for
-   the final 3.7.0 release, and we may end up deciding to disable the warning
-   by default and provide some way of opting into it at runtime or build time.
-
-   Concrete examples of use cases where it would be preferrable to disable the
-   warning by default can be noted on :issue:`30565`.
+Locale coercion is silent by default, but to assist in debugging potentially
+locale related integration problems, explicit warnings (emitted directly on
+``stderr``) can be requested by setting ``PYTHONCOERCECLOCALE=warn``. This
+setting will also cause the Python runtime to emit a warning if the legacy C
+locale remains active when the core interpreter is initialized.
 
 .. seealso::
 
diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py
index aa0771c..a4b4626 100644
--- a/Lib/test/test_c_locale_coercion.py
+++ b/Lib/test/test_c_locale_coercion.py
@@ -22,13 +22,23 @@
 else:
     C_LOCALE_FS_ENCODING = C_LOCALE_STREAM_ENCODING
 
-# XXX (ncoghlan): The above is probably still wrong for:
+# Note that the above is probably still wrong in some cases, such as:
 # * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
 # * AIX and any other platforms that use latin-1 in the C locale
+#
+# Options for dealing with this:
+# * Don't set PY_COERCE_C_LOCALE on such platforms (e.g. Windows doesn't)
+# * Fix the test expectations to match the actual platform behaviour
 
 # In order to get the warning messages to match up as expected, the candidate
 # order here must much the target locale order in Python/pylifecycle.c
-_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")
+_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8") #, "UTF-8")
+
+# XXX (ncoghlan): Using UTF-8 as a target locale is currently disabled due to
+#                 problems encountered on *BSD systems with those test cases
+# For additional details see:
+#     nl_langinfo CODESET error: https://bugs.python.org/issue30647
+#     locale handling differences: https://bugs.python.org/issue30672
 
 # There's no reliable cross-platform way of checking locale alias
 # lists, so the only way of knowing which of these locales will work
@@ -40,20 +50,24 @@
     result, py_cmd = run_python_until_end("-c", cmd, __isolated=True)
     return result.rc == 0
 
-_EncodingDetails = namedtuple("EncodingDetails",
-                              "fsencoding stdin_info stdout_info stderr_info")
+_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
+_EncodingDetails = namedtuple("EncodingDetails", _fields)
 
 class EncodingDetails(_EncodingDetails):
+    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
     CHILD_PROCESS_SCRIPT = ";".join([
-        "import sys",
+        "import sys, os",
         "print(sys.getfilesystemencoding())",
         "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
         "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
         "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
+        "print(os.environ.get('LANG', 'not set'))",
+        "print(os.environ.get('LC_CTYPE', 'not set'))",
+        "print(os.environ.get('LC_ALL', 'not set'))",
     ])
 
     @classmethod
-    def get_expected_details(cls, fs_encoding, stream_encoding):
+    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
         """Returns expected child process details for a given encoding"""
         _stream = stream_encoding + ":{}"
         # stdin and stdout should use surrogateescape either because the
@@ -61,7 +75,14 @@
         stream_info = 2*[_stream.format("surrogateescape")]
         # stderr should always use backslashreplace
         stream_info.append(_stream.format("backslashreplace"))
-        return dict(cls(fs_encoding, *stream_info)._asdict())
+        expected_lang = env_vars.get("LANG", "not set").lower()
+        if coercion_expected:
+            expected_lc_ctype = CLI_COERCION_TARGET.lower()
+        else:
+            expected_lc_ctype = env_vars.get("LC_CTYPE", "not set").lower()
+        expected_lc_all = env_vars.get("LC_ALL", "not set").lower()
+        env_info = expected_lang, expected_lc_ctype, expected_lc_all
+        return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())
 
     @staticmethod
     def _handle_output_variations(data):
@@ -97,64 +118,20 @@
             result.fail(py_cmd)
         # All subprocess outputs in this test case should be pure ASCII
         adjusted_output = cls._handle_output_variations(result.out)
-        stdout_lines = adjusted_output.decode("ascii").rstrip().splitlines()
+        stdout_lines = adjusted_output.decode("ascii").splitlines()
         child_encoding_details = dict(cls(*stdout_lines)._asdict())
         stderr_lines = result.err.decode("ascii").rstrip().splitlines()
         return child_encoding_details, stderr_lines
 
 
-class _ChildProcessEncodingTestCase(unittest.TestCase):
-    # Base class to check for expected encoding details in a child process
-
-    def _check_child_encoding_details(self,
-                                      env_vars,
-                                      expected_fs_encoding,
-                                      expected_stream_encoding,
-                                      expected_warning):
-        """Check the C locale handling for the given process environment
-
-        Parameters:
-            expected_fs_encoding: expected sys.getfilesystemencoding() result
-            expected_stream_encoding: expected encoding for standard streams
-            expected_warning: stderr output to expect (if any)
-        """
-        result = EncodingDetails.get_child_details(env_vars)
-        encoding_details, stderr_lines = result
-        self.assertEqual(encoding_details,
-                         EncodingDetails.get_expected_details(
-                             expected_fs_encoding,
-                             expected_stream_encoding))
-        self.assertEqual(stderr_lines, expected_warning)
-
 # Details of the shared library warning emitted at runtime
-LIBRARY_C_LOCALE_WARNING = (
+LEGACY_LOCALE_WARNING = (
     "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
     "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
     "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
     "locales is recommended."
 )
 
-@unittest.skipUnless(sysconfig.get_config_var("PY_WARN_ON_C_LOCALE"),
-                     "C locale runtime warning disabled at build time")
-class LocaleWarningTests(_ChildProcessEncodingTestCase):
-    # Test warning emitted when running in the C locale
-
-    def test_library_c_locale_warning(self):
-        self.maxDiff = None
-        for locale_to_set in ("C", "POSIX", "invalid.ascii"):
-            # XXX (ncoghlan): Mac OS X doesn't behave as expected in the
-            #                 POSIX locale, so we skip that for now
-            if sys.platform == "darwin" and locale_to_set == "POSIX":
-                continue
-            var_dict = {
-                "LC_ALL": locale_to_set
-            }
-            with self.subTest(forced_locale=locale_to_set):
-                self._check_child_encoding_details(var_dict,
-                                                   C_LOCALE_FS_ENCODING,
-                                                   C_LOCALE_STREAM_ENCODING,
-                                                   [LIBRARY_C_LOCALE_WARNING])
-
 # Details of the CLI locale coercion warning emitted at runtime
 CLI_COERCION_WARNING_FMT = (
     "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
@@ -163,9 +140,13 @@
 
 
 AVAILABLE_TARGETS = None
+CLI_COERCION_TARGET = None
+CLI_COERCION_WARNING = None
 
 def setUpModule():
     global AVAILABLE_TARGETS
+    global CLI_COERCION_TARGET
+    global CLI_COERCION_WARNING
 
     if AVAILABLE_TARGETS is not None:
         # initialization already done
@@ -177,26 +158,57 @@
         if _set_locale_in_subprocess(target_locale):
             AVAILABLE_TARGETS.append(target_locale)
 
+    if AVAILABLE_TARGETS:
+        # Coercion is expected to use the first available target locale
+        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
+        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)
 
 
-class _LocaleCoercionTargetsTestCase(_ChildProcessEncodingTestCase):
-    # Base class for test cases that rely on coercion targets being defined
+class _LocaleHandlingTestCase(unittest.TestCase):
+    # Base class to check expected locale handling behaviour
 
-    @classmethod
-    def setUpClass(cls):
+    def _check_child_encoding_details(self,
+                                      env_vars,
+                                      expected_fs_encoding,
+                                      expected_stream_encoding,
+                                      expected_warnings,
+                                      coercion_expected):
+        """Check the C locale handling for the given process environment
+
+        Parameters:
+            expected_fs_encoding: expected sys.getfilesystemencoding() result
+            expected_stream_encoding: expected encoding for standard streams
+            expected_warnings: stderr lines to expect (None means no output)
+        """
+        result = EncodingDetails.get_child_details(env_vars)
+        encoding_details, stderr_lines = result
+        expected_details = EncodingDetails.get_expected_details(
+            coercion_expected,
+            expected_fs_encoding,
+            expected_stream_encoding,
+            env_vars
+        )
+        self.assertEqual(encoding_details, expected_details)
+        if expected_warnings is None:
+            expected_warnings = []
+        self.assertEqual(stderr_lines, expected_warnings)
+
+
+class LocaleConfigurationTests(_LocaleHandlingTestCase):
+    # Test explicit external configuration via the process environment
+
+    def setUpClass():
+        # This relies on setUpModule() having been run, so it can't be
+        # handled via the @unittest.skipUnless decorator
         if not AVAILABLE_TARGETS:
             raise unittest.SkipTest("No C-with-UTF-8 locale available")
 
-
-class LocaleConfigurationTests(_LocaleCoercionTargetsTestCase):
-    # Test explicit external configuration via the process environment
-
     def test_external_target_locale_configuration(self):
+
         # Explicitly setting a target locale should give the same behaviour as
         # is seen when implicitly coercing to that target locale
         self.maxDiff = None
 
-        expected_warning = []
         expected_fs_encoding = "utf-8"
         expected_stream_encoding = "utf-8"
 
@@ -209,6 +221,7 @@
             for locale_to_set in AVAILABLE_TARGETS:
                 # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                 #                 expected, so skip that combination for now
+                # See https://bugs.python.org/issue30672 for discussion
                 if env_var == "LANG" and locale_to_set == "UTF-8":
                     continue
 
@@ -219,17 +232,23 @@
                     self._check_child_encoding_details(var_dict,
                                                        expected_fs_encoding,
                                                        expected_stream_encoding,
-                                                       expected_warning)
+                                                       expected_warnings=None,
+                                                       coercion_expected=False)
 
 
 
 @test.support.cpython_only
 @unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                      "C locale coercion disabled at build time")
-class LocaleCoercionTests(_LocaleCoercionTargetsTestCase):
+class LocaleCoercionTests(_LocaleHandlingTestCase):
     # Test implicit reconfiguration of the environment during CLI startup
 
-    def _check_c_locale_coercion(self, fs_encoding, stream_encoding, coerce_c_locale):
+    def _check_c_locale_coercion(self,
+                                 fs_encoding, stream_encoding,
+                                 coerce_c_locale,
+                                 expected_warnings=None,
+                                 coercion_expected=True,
+                                 **extra_vars):
         """Check the C locale handling for various configurations
 
         Parameters:
@@ -238,27 +257,31 @@
             coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
               None: don't set the variable at all
               str: the value set in the child's environment
+            expected_warnings: expected warning lines on stderr
+            extra_vars: additional environment variables to set in subprocess
         """
-
-        # Check for expected warning on stderr if C locale is coerced
         self.maxDiff = None
 
-        expected_warning = []
-        if coerce_c_locale != "0":
-            # Expect coercion to use the first available locale
-            warning_msg = CLI_COERCION_WARNING_FMT.format(AVAILABLE_TARGETS[0])
-            expected_warning.append(warning_msg)
+        if not AVAILABLE_TARGETS:
+            # Locale coercion is disabled when there aren't any target locales
+            fs_encoding = C_LOCALE_FS_ENCODING
+            stream_encoding = C_LOCALE_STREAM_ENCODING
+            coercion_expected = False
+            if expected_warnings:
+                expected_warnings = [LEGACY_LOCALE_WARNING]
 
         base_var_dict = {
             "LANG": "",
             "LC_CTYPE": "",
             "LC_ALL": "",
         }
+        base_var_dict.update(extra_vars)
         for env_var in ("LANG", "LC_CTYPE"):
             for locale_to_set in ("", "C", "POSIX", "invalid.ascii"):
-                # XXX (ncoghlan): Mac OS X doesn't behave as expected in the
+                # XXX (ncoghlan): *BSD platforms don't behave as expected in the
                 #                 POSIX locale, so we skip that for now
-                if sys.platform == "darwin" and locale_to_set == "POSIX":
+                # See https://bugs.python.org/issue30672 for discussion
+                if locale_to_set == "POSIX":
                     continue
                 with self.subTest(env_var=env_var,
                                   nominal_locale=locale_to_set,
@@ -267,33 +290,62 @@
                     var_dict[env_var] = locale_to_set
                     if coerce_c_locale is not None:
                         var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale
+                    # Check behaviour on successful coercion
                     self._check_child_encoding_details(var_dict,
                                                        fs_encoding,
                                                        stream_encoding,
-                                                       expected_warning)
+                                                       expected_warnings,
+                                                       coercion_expected)
 
     def test_test_PYTHONCOERCECLOCALE_not_set(self):
         # This should coerce to the first available target locale by default
         self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)
 
     def test_PYTHONCOERCECLOCALE_not_zero(self):
-        # *Any* string other that "0" is considered "set" for our purposes
+        # *Any* string other than "0" is considered "set" for our purposes
         # and hence should result in the locale coercion being enabled
         for setting in ("", "1", "true", "false"):
             self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)
 
+    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
+        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
+        self._check_c_locale_coercion("utf-8", "utf-8",
+                                      coerce_c_locale="warn",
+                                      expected_warnings=[CLI_COERCION_WARNING])
+
+
     def test_PYTHONCOERCECLOCALE_set_to_zero(self):
         # The setting "0" should result in the locale coercion being disabled
         self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
                                       C_LOCALE_STREAM_ENCODING,
-                                      coerce_c_locale="0")
+                                      coerce_c_locale="0",
+                                      coercion_expected=False)
+        # Setting LC_ALL=C shouldn't make any difference to the behaviour
+        self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
+                                      C_LOCALE_STREAM_ENCODING,
+                                      coerce_c_locale="0",
+                                      LC_ALL="C",
+                                      coercion_expected=False)
 
+    def test_LC_ALL_set_to_C(self):
+        # Setting LC_ALL should render the locale coercion ineffective
+        self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
+                                      C_LOCALE_STREAM_ENCODING,
+                                      coerce_c_locale=None,
+                                      LC_ALL="C",
+                                      coercion_expected=False)
+        # And result in a warning about a lack of locale compatibility
+        self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
+                                      C_LOCALE_STREAM_ENCODING,
+                                      coerce_c_locale="warn",
+                                      LC_ALL="C",
+                                      expected_warnings=[LEGACY_LOCALE_WARNING],
+                                      coercion_expected=False)
 
 def test_main():
     test.support.run_unittest(
         LocaleConfigurationTests,
-        LocaleCoercionTests,
-        LocaleWarningTests
+        LocaleCoercionTests
     )
     test.support.reap_children()
 
diff --git a/Modules/main.c b/Modules/main.c
index 94400fe..08b2276 100644
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -105,10 +105,10 @@
 "   predictable seed.\n"
 "PYTHONMALLOC: set the Python memory allocators and/or install debug hooks\n"
 "   on Python memory allocators. Use PYTHONMALLOC=debug to install debug\n"
-"   hooks.\n";
-static const char usage_7[] =
+"   hooks.\n"
 "PYTHONCOERCECLOCALE: if this variable is set to 0, it disables the locale\n"
-"   coercion behavior\n";
+"   coercion behavior. Use PYTHONCOERCECLOCALE=warn to request display of\n"
+"   locale coercion and locale compatibility warnings on stderr.\n";
 
 static int
 usage(int exitcode, const wchar_t* program)
@@ -125,7 +125,6 @@
         fprintf(f, usage_4, (wint_t)DELIM);
         fprintf(f, usage_5, (wint_t)DELIM, PYTHONHOMEHELP);
         fputs(usage_6, f);
-        fputs(usage_7, f);
     }
     return exitcode;
 }
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index b7c9822..953bc90 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -356,6 +356,10 @@
 {
 #ifndef MS_WINDOWS
     /* On non-Windows systems, the C locale is considered a legacy locale */
+    /* XXX (ncoghlan): some platforms (notably Mac OS X) don't appear to treat
+     *                 the POSIX locale as a simple alias for the C locale, so
+     *                 we may also want to check for that explicitly.
+     */
     const char *ctype_loc = setlocale(LC_CTYPE, NULL);
     return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0;
 #else
@@ -364,6 +368,30 @@
 #endif
 }
 
+static const char *_C_LOCALE_WARNING =
+    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
+    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
+    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
+    "locales is recommended.\n";
+
+static int
+_legacy_locale_warnings_enabled(void)
+{
+    const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
+    return (coerce_c_locale != NULL &&
+            strncmp(coerce_c_locale, "warn", 5) == 0);
+}
+
+static void
+_emit_stderr_warning_for_legacy_locale(void)
+{
+    if (_legacy_locale_warnings_enabled()) {
+        if (_Py_LegacyLocaleDetected()) {
+            fprintf(stderr, "%s", _C_LOCALE_WARNING);
+        }
+    }
+}
+
 typedef struct _CandidateLocale {
     const char *locale_name; /* The locale to try as a coercion target */
 } _LocaleCoercionTarget;
@@ -371,10 +399,17 @@
 static _LocaleCoercionTarget _TARGET_LOCALES[] = {
     {"C.UTF-8"},
     {"C.utf8"},
-    {"UTF-8"},
+    /* {"UTF-8"}, */
     {NULL}
 };
 
+/* XXX (ncoghlan): Using UTF-8 as a target locale is currently disabled due to
+ *                 problems encountered on *BSD systems with those test cases
+ * For additional details see:
+ *     nl_langinfo CODESET error: https://bugs.python.org/issue30647
+ *     locale handling differences: https://bugs.python.org/issue30672
+ */
+
 static char *
 get_default_standard_stream_error_handler(void)
 {
@@ -419,7 +454,9 @@
                 "Error setting LC_CTYPE, skipping C locale coercion\n");
         return;
     }
-    fprintf(stderr, _C_LOCALE_COERCION_WARNING, newloc);
+    if (_legacy_locale_warnings_enabled()) {
+        fprintf(stderr, _C_LOCALE_COERCION_WARNING, newloc);
+    }
 
     /* Reconfigure with the overridden environment variables */
     setlocale(LC_ALL, "");
@@ -465,26 +502,6 @@
 }
 
 
-#ifdef PY_WARN_ON_C_LOCALE
-static const char *_C_LOCALE_WARNING =
-    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
-    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
-    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
-    "locales is recommended.\n";
-
-static void
-_emit_stderr_warning_for_c_locale(void)
-{
-    const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
-    if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
-        if (_Py_LegacyLocaleDetected()) {
-            fprintf(stderr, "%s", _C_LOCALE_WARNING);
-        }
-    }
-}
-#endif
-
-
 /* Global initializations.  Can be undone by Py_Finalize().  Don't
    call this twice without an intervening Py_Finalize() call.
 
@@ -561,9 +578,7 @@
        the locale's charset without having to switch
        locales. */
     setlocale(LC_CTYPE, "");
-#ifdef PY_WARN_ON_C_LOCALE
-    _emit_stderr_warning_for_c_locale();
-#endif
+    _emit_stderr_warning_for_legacy_locale();
 #endif
 #endif