bpo-36402: Fix threading._shutdown() race condition (GH-13948)

Fix a race condition at Python shutdown when waiting for threads.
Wait until the Python thread state of all non-daemon threads get
deleted (join all non-daemon threads), rather than just wait until
Python threads complete.

* Add threading._shutdown_locks: set of Thread._tstate_lock locks
  of non-daemon threads used by _shutdown() to wait until all Python
  thread states get deleted. See Thread._set_tstate_lock().
* Add also threading._shutdown_locks_lock to protect access to
  threading._shutdown_locks.
* Add test_finalization_shutdown() test.
diff --git a/Lib/test/test_threading.py b/Lib/test/test_threading.py
index 6ac4ea9..ad90010 100644
--- a/Lib/test/test_threading.py
+++ b/Lib/test/test_threading.py
@@ -583,6 +583,41 @@
         self.assertEqual(data.splitlines(),
                          ["GC: True True True"] * 2)
 
+    def test_finalization_shutdown(self):
+        # bpo-36402: Py_Finalize() calls threading._shutdown() which must wait
+        # until Python thread states of all non-daemon threads get deleted.
+        #
+        # Test similar to SubinterpThreadingTests.test_threads_join_2(), but
+        # test the finalization of the main interpreter.
+        code = """if 1:
+            import os
+            import threading
+            import time
+            import random
+
+            def random_sleep():
+                seconds = random.random() * 0.010
+                time.sleep(seconds)
+
+            class Sleeper:
+                def __del__(self):
+                    random_sleep()
+
+            tls = threading.local()
+
+            def f():
+                # Sleep a bit so that the thread is still running when
+                # Py_Finalize() is called.
+                random_sleep()
+                tls.x = Sleeper()
+                random_sleep()
+
+            threading.Thread(target=f).start()
+            random_sleep()
+        """
+        rc, out, err = assert_python_ok("-c", code)
+        self.assertEqual(err, b"")
+
     def test_tstate_lock(self):
         # Test an implementation detail of Thread objects.
         started = _thread.allocate_lock()
@@ -878,15 +913,22 @@
         self.addCleanup(os.close, w)
         code = r"""if 1:
             import os
+            import random
             import threading
             import time
 
+            def random_sleep():
+                seconds = random.random() * 0.010
+                time.sleep(seconds)
+
             def f():
                 # Sleep a bit so that the thread is still running when
                 # Py_EndInterpreter is called.
-                time.sleep(0.05)
+                random_sleep()
                 os.write(%d, b"x")
+
             threading.Thread(target=f).start()
+            random_sleep()
             """ % (w,)
         ret = test.support.run_in_subinterp(code)
         self.assertEqual(ret, 0)
@@ -903,22 +945,29 @@
         self.addCleanup(os.close, w)
         code = r"""if 1:
             import os
+            import random
             import threading
             import time
 
+            def random_sleep():
+                seconds = random.random() * 0.010
+                time.sleep(seconds)
+
             class Sleeper:
                 def __del__(self):
-                    time.sleep(0.05)
+                    random_sleep()
 
             tls = threading.local()
 
             def f():
                 # Sleep a bit so that the thread is still running when
                 # Py_EndInterpreter is called.
-                time.sleep(0.05)
+                random_sleep()
                 tls.x = Sleeper()
                 os.write(%d, b"x")
+
             threading.Thread(target=f).start()
+            random_sleep()
             """ % (w,)
         ret = test.support.run_in_subinterp(code)
         self.assertEqual(ret, 0)