Issue #18808: Thread.join() now waits for the underlying thread state to be destroyed before returning.
This prevents unpredictable aborts in Py_EndInterpreter() when some non-daemon threads are still running.
diff --git a/Include/pystate.h b/Include/pystate.h
index e41fe4c..ddc6892 100644
--- a/Include/pystate.h
+++ b/Include/pystate.h
@@ -118,6 +118,32 @@
     int trash_delete_nesting;
     PyObject *trash_delete_later;
 
+    /* Called when a thread state is deleted normally, but not when it
+     * is destroyed after fork().
+     * Pain:  to prevent rare but fatal shutdown errors (issue 18808),
+     * Thread.join() must wait for the join'ed thread's tstate to be unlinked
+     * from the tstate chain.  That happens at the end of a thread's life,
+     * in pystate.c.
+     * The obvious way doesn't quite work:  create a lock which the tstate
+     * unlinking code releases, and have Thread.join() wait to acquire that
+     * lock.  The problem is that we _are_ at the end of the thread's life:
+     * if the thread holds the last reference to the lock, decref'ing the
+     * lock will delete the lock, and that may trigger arbitrary Python code
+     * if there's a weakref, with a callback, to the lock.  But by this time
+     * _PyThreadState_Current is already NULL, so only the simplest of C code
+     * can be allowed to run (in particular it must not be possible to
+     * release the GIL).
+     * So instead of holding the lock directly, the tstate holds a weakref to
+     * the lock:  that's the value of on_delete_data below.  Decref'ing a
+     * weakref is harmless.
+     * on_delete points to _threadmodule.c's static release_sentinel() function.
+     * After the tstate is unlinked, release_sentinel is called with the
+     * weakref-to-lock (on_delete_data) argument, and release_sentinel releases
+     * the indirectly held lock.
+     */
+    void (*on_delete)(void *);
+    void *on_delete_data;
+
     /* XXX signal handlers should also be here */
 
 } PyThreadState;