New PyGILState_ API - implements pep 311, from patch 684256.
diff --git a/Include/pystate.h b/Include/pystate.h
index f4c9d6e..c1182a6 100644
--- a/Include/pystate.h
+++ b/Include/pystate.h
@@ -72,6 +72,7 @@
     PyObject *dict;
 
     int tick_counter;
+    int gilstate_counter;
 
     /* XXX signal handlers should also be here */
 
@@ -104,6 +105,51 @@
 #define PyThreadState_GET() (_PyThreadState_Current)
 #endif
 
+typedef 
+    enum {PyGILState_LOCKED, PyGILState_UNLOCKED}
+        PyGILState_STATE;
+
+/* Ensure that the current thread is ready to call the Python
+   C API, regardless of the current state of Python, or of its
+   thread lock.  This may be called as many times as desired
+   by a thread so long as each call is matched with a call to 
+   PyGILState_Release().  In general, other thread-state APIs may 
+   be used between _Ensure() and _Release() calls, so long as the 
+   thread-state is restored to its previous state before the Release().
+   For example, normal use of the Py_BEGIN_ALLOW_THREADS/
+   Py_END_ALLOW_THREADS macros is acceptable.
+
+   The return value is an opaque "handle" to the thread state when
+   PyGILState_Ensure() was called, and must be passed to
+   PyGILState_Release() to ensure Python is left in the same state. Even
+   though recursive calls are allowed, these handles can *not* be shared - 
+   each unique call to PyGILState_Ensure must save the handle for its 
+   call to PyGILState_Release.
+
+   When the function returns, the current thread will hold the GIL.
+
+   Failure is a fatal error.
+*/
+PyAPI_FUNC(PyGILState_STATE) PyGILState_Ensure(void);
+
+/* Release any resources previously acquired.  After this call, Python's
+   state will be the same as it was prior to the corresponding
+   PyGILState_Ensure call (but generally this state will be unknown to 
+   the caller, hence the use of the GILState API.)
+
+   Every call to PyGILState_Ensure must be matched by a call to 
+   PyGILState_Release on the same thread.
+*/
+PyAPI_FUNC(void) PyGILState_Release(PyGILState_STATE);
+
+/* Helper/diagnostic function - get the current thread state for
+   this thread.  May return NULL if no GILState API has been used 
+   on the current thread.  Note the main thread always has such a 
+   thread-state, even if no auto-thread-state call has been made 
+   on the main thread.
+*/
+PyAPI_FUNC(PyThreadState *) PyGILState_GetThisThreadState(void);
+
 /* Routines for advanced debuggers, requested by David Beazley.
    Don't use unless you know what you are doing! */
 PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Head(void);
diff --git a/Include/pythread.h b/Include/pythread.h
index 8a3bf26..0fa8db0 100644
--- a/Include/pythread.h
+++ b/Include/pythread.h
@@ -30,10 +30,12 @@
 PyAPI_FUNC(void) PyThread__PyThread_exit_prog(int);
 #endif
 
+/* Thread Local Storage (TLS) API */
 PyAPI_FUNC(int) PyThread_create_key(void);
 PyAPI_FUNC(void) PyThread_delete_key(int);
 PyAPI_FUNC(int) PyThread_set_key_value(int, void *);
 PyAPI_FUNC(void *) PyThread_get_key_value(int);
+PyAPI_FUNC(void) PyThread_delete_key_value(int key);
 
 #ifdef __cplusplus
 }
diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py
index fc31760..57cc9b8 100644
--- a/Lib/test/test_capi.py
+++ b/Lib/test/test_capi.py
@@ -14,3 +14,32 @@
             test()
         except _testcapi.error:
             raise test_support.TestFailed, sys.exc_info()[1]
+
+# some extra thread-state tests driven via _testcapi
+def TestThreadState():
+    import thread
+    import time
+
+    if test_support.verbose:
+        print "auto-thread-state"
+
+    idents = []  # thread ident recorded by every callback invocation
+
+    def callback():
+        idents.append(thread.get_ident())
+    # The C helper calls back on this thread and on threads it spawns.
+    _testcapi._test_thread_state(callback)
+    time.sleep(1)  # crude wait for the spawned threads to run callback
+    # The helper invokes callback 3 times directly on the calling thread.
+    if idents.count(thread.get_ident()) != 3:
+        raise test_support.TestFailed, \
+              "Couldn't find main thread correctly in the list"
+
+try:
+    _testcapi._test_thread_state
+    have_thread_state = True
+except AttributeError:
+    have_thread_state = False
+    
+if have_thread_state:
+    TestThreadState()
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index b3da398..b52e965 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -7,6 +7,10 @@
 
 #include "Python.h"
 
+#ifdef WITH_THREAD
+#include "pythread.h"
+#endif /* WITH_THREAD */
+
 static PyObject *TestError;	/* set to exception object in init */
 
 /* Raise TestError with test_name + ": " + msg, and return NULL. */
@@ -535,6 +539,46 @@
 	return NULL;
 }
 
+#ifdef WITH_THREAD
+
+static void _make_call(void *callable) /* also used as a thread entry point */
+{
+	PyObject *rc;
+	PyGILState_STATE s = PyGILState_Ensure(); /* caller may or may not hold the GIL */
+	rc = PyObject_CallFunction(callable, "");
+	Py_XDECREF(rc); /* rc is NULL if the call raised */
+	PyGILState_Release(s);
+}
+
+static PyObject *
+test_thread_state(PyObject *self, PyObject *args)
+{
+	PyObject *fn; /* the callable to invoke via _make_call() */
+	if (!PyArg_ParseTuple(args, "O:test_thread_state", &fn))
+		return NULL;
+	/* Ensure Python is set up for threading */
+	PyEval_InitThreads();
+	/* Start a new thread for our callback. */
+	PyThread_start_new_thread( _make_call, fn);
+	/* Make the callback with the thread lock held by this thread */
+	_make_call(fn);
+	/* Do it all again, but this time with the thread-lock released */
+	Py_BEGIN_ALLOW_THREADS
+	_make_call(fn);
+	Py_END_ALLOW_THREADS
+	/* And once more, both from a new thread and from this one, with the
+	   lock released.  XXX - should use a lock to wait for the spawned
+	   threads.  NOTE(review): fn is handed to them without an extra
+	   reference - confirm the caller keeps it alive until they finish. */
+	Py_BEGIN_ALLOW_THREADS
+	PyThread_start_new_thread( _make_call, fn);
+	_make_call(fn);
+	Py_END_ALLOW_THREADS
+	Py_INCREF(Py_None);
+	return Py_None;
+}
+#endif
+
 static PyMethodDef TestMethods[] = {
 	{"raise_exception",	raise_exception,		 METH_VARARGS},
 	{"test_config",		(PyCFunction)test_config,	 METH_NOARGS},
@@ -554,6 +598,9 @@
 #ifdef Py_USING_UNICODE
 	{"test_u_code",		(PyCFunction)test_u_code,	 METH_NOARGS},
 #endif
+#ifdef WITH_THREAD
+	{"_test_thread_state", (PyCFunction)test_thread_state, METH_VARARGS},
+#endif
 	{NULL, NULL} /* sentinel */
 };
 
diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c
index 9bf52f3..d34756a 100644
--- a/Modules/posixmodule.c
+++ b/Modules/posixmodule.c
@@ -4354,22 +4354,11 @@
  * exit code as the result of the close() operation.  This permits the
  * files to be closed in any order - it is always the close() of the
  * final handle that will return the exit code.
+ *
+ * NOTE: This function is currently called with the GIL released,
+ * hence we use the GILState API to manage our state.
  */
 
- /* RED_FLAG 31-Aug-2000 Tim
-  * This is always called (today!) between a pair of
-  * Py_BEGIN_ALLOW_THREADS/ Py_END_ALLOW_THREADS
-  * macros.  So the thread running this has no valid thread state, as
-  * far as Python is concerned.  However, this calls some Python API
-  * functions that cannot be called safely without a valid thread
-  * state, in particular PyDict_GetItem.
-  * As a temporary hack (although it may last for years ...), we
-  * *rely* on not having a valid thread state in this function, in
-  * order to create our own "from scratch".
-  * This will deadlock if _PyPclose is ever called by a thread
-  * holding the global lock.
-  */
-
 static int _PyPclose(FILE *file)
 {
 	int result;
@@ -4378,40 +4367,16 @@
 	PyObject *procObj, *hProcessObj, *intObj, *fileObj;
 	long file_count;
 #ifdef WITH_THREAD
-	PyInterpreterState* pInterpreterState;
-	PyThreadState* pThreadState;
+	PyGILState_STATE state;
 #endif
 
 	/* Close the file handle first, to ensure it can't block the
 	 * child from exiting if it's the last handle.
 	 */
 	result = fclose(file);
-
 #ifdef WITH_THREAD
-	/* Bootstrap a valid thread state into existence. */
-	pInterpreterState = PyInterpreterState_New();
-	if (!pInterpreterState) {
-		/* Well, we're hosed now!  We don't have a thread
-		 * state, so can't call a nice error routine, or raise
-		 * an exception.  Just die.
-		 */
-		 Py_FatalError("unable to allocate interpreter state "
-		 	       "when closing popen object");
-		 return -1;  /* unreachable */
-	}
-	pThreadState = PyThreadState_New(pInterpreterState);
-	if (!pThreadState) {
-		 Py_FatalError("unable to allocate thread state "
-		 	       "when closing popen object");
-		 return -1;  /* unreachable */
-	}
-	/* Grab the global lock.  Note that this will deadlock if the
-	 * current thread already has the lock! (see RED_FLAG comments
-	 * before this function)
-	 */
-	PyEval_RestoreThread(pThreadState);
+	state = PyGILState_Ensure();
 #endif
-
 	if (_PyPopenProcs) {
 		if ((fileObj = PyLong_FromVoidPtr(file)) != NULL &&
 		    (procObj = PyDict_GetItem(_PyPopenProcs,
@@ -4470,17 +4435,8 @@
 	} /* if _PyPopenProcs */
 
 #ifdef WITH_THREAD
-	/* Tear down the thread & interpreter states.
-	 * Note that interpreter state clear & delete functions automatically
-	 * call the thread clear & delete functions, and indeed insist on
-	 * doing that themselves.  The lock must be held during the clear, but
-	 * need not be held during the delete.
-	 */
-	PyInterpreterState_Clear(pInterpreterState);
-	PyEval_ReleaseThread(pThreadState);
-	PyInterpreterState_Delete(pInterpreterState);
+	PyGILState_Release(state);
 #endif
-
 	return result;
 }
 
diff --git a/Python/ceval.c b/Python/ceval.c
index 080b3c1..3ea1bdc 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -321,6 +321,8 @@
 {
 	if (tstate == NULL)
 		Py_FatalError("PyEval_AcquireThread: NULL new thread state");
+	/* Check someone has called PyEval_InitThreads() to create the lock */
+	assert(interpreter_lock);
 	PyThread_acquire_lock(interpreter_lock, 1);
 	if (PyThreadState_Swap(tstate) != NULL)
 		Py_FatalError(
diff --git a/Python/pystate.c b/Python/pystate.c
index 62bf09b..8faf6a0 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -142,6 +142,7 @@
 		tstate->tracing = 0;
 		tstate->use_tracing = 0;
 		tstate->tick_counter = 0;
+		tstate->gilstate_counter = 0;
 
 		tstate->dict = NULL;
 
@@ -259,7 +260,17 @@
 	PyThreadState *old = _PyThreadState_Current;
 
 	_PyThreadState_Current = new;
-
+	/* It should not be possible for more than one thread state
+	   to be used for a thread.  Check this the best we can in debug 
+	   builds.  PyGILState_* is only compiled in WITH_THREAD builds.
+	*/
+#if defined(Py_DEBUG) && defined(WITH_THREAD)
+	if (new) {
+		PyThreadState *check = PyGILState_GetThisThreadState();
+		if (check && check != new)
+			Py_FatalError("Invalid thread state for this thread");
+	}
+#endif
 	return old;
 }
 
@@ -308,3 +319,131 @@
 PyThreadState_Next(PyThreadState *tstate) {
 	return tstate->next;
 }
+
+/* Python "auto thread state" API. */
+#ifdef WITH_THREAD
+
+/* Keep this as a static, as it is not reliable!  It can only
+   ever be compared to the state for the *current* thread.
+   * If not equal, then it doesn't matter that the actual
+     value may change immediately after comparison, as it can't
+     possibly change to the current thread's state.
+   * If equal, then the current thread holds the lock, so the value can't
+     change until we yield the lock.
+*/
+static int
+PyThreadState_IsCurrent(PyThreadState *tstate)
+{
+	/* Must be the tstate for this thread */
+	assert(PyGILState_GetThisThreadState()==tstate);
+	/* On Windows at least, simple reads and writes to 32 bit values
+	   are atomic.
+	*/
+	return tstate == _PyThreadState_Current;
+}
+
+/* The single PyInterpreterState used by this process'
+   GILState implementation
+*/
+static PyInterpreterState *autoInterpreterState = NULL;
+static int autoTLSkey = 0;
+
+/* Internal initialization/finalization functions called by 
+   Py_Initialize/Py_Finalize 
+*/
+void _PyGILState_Init(PyInterpreterState *i, PyThreadState *t)
+{
+	assert(i && t); /* must init with valid states */
+	autoTLSkey = PyThread_create_key();
+	autoInterpreterState = i;
+	if (PyThread_set_key_value(autoTLSkey, (void *)t) < 0) /* stash t in TLS */
+		Py_FatalError("Couldn't create autoTLSkey mapping");
+	assert(t->gilstate_counter==0); /* must be a new thread state */
+	t->gilstate_counter = 1;
+}
+
+void _PyGILState_Fini(void)
+{
+	PyThread_delete_key(autoTLSkey);
+	autoTLSkey = 0; /* 0 means "no key" to PyGILState_GetThisThreadState() */
+	autoInterpreterState = NULL;
+}
+
+/* The public functions */
+PyThreadState *PyGILState_GetThisThreadState(void)
+{
+	if (autoInterpreterState==NULL || autoTLSkey==0)
+		return NULL; /* GILState API not initialized (or already finalized) */
+	return (PyThreadState *) PyThread_get_key_value(autoTLSkey);
+}
+
+PyGILState_STATE PyGILState_Ensure(void)
+{
+	int current;
+	PyThreadState *tcur;
+	/* Note that we do not auto-init Python here - apart from 
+	   potential races with 2 threads auto-initializing, pep-311 
+	   spells out other issues.  Embedders are expected to have
+	   called Py_Initialize() and usually PyEval_InitThreads().
+	*/
+	assert(autoInterpreterState); /* Py_Initialize() hasn't been called! */
+	tcur = PyThread_get_key_value(autoTLSkey);
+	if (tcur==NULL) { /* no state recorded for this thread: create one */
+		tcur = PyThreadState_New(autoInterpreterState);
+		if (tcur==NULL)
+			Py_FatalError("Couldn't create thread-state for new thread");
+		if (PyThread_set_key_value(autoTLSkey, (void *)tcur) < 0)
+			Py_FatalError("Couldn't create autoTLSkey mapping");
+		current = 0; /* new thread state is never current */
+	} else
+		current = PyThreadState_IsCurrent(tcur);
+	if (!current)
+		PyEval_RestoreThread(tcur);
+	/* Update our counter in the thread-state - no need for locks:
+	   - tcur will remain valid as we hold the GIL.
+	   - the counter is safe as we are the only thread "allowed" 
+	     to modify this value
+	*/
+	tcur->gilstate_counter++;
+	return current ? PyGILState_LOCKED : PyGILState_UNLOCKED;
+}
+
+void PyGILState_Release(PyGILState_STATE oldstate)
+{
+	PyThreadState *tcur = PyThread_get_key_value(autoTLSkey);
+	if (tcur==NULL)
+		Py_FatalError("auto-releasing thread-state, "
+		              "but no thread-state for this thread");
+	/* We must hold the GIL and have our thread state current */
+	/* XXX - remove the check - the assert should be fine,
+	   but while this is very new (April 2003), the extra check 
+	   by release-only users can't hurt.
+	*/
+	if (!PyThreadState_IsCurrent(tcur))
+		Py_FatalError("This thread state must be current when releasing");
+	assert (PyThreadState_IsCurrent(tcur));
+	tcur->gilstate_counter -= 1;
+	assert (tcur->gilstate_counter >= 0); /* more Release than Ensure calls */
+
+	/* If we are about to destroy this thread-state, we must 
+	   clear it while the lock is held, as destructors may run
+	*/
+	if (tcur->gilstate_counter==0) {
+		/* can't have been locked when we created it */
+		assert(oldstate==PyGILState_UNLOCKED);
+		PyThreadState_Clear(tcur);
+	}
+
+	/* Release the GIL only if the matching Ensure() had to acquire it */
+	if (oldstate==PyGILState_UNLOCKED)
+		PyEval_ReleaseThread(tcur);
+
+	/* Now complete destruction of the thread-state if necessary */
+	if (tcur->gilstate_counter==0) {
+		/* Remove this thread's entry from our TLS */
+		PyThread_delete_key_value(autoTLSkey);
+		/* Delete the thread-state */
+		PyThreadState_Delete(tcur);
+	}
+}
+#endif /* WITH_THREAD */
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index 0a9a637..29ba120 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -50,6 +50,11 @@
 extern void _PyUnicode_Init(void);
 extern void _PyUnicode_Fini(void);
 
+#ifdef WITH_THREAD
+extern void _PyGILState_Init(PyInterpreterState *, PyThreadState *);
+extern void _PyGILState_Fini(void);
+#endif /* WITH_THREAD */
+
 int Py_DebugFlag; /* Needed by parser.c */
 int Py_VerboseFlag; /* Needed by import.c */
 int Py_InteractiveFlag; /* Needed by Py_FdIsInteractive() below */
@@ -180,6 +185,11 @@
 	if (!Py_NoSiteFlag)
 		initsite(); /* Module site */
 
+	/* auto-thread-state API, if available */
+#ifdef WITH_THREAD
+	_PyGILState_Init(interp, tstate);
+#endif /* WITH_THREAD */
+
 	PyModule_WarningsModule = PyImport_ImportModule("warnings");
 
 #if defined(Py_USING_UNICODE) && defined(HAVE_LANGINFO_H) && defined(CODESET)
@@ -244,6 +254,11 @@
 	call_sys_exitfunc();
 	initialized = 0;
 
+	/* Cleanup auto-thread-state */
+#ifdef WITH_THREAD
+	_PyGILState_Fini();
+#endif /* WITH_THREAD */
+
 	/* Get current thread state and interpreter pointer */
 	tstate = PyThreadState_Get();
 	interp = tstate->interp;
diff --git a/Python/thread.c b/Python/thread.c
index 819186c..87230e0 100644
--- a/Python/thread.c
+++ b/Python/thread.c
@@ -137,3 +137,111 @@
 #include "thread_foobar.h"
 #endif
 */
+
+#ifndef Py_HAVE_NATIVE_TLS
+/* If the platform has not supplied a platform specific
+   TLS implementation, provide our own.
+
+   This code stolen from "thread_sgi.h", where it was the only
+   implementation of an existing Python TLS API.
+*/
+/*
+ * Per-thread data ("key") support.
+ */
+
+struct key {
+	struct key *next;
+	long id;
+	int key;
+	void *value;
+};
+
+static struct key *keyhead = NULL;
+static int nkeys = 0;
+static PyThread_type_lock keymutex = NULL;
+
+static struct key *find_key(int key, void *value)
+{
+	struct key *p;
+	long id = PyThread_get_thread_ident();
+	/* Lock the whole walk: other threads insert/free nodes concurrently */
+	PyThread_acquire_lock(keymutex, 1); /* made by PyThread_create_key() */
+	for (p = keyhead; p != NULL; p = p->next)
+		if (p->id == id && p->key == key)
+			break;
+	if (p == NULL && value != NULL) { /* NULL value means lookup only */
+		p = (struct key *)malloc(sizeof(struct key));
+		if (p != NULL) {
+			p->id = id;
+			p->key = key;
+			p->value = value;
+			p->next = keyhead;
+			keyhead = p;
+		}
+	}
+	PyThread_release_lock(keymutex);
+	return p;
+}
+
+int PyThread_create_key(void)
+{
+	if (keymutex == NULL)
+		keymutex = PyThread_allocate_lock();
+	return ++nkeys;
+}
+
+void PyThread_delete_key(int key)
+{
+	struct key *p, **q;
+	PyThread_acquire_lock(keymutex, 1);
+	q = &keyhead;
+	while ((p = *q) != NULL) {
+		if (p->key == key) {
+			*q = p->next;
+			free((void *)p);
+			/* NB This does *not* free p->value! */
+		}
+		else
+			q = &p->next;
+	}
+	PyThread_release_lock(keymutex);
+}
+
+int PyThread_set_key_value(int key, void *value)
+{
+	struct key *p = find_key(key, value);
+	if (p == NULL)
+		return -1;
+	else
+		return 0;
+}
+
+void *PyThread_get_key_value(int key)
+{
+	struct key *p = find_key(key, NULL);
+	if (p == NULL)
+		return NULL;
+	else
+		return p->value;
+}
+
+void PyThread_delete_key_value(int key)
+{
+	long id = PyThread_get_thread_ident();
+	struct key *p, **q;
+	PyThread_acquire_lock(keymutex, 1);
+	q = &keyhead;
+	while ((p = *q) != NULL) {
+		if (p->key == key && p->id == id) {
+			*q = p->next;
+			free((void *)p);
+			/* NB This does *not* free p->value! */
+			break;
+		}
+		else
+			q = &p->next;
+	}
+	PyThread_release_lock(keymutex);
+}
+
+#endif /* Py_HAVE_NATIVE_TLS */
diff --git a/Python/thread_sgi.h b/Python/thread_sgi.h
index 0fceb21..e246538 100644
--- a/Python/thread_sgi.h
+++ b/Python/thread_sgi.h
@@ -377,83 +377,3 @@
 	if (usunsetlock((ulock_t) lock) < 0)
 		perror("usunsetlock");
 }
-
-/*
- * Per-thread data ("key") support.
- */
-
-struct key {
-	struct key *next;
-	long id;
-	int key;
-	void *value;
-};
-
-static struct key *keyhead = NULL;
-static int nkeys = 0;
-static PyThread_type_lock keymutex = NULL;
-
-static struct key *find_key(int key, void *value)
-{
-	struct key *p;
-	long id = PyThread_get_thread_ident();
-	for (p = keyhead; p != NULL; p = p->next) {
-		if (p->id == id && p->key == key)
-			return p;
-	}
-	if (value == NULL)
-		return NULL;
-	p = (struct key *)malloc(sizeof(struct key));
-	if (p != NULL) {
-		p->id = id;
-		p->key = key;
-		p->value = value;
-		PyThread_acquire_lock(keymutex, 1);
-		p->next = keyhead;
-		keyhead = p;
-		PyThread_release_lock(keymutex);
-	}
-	return p;
-}
-
-int PyThread_create_key(void)
-{
-	if (keymutex == NULL)
-		keymutex = PyThread_allocate_lock();
-	return ++nkeys;
-}
-
-void PyThread_delete_key(int key)
-{
-	struct key *p, **q;
-	PyThread_acquire_lock(keymutex, 1);
-	q = &keyhead;
-	while ((p = *q) != NULL) {
-		if (p->key == key) {
-			*q = p->next;
-			free((void *)p);
-			/* NB This does *not* free p->value! */
-		}
-		else
-			q = &p->next;
-	}
-	PyThread_release_lock(keymutex);
-}
-
-int PyThread_set_key_value(int key, void *value)
-{
-	struct key *p = find_key(key, value);
-	if (p == NULL)
-		return -1;
-	else
-		return 0;
-}
-
-void *PyThread_get_key_value(int key)
-{
-	struct key *p = find_key(key, NULL);
-	if (p == NULL)
-		return NULL;
-	else
-		return p->value;
-}