New PyGILState_ API - implements PEP 311, from patch 684256.
diff --git a/Python/ceval.c b/Python/ceval.c
index 080b3c1..3ea1bdc 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -321,6 +321,8 @@
 {
 	if (tstate == NULL)
 		Py_FatalError("PyEval_AcquireThread: NULL new thread state");
+	/* Check someone has called PyEval_InitThreads() to create the lock */
+	assert(interpreter_lock);
 	PyThread_acquire_lock(interpreter_lock, 1);
 	if (PyThreadState_Swap(tstate) != NULL)
 		Py_FatalError(
diff --git a/Python/pystate.c b/Python/pystate.c
index 62bf09b..8faf6a0 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -142,6 +142,7 @@
 		tstate->tracing = 0;
 		tstate->use_tracing = 0;
 		tstate->tick_counter = 0;
+		tstate->gilstate_counter = 0;
 
 		tstate->dict = NULL;
 
@@ -259,7 +260,17 @@
 	PyThreadState *old = _PyThreadState_Current;
 
 	_PyThreadState_Current = new;
-
+	/* It should not be possible for more than one thread state
+	   to be used for a thread.  Check this the best we can in debug
+	   builds.  PyGILState_GetThisThreadState() is only defined when
+	   WITH_THREAD is, so the check must be guarded by both macros. */
+#if defined(Py_DEBUG) && defined(WITH_THREAD)
+	if (new) {
+		PyThreadState *check = PyGILState_GetThisThreadState();
+		if (check && check != new)
+			Py_FatalError("Invalid thread state for this thread");
+	}
+#endif
 	return old;
 }
 
@@ -308,3 +319,131 @@
 PyThreadState_Next(PyThreadState *tstate) {
 	return tstate->next;
 }
+
+/* Python "auto thread state" API. */
+#ifdef WITH_THREAD
+
+/* Return non-zero if tstate is the globally current thread state.
+   Keep this static, as it is not reliable in general: tstate must
+   be the state that belongs to the *calling* thread.
+   * If not equal, it doesn't matter that _PyThreadState_Current may
+     change right after the comparison, as it can't possibly change
+     to the calling thread's state.
+   * If equal, the calling thread holds the lock, so the value can't
+     change until we yield the lock.
+*/
+static int
+PyThreadState_IsCurrent(PyThreadState *tstate)
+{
+	/* Must be the tstate for this thread */
+	assert(PyGILState_GetThisThreadState()==tstate);
+	/* Unlocked read: on Windows at least, simple reads and writes to
+	   32 bit values are atomic.  NOTE(review): confirm elsewhere. */
+	return tstate == _PyThreadState_Current;
+}
+
+/* The single PyInterpreterState used by this process'
+   GILState implementation
+*/
+static PyInterpreterState *autoInterpreterState = NULL;
+static int autoTLSkey = 0;
+
+/* Internal initialization/finalization functions called by
+   Py_Initialize/Py_Finalize */
+void _PyGILState_Init(PyInterpreterState *i, PyThreadState *t)
+{
+	assert(i && t); /* must init with valid states */
+	autoTLSkey = PyThread_create_key();
+	autoInterpreterState = i;
+	/* Now stash the thread state for this thread in TLS */
+	if (PyThread_set_key_value(autoTLSkey, (void *)t) < 0)
+		Py_FatalError("Couldn't create autoTLSkey mapping");
+	assert(t->gilstate_counter==0); /* must be a new thread state */
+	t->gilstate_counter = 1; /* so Ensure/Release pairs never reach 0 */
+}
+
+void _PyGILState_Fini(void)
+{
+	PyThread_delete_key(autoTLSkey); /* frees the nodes, not the values */
+	autoTLSkey = 0; /* 0 is tested as "no key" by the public functions */
+	autoInterpreterState = NULL;
+}
+
+/* The public functions.  This one: the calling thread's stored state. */
+PyThreadState *PyGILState_GetThisThreadState(void)
+{
+	if (autoInterpreterState==NULL || autoTLSkey==0)
+		return NULL; /* _PyGILState_Init() not run, or already undone */
+	return (PyThreadState *) PyThread_get_key_value(autoTLSkey);
+}
+
+PyGILState_STATE PyGILState_Ensure(void)
+{
+	int current;
+	PyThreadState *tcur;
+	/* Note that we do not auto-init Python here - apart from
+	   potential races with 2 threads auto-initializing, PEP 311
+	   spells out other issues.  Embedders are expected to have
+	   called Py_Initialize() and usually PyEval_InitThreads(). */
+	assert(autoInterpreterState); /* Py_Initialize() hasn't been called! */
+	tcur = PyThread_get_key_value(autoTLSkey);
+	if (tcur==NULL) {
+		/* Create a new thread state for this thread */
+		tcur = PyThreadState_New(autoInterpreterState);
+		if (tcur==NULL)
+			Py_FatalError("Couldn't create thread-state for new thread");
+		/* Without the TLS entry, Release could not find this state */
+		if (PyThread_set_key_value(autoTLSkey, (void *)tcur) < 0)
+			Py_FatalError("Couldn't create autoTLSkey mapping");
+		current = 0; /* new thread state is never current */
+	} else
+		current = PyThreadState_IsCurrent(tcur);
+	if (!current)
+		PyEval_RestoreThread(tcur);
+	/* Update our counter in the thread-state - no need for locks:
+	   - tcur will remain valid as we hold the GIL.
+	   - the counter is safe as we are the only thread "allowed"
+	     to modify this value */
+	tcur->gilstate_counter++;
+	return current ? PyGILState_LOCKED : PyGILState_UNLOCKED;
+}
+
+void PyGILState_Release(PyGILState_STATE oldstate)
+{
+	PyThreadState *tcur = PyThread_get_key_value(autoTLSkey);
+	if (tcur==NULL)
+		Py_FatalError("auto-releasing thread-state, "
+		              "but no thread-state for this thread");
+	/* We must hold the GIL and have our thread state current */
+	/* XXX - the run-time check below duplicates the assert and
+	   could be removed, but while this API is very new (April 2003)
+	   the extra check for users of release builds can't hurt.
+	*/
+	if (!PyThreadState_IsCurrent(tcur))
+		Py_FatalError("This thread state must be current when releasing");
+	assert (PyThreadState_IsCurrent(tcur));
+	tcur->gilstate_counter -= 1;
+	assert (tcur->gilstate_counter >= 0); /* illegal counter value */
+
+	/* A counter of zero means this thread-state is about to be
+	   destroyed; clear it while the lock is held, as destructors
+	   may run */
+	if (tcur->gilstate_counter==0) {
+		/* can't have been locked when we created it */
+		assert(oldstate==PyGILState_UNLOCKED);
+		PyThreadState_Clear(tcur);
+	}
+
+	/* Release the lock if the matching Ensure had to take it */
+	if (oldstate==PyGILState_UNLOCKED)
+		PyEval_ReleaseThread(tcur);
+
+	/* Finish destruction.  NOTE(review): lock was released above */
+	if (tcur->gilstate_counter==0) {
+		/* Delete this thread from our TLS */
+		PyThread_delete_key_value(autoTLSkey);
+		/* Delete the thread-state */
+		PyThreadState_Delete(tcur);
+	}
+}
+#endif /* WITH_THREAD */
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index 0a9a637..29ba120 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -50,6 +50,11 @@
 extern void _PyUnicode_Init(void);
 extern void _PyUnicode_Fini(void);
 
+#ifdef WITH_THREAD
+extern void _PyGILState_Init(PyInterpreterState *, PyThreadState *);
+extern void _PyGILState_Fini(void);
+#endif /* WITH_THREAD */
+
 int Py_DebugFlag; /* Needed by parser.c */
 int Py_VerboseFlag; /* Needed by import.c */
 int Py_InteractiveFlag; /* Needed by Py_FdIsInteractive() below */
@@ -180,6 +185,11 @@
 	if (!Py_NoSiteFlag)
 		initsite(); /* Module site */
 
+	/* auto-thread-state API, if available */
+#ifdef WITH_THREAD
+	_PyGILState_Init(interp, tstate);
+#endif /* WITH_THREAD */
+
 	PyModule_WarningsModule = PyImport_ImportModule("warnings");
 
 #if defined(Py_USING_UNICODE) && defined(HAVE_LANGINFO_H) && defined(CODESET)
@@ -244,6 +254,11 @@
 	call_sys_exitfunc();
 	initialized = 0;
 
+	/* Cleanup auto-thread-state */
+#ifdef WITH_THREAD
+	_PyGILState_Fini();
+#endif /* WITH_THREAD */
+
 	/* Get current thread state and interpreter pointer */
 	tstate = PyThreadState_Get();
 	interp = tstate->interp;
diff --git a/Python/thread.c b/Python/thread.c
index 819186c..87230e0 100644
--- a/Python/thread.c
+++ b/Python/thread.c
@@ -137,3 +137,111 @@
 #include "thread_foobar.h"
 #endif
 */
+
+#ifndef Py_HAVE_NATIVE_TLS
+/* If the platform has not supplied a platform specific
+   TLS implementation, provide our own.
+
+   This code stolen from "thread_sgi.h", where it was the only
+   implementation of an existing Python TLS API.
+*/
+/*
+ * Per-thread data ("key") support.
+ */
+
+struct key {
+	struct key *next;	/* next node in the single list headed by keyhead */
+	long id;		/* owning thread: PyThread_get_thread_ident() */
+	int key;		/* key number from PyThread_create_key() */
+	void *value;		/* caller's pointer; never freed by this code */
+};
+
+static struct key *keyhead = NULL;	/* all (thread, key) nodes */
+static int nkeys = 0;			/* last key number handed out */
+static PyThread_type_lock keymutex = NULL; /* held to link/unlink nodes */
+
+/* Find this thread's node for key; if absent and value != NULL, add it. */
+static struct key *find_key(int key, void *value)
+{
+	struct key *p;
+	long id = PyThread_get_thread_ident();
+	if (keymutex == NULL)
+		return NULL; /* no key created yet, so nothing is stored */
+	/* Hold keymutex: other threads may unlink/free nodes meanwhile */
+	PyThread_acquire_lock(keymutex, 1);
+	for (p = keyhead; p != NULL; p = p->next)
+		if (p->id == id && p->key == key)
+			break;
+	if (p == NULL && value != NULL && (p = malloc(sizeof *p)) != NULL) {
+		p->id = id;
+		p->key = key;
+		p->value = value;
+		p->next = keyhead;
+		keyhead = p;
+	}
+	PyThread_release_lock(keymutex);
+	return p;
+}
+
+int PyThread_create_key(void)
+{
+	if (keymutex == NULL) /* NOTE(review): unsynchronized lazy init */
+		keymutex = PyThread_allocate_lock(); /* result is not checked */
+	return ++nkeys; /* first key is 1, so 0 is never a valid key */
+}
+
+void PyThread_delete_key(int key)
+{
+	struct key *p, **q; /* q: the link that currently leads to p */
+	PyThread_acquire_lock(keymutex, 1);
+	q = &keyhead;
+	while ((p = *q) != NULL) {
+		if (p->key == key) { /* any thread's node for this key */
+			*q = p->next; /* unlink; q now leads to the successor */
+			free((void *)p);
+			/* NB This does *not* free p->value! */
+		}
+		else
+			q = &p->next;
+	}
+	PyThread_release_lock(keymutex);
+}
+
+int PyThread_set_key_value(int key, void *value)
+{
+	struct key *p = find_key(key, value); /* existing node is kept as-is */
+	if (p == NULL)
+		return -1; /* no node for this thread and none was created */
+	else
+		return 0; /* node exists; an older value is NOT replaced */
+}
+
+void *PyThread_get_key_value(int key)
+{
+	struct key *p = find_key(key, NULL); /* NULL value: look up only */
+	if (p == NULL)
+		return NULL; /* nothing stored by this thread under key */
+	else
+		return p->value;
+}
+
+void PyThread_delete_key_value(int key)
+{
+	long id = PyThread_get_thread_ident(); /* only our own node goes */
+	struct key *p, **q; /* q: the link that currently leads to p */
+	PyThread_acquire_lock(keymutex, 1);
+	q = &keyhead;
+	while ((p = *q) != NULL) {
+		if (p->key == key && p->id == id) {
+			*q = p->next; /* unlink before freeing */
+			free((void *)p);
+			/* NB This does *not* free p->value! */
+			break; /* stop at the first match */
+		}
+		else
+			q = &p->next;
+	}
+	PyThread_release_lock(keymutex);
+}
+
+#endif /* Py_HAVE_NATIVE_TLS */
diff --git a/Python/thread_sgi.h b/Python/thread_sgi.h
index 0fceb21..e246538 100644
--- a/Python/thread_sgi.h
+++ b/Python/thread_sgi.h
@@ -377,83 +377,3 @@
 	if (usunsetlock((ulock_t) lock) < 0)
 		perror("usunsetlock");
 }
-
-/*
- * Per-thread data ("key") support.
- */
-
-struct key {
-	struct key *next;
-	long id;
-	int key;
-	void *value;
-};
-
-static struct key *keyhead = NULL;
-static int nkeys = 0;
-static PyThread_type_lock keymutex = NULL;
-
-static struct key *find_key(int key, void *value)
-{
-	struct key *p;
-	long id = PyThread_get_thread_ident();
-	for (p = keyhead; p != NULL; p = p->next) {
-		if (p->id == id && p->key == key)
-			return p;
-	}
-	if (value == NULL)
-		return NULL;
-	p = (struct key *)malloc(sizeof(struct key));
-	if (p != NULL) {
-		p->id = id;
-		p->key = key;
-		p->value = value;
-		PyThread_acquire_lock(keymutex, 1);
-		p->next = keyhead;
-		keyhead = p;
-		PyThread_release_lock(keymutex);
-	}
-	return p;
-}
-
-int PyThread_create_key(void)
-{
-	if (keymutex == NULL)
-		keymutex = PyThread_allocate_lock();
-	return ++nkeys;
-}
-
-void PyThread_delete_key(int key)
-{
-	struct key *p, **q;
-	PyThread_acquire_lock(keymutex, 1);
-	q = &keyhead;
-	while ((p = *q) != NULL) {
-		if (p->key == key) {
-			*q = p->next;
-			free((void *)p);
-			/* NB This does *not* free p->value! */
-		}
-		else
-			q = &p->next;
-	}
-	PyThread_release_lock(keymutex);
-}
-
-int PyThread_set_key_value(int key, void *value)
-{
-	struct key *p = find_key(key, value);
-	if (p == NULL)
-		return -1;
-	else
-		return 0;
-}
-
-void *PyThread_get_key_value(int key)
-{
-	struct key *p = find_key(key, NULL);
-	if (p == NULL)
-		return NULL;
-	else
-		return p->value;
-}