- pythonrun.c, Py_Finalize(): move the call to _Py_PrintReferences()
  even farther down, to just before the call to
  _PyObject_DebugMallocStats().  This required the following changes:

- pystate.c, PyThreadState_GetDict(): changed so that it never raises
  an exception or issues a fatal error.  When no current thread state
  is available, or the per-thread dict cannot be created, it simply
  returns NULL with no exception set.

- object.c, Py_ReprEnter(): when PyThreadState_GetDict() returns NULL,
  don't raise an exception but return 0.  This means that when
  printing a container that's recursive, printing will go on and on
  and on.  But that shouldn't happen in the case we care about (see
  first bullet).

- Updated Misc/NEWS and Doc/api/init.tex to reflect changes to
  PyThreadState_GetDict() definition.
diff --git a/Python/pystate.c b/Python/pystate.c
index 1139851..62bf09b 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -266,17 +266,21 @@
 /* An extension mechanism to store arbitrary additional per-thread state.
    PyThreadState_GetDict() returns a dictionary that can be used to hold such
    state; the caller should pick a unique key and store its state there.  If
-   PyThreadState_GetDict() returns NULL, an exception has been raised (most
-   likely MemoryError) and the caller should pass on the exception. */
+   PyThreadState_GetDict() returns NULL, an exception has *not* been raised
+   and the caller should assume no per-thread state is available. */
 
 PyObject *
 PyThreadState_GetDict(void)
 {
 	if (_PyThreadState_Current == NULL)
-		Py_FatalError("PyThreadState_GetDict: no current thread");
+		return NULL;
 
-	if (_PyThreadState_Current->dict == NULL)
-		_PyThreadState_Current->dict = PyDict_New();
+	if (_PyThreadState_Current->dict == NULL) {
+		PyObject *d;
+		_PyThreadState_Current->dict = d = PyDict_New();
+		if (d == NULL)
+			PyErr_Clear();
+	}
 	return _PyThreadState_Current->dict;
 }
 
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index fdbd19f..fbf4283 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -280,14 +280,6 @@
 	/* Clear interpreter state */
 	PyInterpreterState_Clear(interp);
 
-#ifdef Py_TRACE_REFS
-	/* Dump references -- this may implicitly need the thread state,
-	   so this is the last possible place where we can do this. */
-	if (Py_GETENV("PYTHONDUMPREFS")) {
-		_Py_PrintReferences(stderr);
-	}
-#endif /* Py_TRACE_REFS */
-
 	/* Delete current thread */
 	PyThreadState_Swap(NULL);
 	PyInterpreterState_Delete(interp);
@@ -314,6 +306,14 @@
 
 	PyGrammar_RemoveAccelerators(&_PyParser_Grammar);
 
+#ifdef Py_TRACE_REFS
+	/* Dump references -- this may implicitly need the thread state,
+	   so this is the last possible place where we can do this. */
+	if (Py_GETENV("PYTHONDUMPREFS")) {
+		_Py_PrintReferences(stderr);
+	}
+#endif /* Py_TRACE_REFS */
+
 #ifdef PYMALLOC_DEBUG
 	if (Py_GETENV("PYTHONMALLOCSTATS"))
 		_PyObject_DebugMallocStats();