- pythonrun.c, Py_Finalize(): move the call to _Py_PrintReferences()
  even farther down, to just before the call to
  _PyObject_DebugMallocStats().  This required the following changes:

- pystate.c, PyThreadState_GetDict(): changed so that, when no current
  thread state is available, it simply returns NULL; it no longer
  issues a fatal error in that case, and it never raises an
  exception.

- object.c, Py_ReprEnter(): when PyThreadState_GetDict() returns NULL,
  don't raise an exception but return 0.  This means that recursion
  is not detected in that situation, so printing a recursive
  container would go on forever.  But that shouldn't happen in the
  case we care about (see first bullet).

- Updated Misc/NEWS and Doc/api/init.tex to reflect changes to
  PyThreadState_GetDict() definition.
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index fdbd19f..fbf4283 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -280,14 +280,6 @@
 	/* Clear interpreter state */
 	PyInterpreterState_Clear(interp);
 
-#ifdef Py_TRACE_REFS
-	/* Dump references -- this may implicitly need the thread state,
-	   so this is the last possible place where we can do this. */
-	if (Py_GETENV("PYTHONDUMPREFS")) {
-		_Py_PrintReferences(stderr);
-	}
-#endif /* Py_TRACE_REFS */
-
 	/* Delete current thread */
 	PyThreadState_Swap(NULL);
 	PyInterpreterState_Delete(interp);
@@ -314,6 +306,14 @@
 
 	PyGrammar_RemoveAccelerators(&_PyParser_Grammar);
 
+#ifdef Py_TRACE_REFS
+	/* Dump references -- the thread state has already been deleted here,
+	   so this relies on PyThreadState_GetDict() returning NULL quietly. */
+	if (Py_GETENV("PYTHONDUMPREFS")) {
+		_Py_PrintReferences(stderr);
+	}
+#endif /* Py_TRACE_REFS */
+
 #ifdef PYMALLOC_DEBUG
 	if (Py_GETENV("PYTHONMALLOCSTATS"))
 		_PyObject_DebugMallocStats();