Add a new PYMALLOC_DEBUG function, void _PyMalloc_DebugDumpStats(void).
It prints statistics to stderr about the number of arenas, pools, blocks
and bytes, covering both memory in use and memory reserved but unused.

CAUTION:  Because PYMALLOC_DEBUG is on, the debug malloc routine adds
16 bytes to each request.  This makes each block appear two size classes
higher than it would be if PYMALLOC_DEBUG weren't on.

So far, playing with this confirms the obvious:  there's a lot of activity
in the "small dict" size class, but nothing in the core makes any use of
the 8-byte or 16-byte classes.
diff --git a/Include/pymem.h b/Include/pymem.h
index 5d9beed..18c49d7 100644
--- a/Include/pymem.h
+++ b/Include/pymem.h
@@ -102,6 +102,7 @@
 DL_IMPORT(void) _PyMalloc_DebugFree(void *p);
 DL_IMPORT(void) _PyMalloc_DebugDumpAddress(const void *p);
 DL_IMPORT(void) _PyMalloc_DebugCheckAddress(const void *p);
+DL_IMPORT(void) _PyMalloc_DebugDumpStats(void);
 #define _PyMalloc_MALLOC _PyMalloc_DebugMalloc
 #define _PyMalloc_REALLOC _PyMalloc_DebugRealloc
 #define _PyMalloc_FREE _PyMalloc_DebugFree
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index cf2b477..3030844 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -1026,7 +1026,11 @@
 	return fresh;
 }
 
-void
+/* Check the forbidden bytes on both ends of the memory allocated for p.
+ * If anything is wrong, print info to stderr via _PyMalloc_DebugDumpAddress,
+ * and call Py_FatalError to kill the program.
+ */
+void
 _PyMalloc_DebugCheckAddress(const void *p)
 {
 	const uchar *q = (const uchar *)p;
@@ -1063,6 +1067,7 @@
 	Py_FatalError(msg);
 }
 
+/* Display info to stderr about the memory block at p. */
 void
 _PyMalloc_DebugDumpAddress(const void *p)
 {
@@ -1149,4 +1154,95 @@
 	}
 }
 
+/* Write a summary of pymalloc's state to stderr:  arena totals, the number
+ * of unused pools, and per-size-class counts of pools and blocks. */
+void
+_PyMalloc_DebugDumpStats(void)
+{
+	const uint nclasses = SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT;
+	/* tallies indexed by size class:  pools, blocks in use, blocks avail */
+	ulong poolcount[SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT];
+	ulong usedcount[SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT];
+	ulong availcount[SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT];
+	ulong usedbytes = 0;	/* total # of bytes in allocated blocks */
+	ulong availbytes = 0;	/* total # of free bytes in used pools */
+	uint unusedpools = 0;
+	uint k;
+
+	fprintf(stderr, "%u arenas * %d bytes/arena = %lu total bytes.\n",
+		narenas, ARENA_SIZE, narenas * (ulong)ARENA_SIZE);
+	fprintf(stderr, "Small block threshold = %d, in %u size classes.\n",
+		SMALL_REQUEST_THRESHOLD, nclasses);
+	fprintf(stderr, "pymalloc malloc+realloc called %lu times.\n",
+		serialno);
+
+	for (k = 0; k < nclasses; ++k) {
+		poolcount[k] = 0;
+		usedcount[k] = 0;
+		availcount[k] = 0;
+	}
+
+	/* Empty pools aren't linked to from anything, so the simplest way
+	 * to account for them is to look at every pool in every arena.
+	 */
+	for (k = 0; k < narenas; ++k) {
+		uptr addr = arenas[k];
+		uint npools = ARENA_SIZE / POOL_SIZE;
+		uint visited;
+
+		/* a misaligned arena begins at the next pool boundary */
+		if (addr & (uptr)POOL_SIZE_MASK) {
+			addr = (addr & ~(uptr)POOL_SIZE_MASK) + POOL_SIZE;
+			--npools;
+		}
+
+		/* current arena may have raw memory at the end */
+		if (k == narenas - 1) {
+			unusedpools += nfreepools;
+			npools -= nfreepools;
+		}
+
+		for (visited = 0; visited < npools; ++visited) {
+			poolp pool = (poolp)addr;
+			addr += POOL_SIZE;
+			if (pool->ref.count != 0) {
+				++poolcount[pool->szidx];
+				usedcount[pool->szidx] += pool->ref.count;
+				availcount[pool->szidx] +=
+					pool->capacity - pool->ref.count;
+			}
+			else {
+				/* no blocks in use in this pool */
+				++unusedpools;
+			}
+		}
+	}
+
+	fputc('\n', stderr);
+	fprintf(stderr, "Number of unused pools: %u\n", unusedpools);
+	fputc('\n', stderr);
+	fputs("class   num bytes   num pools   blocks in use  avail blocks\n"
+	      "-----   ---------   ---------   -------------  ------------\n",
+		stderr);
+
+	for (k = 0; k < nclasses; ++k) {
+		const uint blocksize = (k+1) << ALIGNMENT_SHIFT;
+		const ulong pools = poolcount[k];
+		const ulong used = usedcount[k];
+		const ulong avail = availcount[k];
+		if (pools == 0) {
+			/* classes with no pools are left out of the table */
+			assert(used == 0 && avail == 0);
+			continue;
+		}
+		fprintf(stderr, "%5u %11u %11lu %15lu %13lu\n",
+			k, blocksize, pools, used, avail);
+		usedbytes += used * blocksize;
+		availbytes += avail * blocksize;
+	}
+	fputc('\n', stderr);
+	fprintf(stderr, "Total bytes in allocated blocks: %lu\n", usedbytes);
+	fprintf(stderr, "Total free bytes in used pools:  %lu\n", availbytes);
+}
+
 #endif	/* PYMALLOC_DEBUG */