This is the fastest version I could get with GCC on Intel.  I kept the memset()
to clear newly created tuples, but tuples added to the free list are now
cleared in tupledealloc already (which is very cheap, because we are already
Py_XDECREF'ing all elements anyway).

Python should have a standard Py_ZAP macro like ZAP in pystate.c.
diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c
index 159dc44..9794bec 100644
--- a/Objects/tupleobject.c
+++ b/Objects/tupleobject.c
@@ -68,9 +68,8 @@
 		op = PyObject_GC_NewVar(PyTupleObject, &PyTuple_Type, size);
 		if (op == NULL)
 			return NULL;
+		memset(op->ob_item, 0, size*sizeof(PyObject*));
 	}
-	for (i=0; i < size; i++)
-		op->ob_item[i] = NULL;
 #if MAXSAVESIZE > 0
 	if (size == 0) {
 		free_tuples[0] = op;
@@ -165,19 +164,27 @@
 	Py_TRASHCAN_SAFE_BEGIN(op)
 	if (len > 0) {
 		i = len;
-		while (--i >= 0)
-			Py_XDECREF(op->ob_item[i]);
 #if MAXSAVESIZE > 0
 		if (len < MAXSAVESIZE &&
 		    num_free_tuples[len] < MAXSAVEDTUPLES &&
 		    op->ob_type == &PyTuple_Type)
 		{
+			while (--i >= 0) {
+				PyObject* o = op->ob_item[i];
+				if (o != NULL) {
+					op->ob_item[i] = NULL;
+					Py_DECREF(o);
+				}
+			}
 			op->ob_item[0] = (PyObject *) free_tuples[len];
 			num_free_tuples[len]++;
 			free_tuples[len] = op;
 			goto done; /* return */
 		}
+		else
 #endif
+			while (--i >= 0)
+				Py_XDECREF(op->ob_item[i]);
 	}
 	op->ob_type->tp_free((PyObject *)op);
 done: