Rehabilitated the fast-path richcmp code, and sped it up.  It wasn't
helping for types that defined tp_richcompare but not tp_compare,
although that's when it's most valuable, and strings moved into that
category since the fast path was first introduced.  Now it helps for
same-type non-Instance objects that define rich or 3-way compares.

Despite all the edits here, the rest just amounts to moving the fast path
from do_richcmp into PyObject_RichCompare, saving a layer of function call
(measurable on my box!).  The fast path is no longer taken when
NESTING_LIMIT is exceeded, but I don't care about that (fast-paths are for
normal cases, not pathologies).

Also added a tasteful <wink> label to get out of PyObject_RichCompare, as
the if/else nesting in this routine was getting incomprehensible.
diff --git a/Objects/object.c b/Objects/object.c
index 7c6819d..07ca47c 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -825,32 +825,6 @@
 do_richcmp(PyObject *v, PyObject *w, int op)
 {
 	PyObject *res;
-	cmpfunc f;
-
-	/* If the types are equal, don't bother with coercions etc. 
-	   Instances are special-cased in try_3way_compare, since
-	   a result of 2 does *not* mean one value being greater
-	   than the other. */
-	if (v->ob_type == w->ob_type
-	    && (f = v->ob_type->tp_compare) != NULL
-	    && !PyInstance_Check(v)) {
-		int c;
-		richcmpfunc f1;
-		if ((f1 = RICHCOMPARE(v->ob_type)) != NULL) {
-			/* If the type has richcmp, try it first.
-			   try_rich_compare would try it two-sided,
-			   which is not needed since we've a single
-			   type only. */
-			res = (*f1)(v, w, op);
-			if (res != Py_NotImplemented)
-				return res;
-			Py_DECREF(res);
-		}
-		c = (*f)(v, w);
-		if (c < 0 && PyErr_Occurred())
-			return NULL;
-		return convert_3way_to_object(op, c);
-	}
 
 	res = try_rich_compare(v, w, op);
 	if (res != Py_NotImplemented)
@@ -862,8 +836,6 @@
 
 /* Return:
    NULL for exception;
-   NotImplemented if this particular rich comparison is not implemented or
-     undefined;
    some object not equal to NotImplemented if it is implemented
      (this latter object may not be a Boolean).
 */
@@ -880,11 +852,12 @@
 		 || (v->ob_type->tp_as_sequence
 		     && !PyString_Check(v)
 		     && !PyTuple_Check(v)))) {
+
 		/* try to detect circular data structures */
 		PyObject *token = check_recursion(v, w, op);
-
 		if (token == NULL) {
 			res = NULL;
+			goto Done;
 		}
 		else if (token == Py_None) {
 			/* already comparing these objects with this operator.
@@ -904,10 +877,41 @@
 			res = do_richcmp(v, w, op);
 			delete_token(token);
 		}
+		goto Done;
 	}
-	else {
-		res = do_richcmp(v, w, op);
+
+	/* No nesting extremism.
+	   If the types are equal, and not old-style instances, try to
+	   get out cheap (don't bother with coercions etc.). */
+	if (v->ob_type == w->ob_type && !PyInstance_Check(v)) {
+		cmpfunc fcmp;
+		richcmpfunc frich = RICHCOMPARE(v->ob_type);
+		/* If the type has richcmp, try it first.  try_rich_compare
+		   tries it two-sided, which is not needed since we've a
+		   single type only. */
+		if (frich != NULL) {
+			res = (*frich)(v, w, op);
+			if (res != Py_NotImplemented)
+				goto Done;
+			Py_DECREF(res);
+		}
+		/* No richcmp, or this particular richcmp not implemented.
+		   Try 3-way cmp. */
+		fcmp = v->ob_type->tp_compare;
+		if (fcmp != NULL) {
+			int c = (*fcmp)(v, w);
+			if (c < 0 && PyErr_Occurred()) {
+				res = NULL;
+				goto Done;
+			}
+			res = convert_3way_to_object(op, c);
+			goto Done;
+		}
 	}
+
+	/* Fast path not taken, or couldn't deliver a useful result. */
+	res = do_richcmp(v, w, op);
+Done:
 	compare_nesting--;
 	return res;
 }