long_format():  Easy speedup for output bases that aren't a power of 2 (in
particular, str(long) and repr(long) use base 10, and that gets a factor
of 4 speedup).  Another factor of 2 could be gained by refactoring divrem1 to
support in-place division, but that started getting messy, so I'm leaving
that out.
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 85b26a3..53ae5ed 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -811,28 +811,45 @@
 		}
 	}
 	else {
+		/* Not 0, and base not a power of 2.  Divide repeatedly by
+		   base, but for speed use the highest power of base that
+		   fits in a digit. */
+		digit powbase = base;  /* powbase == base ** power */
+		int power = 1;
+		for (;;) {
+			unsigned long newpow = powbase * (unsigned long)base;
+			if (newpow >> SHIFT)  /* doesn't fit in a digit */
+				break;
+			powbase = (digit)newpow;
+			++power;
+		}
+		
 		Py_INCREF(a);
 		do {
+			int ntostore = power;
 			digit rem;
-			PyLongObject *temp = divrem1(a, (digit)base, &rem);
+			PyLongObject *temp = divrem1(a, powbase, &rem);
+			Py_DECREF(a);
 			if (temp == NULL) {
-				Py_DECREF(a);
 				Py_DECREF(str);
 				return NULL;
 			}
-			if (rem < 10)
-				rem += '0';
-			else
-				rem += 'A'-10;
-			assert(p > PyString_AS_STRING(str));
-			*--p = (char) rem;
-			Py_DECREF(a);
 			a = temp;
 			SIGCHECK({
 				Py_DECREF(a);
 				Py_DECREF(str);
 				return NULL;
 			})
+			while (--ntostore >= 0) {
+				digit nextrem = (digit)(rem / base);
+				char c = (char)(rem - nextrem * base);
+				assert(p > PyString_AS_STRING(str));
+				c += (c < 10) ? '0' : 'A'-10;
+				*--p = c;
+				rem = nextrem;
+				if (a->ob_size == 0 && rem == 0)
+					break;  /* skip leading zeroes */
+			}
 		} while (ABS(a->ob_size) != 0);
 		Py_DECREF(a);
 	}