Add rtree lookup path caching.

rtree-based extent lookups remain more expensive than chunk-based run
lookups, but with this optimization the fast-path slowdown is ~3 CPU
cycles per metadata lookup (on an Intel Core i7-4980HQ), versus ~11
cycles prior.  The speedup from path caching degrades gracefully unless
allocated memory is spread far apart, as is the case when a mixture of
sbrk() and mmap() is used.
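
The idea, in a minimal sketch (all names here, such as lookup_ctx_t,
node_t, and subkey(), are illustrative rather than jemalloc's API; the
real code lives in rtree.h/rtree.c and differs in detail): each thread
remembers the key of its previous lookup together with the node it
entered at every tree level.  The next lookup XORs the new key against
the cached one; the highest differing bit bounds how much of the old
path is still valid, so the walk can restart partway down instead of at
the root.  The sketch assumes 64-bit keys and the GCC/Clang
__builtin_clzll() bit scan.

  #include <stdint.h>

  #define LG_BITS_PER_LEVEL  2                       /* 4 key bits per level. */
  #define BITS_PER_LEVEL     (1U << LG_BITS_PER_LEVEL)
  #define HEIGHT             (64U / BITS_PER_LEVEL)  /* 16 levels. */

  typedef struct node_s node_t;
  struct node_s {
      /* At the deepest level the child slots hold the stored values. */
      node_t *child[1U << BITS_PER_LEVEL];
  };

  /* Per-thread cache of the path walked by the previous lookup. */
  typedef struct {
      uintptr_t key;           /* Key of the previous lookup. */
      node_t *path[HEIGHT];    /* Node entered at each level. */
      int valid;               /* Zero until a lookup completes. */
  } lookup_ctx_t;

  /* Subkey of "key" that indexes level "level" (level 0 is the root). */
  static unsigned
  subkey(uintptr_t key, unsigned level)
  {
      unsigned shift = 64U - (level + 1) * BITS_PER_LEVEL;

      return ((unsigned)((key >> shift) & ((1U << BITS_PER_LEVEL) - 1)));
  }

  static node_t *
  lookup(lookup_ctx_t *ctx, node_t *root, uintptr_t key)
  {
      unsigned level, start = 0;
      node_t *node = root;

      if (ctx->valid) {
          /*
           * The highest bit at which the keys differ identifies the
           * shallowest level whose subkey changed; every cached node
           * above it is still on the new key's path.  "| 1" keeps the
           * bit scan defined when the keys are identical.
           */
          uintptr_t diff = (key ^ ctx->key) | 1;
          unsigned hibit = 63U - (unsigned)__builtin_clzll(diff);

          start = (63U - hibit) >> LG_BITS_PER_LEVEL;
          node = ctx->path[start];
      }

      for (level = start; level < HEIGHT; level++) {
          if (node == NULL) {
              ctx->valid = 0;  /* Partial walk; cache nothing. */
              return (NULL);
          }
          ctx->path[level] = node;
          node = node->child[subkey(key, level)];
      }
      ctx->key = key;
      ctx->valid = 1;
      return (node);
  }

The hunk below adds the extra start_level[] slot that the cached-path
start-level computation indexes when no part of the previous path is
reusable.
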
diff --git a/src/rtree.c b/src/rtree.c
index b602730..421de3e 100644
--- a/src/rtree.c
+++ b/src/rtree.c
@@ -52,11 +52,12 @@
 		rtree->levels[height-1].cumbits = bits;
 	}
 
-	/* Compute lookup table to be used by rtree_start_level(). */
+	/* Compute lookup table to be used by rtree_[ctx_]start_level(). */
 	for (i = 0; i < RTREE_HEIGHT_MAX; i++) {
 		rtree->start_level[i] = hmin(RTREE_HEIGHT_MAX - 1 - i, height -
 		    1);
 	}
+	rtree->start_level[RTREE_HEIGHT_MAX] = 0;
 
 	return (false);
 }
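
The new table slot exists because the cached-path variant of the
start-level computation can index one entry past the old end of the
table.  A sketch of how such a table might be built and consumed,
continuing the illustrative definitions above (jemalloc's real
rtree_ctx_start_level() differs in detail):

  /* Continues the illustrative definitions from the sketch above. */
  static unsigned start_level_tab[HEIGHT + 1];

  static void
  start_level_init(void)
  {
      unsigned i;

      /*
       * Mirror of the loop in the hunk above; the real code also
       * clamps each entry (hmin()) to height - 1 because the tree may
       * be shallower than RTREE_HEIGHT_MAX.
       */
      for (i = 0; i < HEIGHT; i++)
          start_level_tab[i] = HEIGHT - 1 - i;
      /*
       * Counterpart of the new rtree->start_level[RTREE_HEIGHT_MAX] = 0
       * assignment: with the "+ 1" below, keys that differ in their
       * topmost bit group index one slot past the old end of the table,
       * and the only safe restart point there is the root (level 0).
       */
      start_level_tab[HEIGHT] = 0;
  }

  /* Restart level for a lookup that can reuse a cached path. */
  static unsigned
  ctx_start_level(uintptr_t cached_key, uintptr_t key)
  {
      uintptr_t diff = (key ^ cached_key) | 1;  /* Keep the scan defined. */
      unsigned hibit = 63U - (unsigned)__builtin_clzll(diff);

      return (start_level_tab[(hibit + 1) >> LG_BITS_PER_LEVEL]);
  }

Rounding toward the root at bit-group boundaries (the "+ 1") costs at
most one extra node dereference, whereas restarting too deep would
follow a pointer that is not on the new key's path, so shallower is the
safe direction; it also lets a single table serve trees whose levels
need not all cover the same number of key bits.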