Add rtree lookup path caching.

rtree-based extent lookups remain more expensive than chunk-based run
lookups, but with this optimization the fast path slowdown is ~3 CPU
cycles per metadata lookup (on Intel Core i7-4980HQ), versus ~11 cycles
prior.  The path caching speedup tends to degrade gracefully unless
allocated memory is spread far apart (as is the case when using a
mixture of sbrk() and mmap()).
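
The mechanism: each thread caches, in an rtree_ctx_t stored in TSD, the
key and per-level node pointers from its most recent rtree traversal.
An exact key repeat returns the memoized leaf element outright, and a
nearby key resumes traversal at the deepest level whose path prefix is
unchanged.  A minimal sketch of the cached state and the exact-match
fast path, using hypothetical names (path_cache_t, cached_lookup());
the patch itself threads rtree_ctx_t through the rtree API below:

    #include <stdbool.h>
    #include <stdint.h>

    #define HEIGHT 3                /* e.g. 48-bit keys, 16 bits/level. */

    typedef struct node_s node_t;   /* Stand-in for rtree_elm_t. */

    /* Per-thread cache of the last lookup's root-to-leaf path. */
    typedef struct {
        bool        valid;              /* False until first lookup. */
        uintptr_t   key;                /* Key whose path is cached. */
        node_t      *elms[HEIGHT + 1];  /* elms[HEIGHT] is the leaf. */
    } path_cache_t;

    /* Fast path: an exact repeat of the last key needs no traversal. */
    static node_t *
    cached_lookup(path_cache_t *cache, uintptr_t key)
    {
        if (cache->valid && key == cache->key)
            return (cache->elms[HEIGHT]);
        return (NULL);  /* Miss: fall back to a (partial) tree walk. */
    }
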
diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h
index a41a15f..8b8dbe8 100644
--- a/include/jemalloc/internal/extent.h
+++ b/include/jemalloc/internal/extent.h
@@ -181,8 +181,11 @@
 JEMALLOC_INLINE extent_t *
 extent_lookup(tsdn_t *tsdn, const void *ptr, bool dependent)
 {
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 
-	return (rtree_read(tsdn, &extents_rtree, (uintptr_t)ptr, dependent));
+	return (rtree_read(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr,
+	    dependent));
 }
 
 JEMALLOC_INLINE arena_t *
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index d4e5525..07e7f28 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -399,6 +399,7 @@
 rtree_child_read_hard
 rtree_child_tryread
 rtree_clear
+rtree_ctx_start_level
 rtree_delete
 rtree_elm_acquire
 rtree_elm_lookup
@@ -502,6 +503,9 @@
 tsd_prof_tdata_get
 tsd_prof_tdata_set
 tsd_prof_tdatap_get
+tsd_rtree_ctx_get
+tsd_rtree_ctx_set
+tsd_rtree_ctxp_get
 tsd_rtree_elm_witnesses_get
 tsd_rtree_elm_witnesses_set
 tsd_rtree_elm_witnessesp_get
@@ -529,6 +533,7 @@
 tsd_witnessesp_get
 tsdn_fetch
 tsdn_null
+tsdn_rtree_ctx
 tsdn_tsd
 witness_assert_lockless
 witness_assert_not_owner
diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h
index a47a79e..fc88dfe 100644
--- a/include/jemalloc/internal/rtree.h
+++ b/include/jemalloc/internal/rtree.h
@@ -10,6 +10,7 @@
 typedef struct rtree_elm_witness_s rtree_elm_witness_t;
 typedef struct rtree_elm_witness_tsd_s rtree_elm_witness_tsd_t;
 typedef struct rtree_level_s rtree_level_t;
+typedef struct rtree_ctx_s rtree_ctx_t;
 typedef struct rtree_s rtree_t;
 
 /*
@@ -25,6 +26,13 @@
 /* Used for two-stage lock-free node initialization. */
 #define	RTREE_NODE_INITIALIZING	((rtree_elm_t *)0x1)
 
+#define	RTREE_CTX_INITIALIZER	{					\
+	false,								\
+	0,								\
+	0,								\
+	{NULL /* C initializes all trailing elements to NULL. */}	\
+}
+
 /*
  * Maximum number of concurrently acquired elements per thread.  This controls
  * how many witness_t structures are embedded in tsd.  Ideally rtree_elm_t would
@@ -78,9 +86,9 @@
 	 *
 	 * Suppose keys comprise 48 bits, and LG_RTREE_BITS_PER_LEVEL is 4.
 	 * This results in a 3-level tree, and the leftmost leaf can be directly
-	 * accessed via subtrees[2], the subtree prefixed by 0x0000 (excluding
-	 * 0x00000000) can be accessed via subtrees[1], and the remainder of the
-	 * tree can be accessed via subtrees[0].
+	 * accessed via levels[2], the subtree prefixed by 0x0000 (excluding
+	 * 0x00000000) can be accessed via levels[1], and the remainder of the
+	 * tree can be accessed via levels[0].
 	 *
 	 *   levels[0] : [<unused> | 0x0001******** | 0x0002******** | ...]
 	 *
@@ -90,7 +98,7 @@
 	 *
 	 * This has practical implications on x64, which currently uses only the
 	 * lower 47 bits of virtual address space in userland, thus leaving
-	 * subtrees[0] unused and avoiding a level of tree traversal.
+	 * levels[0] unused and avoiding a level of tree traversal.
 	 */
 	union {
 		void		*subtree_pun;
@@ -105,13 +113,31 @@
 	unsigned		cumbits;
 };
 
+struct rtree_ctx_s {
+	/* If false, key/elms have not yet been initialized by a lookup. */
+	bool		valid;
+	/* Key that corresponds to the tree path recorded in elms. */
+	uintptr_t	key;
+	/* Memoized rtree_start_level(key). */
+	unsigned	start_level;
+	/*
+	 * A path through rtree, driven by key.  Only elements that could
+	 * actually be used for subsequent lookups are initialized, i.e. if
+	 * start_level = rtree_start_level(key) is non-zero, the first
+	 * start_level elements are uninitialized.  The last element contains a
+	 * pointer to the leaf node element that corresponds to key, so that
+	 * exact matches require no tree node offset computation.
+	 */
+	rtree_elm_t	*elms[RTREE_HEIGHT_MAX + 1];
+};
+
 struct rtree_s {
 	unsigned		height;
 	/*
 	 * Precomputed table used to convert from the number of leading 0 key
 	 * bits to which subtree level to start at.
 	 */
-	unsigned		start_level[RTREE_HEIGHT_MAX];
+	unsigned		start_level[RTREE_HEIGHT_MAX + 1];
 	rtree_level_t		levels[RTREE_HEIGHT_MAX];
 };
 
@@ -143,7 +169,9 @@
 #ifdef JEMALLOC_H_INLINES
 
 #ifndef JEMALLOC_ENABLE_INLINE
-unsigned	rtree_start_level(rtree_t *rtree, uintptr_t key);
+unsigned	rtree_start_level(const rtree_t *rtree, uintptr_t key);
+unsigned	rtree_ctx_start_level(const rtree_t *rtree,
+    const rtree_ctx_t *rtree_ctx, uintptr_t key);
 uintptr_t	rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level);
 
 bool	rtree_node_valid(rtree_elm_t *node);
@@ -156,33 +184,55 @@
     bool dependent);
 rtree_elm_t	*rtree_subtree_read(tsdn_t *tsdn, rtree_t *rtree,
     unsigned level, bool dependent);
-rtree_elm_t	*rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key,
-    bool dependent, bool init_missing);
+rtree_elm_t	*rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree,
+    rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, bool init_missing);
 
-bool	rtree_write(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key,
-    const extent_t *extent);
-extent_t	*rtree_read(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key,
-    bool dependent);
-rtree_elm_t	*rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key,
-    bool dependent, bool init_missing);
+bool	rtree_write(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+    uintptr_t key, const extent_t *extent);
+extent_t	*rtree_read(tsdn_t *tsdn, rtree_t *rtree,
+    rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent);
+rtree_elm_t	*rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree,
+    rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, bool init_missing);
 extent_t	*rtree_elm_read_acquired(tsdn_t *tsdn, const rtree_t *rtree,
     rtree_elm_t *elm);
 void	rtree_elm_write_acquired(tsdn_t *tsdn, const rtree_t *rtree,
     rtree_elm_t *elm, const extent_t *extent);
 void	rtree_elm_release(tsdn_t *tsdn, const rtree_t *rtree, rtree_elm_t *elm);
-void	rtree_clear(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key);
+void	rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+    uintptr_t key);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_))
 JEMALLOC_ALWAYS_INLINE unsigned
-rtree_start_level(rtree_t *rtree, uintptr_t key)
+rtree_start_level(const rtree_t *rtree, uintptr_t key)
 {
 	unsigned start_level;
 
 	if (unlikely(key == 0))
 		return (rtree->height - 1);
 
-	start_level = rtree->start_level[lg_floor(key) >>
+	start_level = rtree->start_level[(lg_floor(key) + 1) >>
+	    LG_RTREE_BITS_PER_LEVEL];
+	assert(start_level < rtree->height);
+	return (start_level);
+}
+
+JEMALLOC_ALWAYS_INLINE unsigned
+rtree_ctx_start_level(const rtree_t *rtree, const rtree_ctx_t *rtree_ctx,
+    uintptr_t key)
+{
+	unsigned start_level;
+	uintptr_t key_diff;
+
+	/* Compute the difference between old and new lookup keys. */
+	key_diff = key ^ rtree_ctx->key;
+	assert(key_diff != 0); /* Handled in rtree_elm_lookup(). */
+
+	/*
+	 * Compute the last traversal path element at which the keys' paths
+	 * are the same.
+	 */
+	start_level = rtree->start_level[(lg_floor(key_diff) + 1) >>
 	    LG_RTREE_BITS_PER_LEVEL];
 	assert(start_level < rtree->height);
 	return (start_level);
@@ -291,8 +341,8 @@
 }
 
 JEMALLOC_ALWAYS_INLINE rtree_elm_t *
-rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent,
-    bool init_missing)
+rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+    uintptr_t key, bool dependent, bool init_missing)
 {
 	uintptr_t subkey;
 	unsigned start_level;
@@ -300,35 +350,95 @@
 
 	assert(!dependent || !init_missing);
 
-	start_level = rtree_start_level(rtree, key);
+	if (dependent || init_missing) {
+		if (likely(rtree_ctx->valid)) {
+			if (key == rtree_ctx->key)
+				return (rtree_ctx->elms[rtree->height]);
+			else {
+				unsigned no_ctx_start_level =
+				    rtree_start_level(rtree, key);
+				unsigned ctx_start_level;
 
-	node = init_missing ? rtree_subtree_read(tsdn, rtree, start_level,
-	    dependent) : rtree_subtree_tryread(rtree, start_level, dependent);
+				if (likely(no_ctx_start_level <=
+				    rtree_ctx->start_level && (ctx_start_level =
+				    rtree_ctx_start_level(rtree, rtree_ctx,
+				    key)) >= rtree_ctx->start_level)) {
+					start_level = ctx_start_level;
+					node = rtree_ctx->elms[ctx_start_level];
+				} else {
+					start_level = no_ctx_start_level;
+					node = init_missing ?
+					    rtree_subtree_read(tsdn, rtree,
+					    no_ctx_start_level, dependent) :
+					    rtree_subtree_tryread(rtree,
+					    no_ctx_start_level, dependent);
+					rtree_ctx->start_level =
+					    no_ctx_start_level;
+					rtree_ctx->elms[no_ctx_start_level] =
+					    node;
+				}
+			}
+		} else {
+			unsigned no_ctx_start_level = rtree_start_level(rtree,
+			    key);
+
+			start_level = no_ctx_start_level;
+			node = init_missing ? rtree_subtree_read(tsdn, rtree,
+			    no_ctx_start_level, dependent) :
+			    rtree_subtree_tryread(rtree, no_ctx_start_level,
+			    dependent);
+			rtree_ctx->valid = true;
+			rtree_ctx->start_level = no_ctx_start_level;
+			rtree_ctx->elms[no_ctx_start_level] = node;
+		}
+		rtree_ctx->key = key;
+	} else {
+		start_level = rtree_start_level(rtree, key);
+		node = init_missing ? rtree_subtree_read(tsdn, rtree,
+		    start_level, dependent) : rtree_subtree_tryread(rtree,
+		    start_level, dependent);
+	}
+
 #define	RTREE_GET_BIAS	(RTREE_HEIGHT_MAX - rtree->height)
 	switch (start_level + RTREE_GET_BIAS) {
 #define	RTREE_GET_SUBTREE(level)					\
 	case level:							\
 		assert(level < (RTREE_HEIGHT_MAX-1));			\
-		if (!dependent && unlikely(!rtree_node_valid(node)))	\
+		if (!dependent && unlikely(!rtree_node_valid(node))) {	\
+			if (init_missing)				\
+				rtree_ctx->valid = false;		\
 			return (NULL);					\
+		}							\
 		subkey = rtree_subkey(rtree, key, level -		\
 		    RTREE_GET_BIAS);					\
 		node = init_missing ? rtree_child_read(tsdn, rtree,	\
 		    &node[subkey], level - RTREE_GET_BIAS, dependent) :	\
 		    rtree_child_tryread(&node[subkey], dependent);	\
+		if (dependent || init_missing) {			\
+			rtree_ctx->elms[level - RTREE_GET_BIAS + 1] =	\
+			    node;					\
+		}							\
 		/* Fall through. */
 #define	RTREE_GET_LEAF(level)						\
 	case level:							\
 		assert(level == (RTREE_HEIGHT_MAX-1));			\
-		if (!dependent && unlikely(!rtree_node_valid(node)))	\
+		if (!dependent && unlikely(!rtree_node_valid(node))) {	\
+			if (init_missing)				\
+				rtree_ctx->valid = false;		\
 			return (NULL);					\
+		}							\
 		subkey = rtree_subkey(rtree, key, level -		\
 		    RTREE_GET_BIAS);					\
 		/*							\
 		 * node is a leaf, so it contains values rather than	\
 		 * child pointers.					\
 		 */							\
-		return (&node[subkey]);
+		node = &node[subkey];					\
+		if (dependent || init_missing) {			\
+			rtree_ctx->elms[level - RTREE_GET_BIAS + 1] =	\
+			    node;					\
+		}							\
+		return (node);
 #if RTREE_HEIGHT_MAX > 1
 	RTREE_GET_SUBTREE(0)
 #endif
@@ -387,14 +497,15 @@
 }
 
 JEMALLOC_INLINE bool
-rtree_write(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, const extent_t *extent)
+rtree_write(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key,
+    const extent_t *extent)
 {
 	rtree_elm_t *elm;
 
 	assert(extent != NULL); /* Use rtree_clear() for this case. */
 	assert(((uintptr_t)extent & (uintptr_t)0x1) == (uintptr_t)0x0);
 
-	elm = rtree_elm_lookup(tsdn, rtree, key, false, true);
+	elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, false, true);
 	if (elm == NULL)
 		return (true);
 	assert(rtree_elm_read(elm, false) == NULL);
@@ -404,11 +515,12 @@
 }
 
 JEMALLOC_ALWAYS_INLINE extent_t *
-rtree_read(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent)
+rtree_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key,
+    bool dependent)
 {
 	rtree_elm_t *elm;
 
-	elm = rtree_elm_lookup(tsdn, rtree, key, dependent, false);
+	elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, dependent, false);
 	if (elm == NULL)
 		return (NULL);
 
@@ -416,12 +528,13 @@
 }
 
 JEMALLOC_INLINE rtree_elm_t *
-rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent,
-    bool init_missing)
+rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+    uintptr_t key, bool dependent, bool init_missing)
 {
 	rtree_elm_t *elm;
 
-	elm = rtree_elm_lookup(tsdn, rtree, key, dependent, init_missing);
+	elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, dependent,
+	    init_missing);
 	if (!dependent && elm == NULL)
 		return (NULL);
 	{
@@ -481,11 +594,11 @@
 }
 
 JEMALLOC_INLINE void
-rtree_clear(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key)
+rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key)
 {
 	rtree_elm_t *elm;
 
-	elm = rtree_elm_acquire(tsdn, rtree, key, true, false);
+	elm = rtree_elm_acquire(tsdn, rtree, rtree_ctx, key, true, false);
 	rtree_elm_write_acquired(tsdn, rtree, elm, NULL);
 	rtree_elm_release(tsdn, rtree, elm);
 }
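
Note on rtree_ctx_start_level(): XORing the old and new keys leaves the
highest set bit of the result at the first bit position where the two
root-to-leaf paths diverge; all key bits above it select the same nodes,
so that portion of the cached path can be reused.  The patch maps the
bit position to a level with lg_floor() and the enlarged start_level[]
table; the loop below is an equivalent (slower) sketch for the 3-level,
48-bit layout described in the rtree.h comment, assuming the keys are
unequal (the exact-match case is handled before this point, per the
assert in rtree_ctx_start_level()):

    #include <stdint.h>

    /* Number of leading 16-bit key groups in which two unequal keys
     * agree, i.e. how many cached path levels remain valid. */
    static unsigned
    common_prefix_levels(uintptr_t old_key, uintptr_t new_key)
    {
        uintptr_t diff = old_key ^ new_key;  /* != 0 by assumption. */
        unsigned levels = 0;

        while (levels < 3 && (diff >> (16 * (2 - levels))) == 0)
            levels++;
        return (levels);
    }

    /* common_prefix_levels(0x000012345678, 0x00001234abcd) == 2: the
     * paths share levels 0 and 1, so traversal resumes at the cached
     * level-2 node rather than at the root. */
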
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 988edf5..2355f9c 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -572,6 +572,7 @@
     O(narenas_tdata,		unsigned,		no)		\
     O(arenas_tdata_bypass,	bool,			no)		\
     O(tcache_enabled,		tcache_enabled_t,	no)		\
+    O(rtree_ctx,		rtree_ctx_t,		no)		\
     O(witnesses,		witness_list_t,		yes)		\
     O(rtree_elm_witnesses,	rtree_elm_witness_tsd_t,no)		\
     O(witness_fork,		bool,			no)		\
@@ -588,6 +589,7 @@
     0,									\
     false,								\
     tcache_enabled_default,						\
+    RTREE_CTX_INITIALIZER,						\
     ql_head_initializer(witnesses),					\
     RTREE_ELM_WITNESS_TSD_INITIALIZER,					\
     false								\
@@ -651,6 +653,7 @@
 tsdn_t	*tsdn_fetch(void);
 bool	tsdn_null(const tsdn_t *tsdn);
 tsd_t	*tsdn_tsd(tsdn_t *tsdn);
+rtree_ctx_t	*tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TSD_C_))
@@ -741,6 +744,22 @@
 
 	return (&tsdn->tsd);
 }
+
+JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
+tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback)
+{
+
+	/*
+	 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
+	 * return a pointer to it.
+	 */
+	if (unlikely(tsdn_null(tsdn))) {
+		static const rtree_ctx_t rtree_ctx = RTREE_CTX_INITIALIZER;
+		memcpy(fallback, &rtree_ctx, sizeof(rtree_ctx_t));
+		return (fallback);
+	}
+	return (tsd_rtree_ctxp_get(tsdn_tsd(tsdn)));
+}
 #endif
 
 #endif /* JEMALLOC_H_INLINES */
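
Every rtree consumer now materializes an rtree_ctx_t: normally the
persistent per-thread one in TSD, otherwise a throwaway stack fallback
when TSD is unavailable (e.g. during bootstrap).  The extent.c changes
below repeat this idiom; a sketch of the calling convention, where
demo_lookup() is a hypothetical caller rather than part of the patch:

    static extent_t *
    demo_lookup(tsdn_t *tsdn, uintptr_t key)
    {
        rtree_ctx_t rtree_ctx_fallback;
        rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);

        /* dependent=true: key is known to be mapped.  Only dependent
         * or init_missing lookups consult/update the path cache, and a
         * fallback ctx starts !valid, so only TSD-backed contexts
         * amortize work across calls. */
        return (rtree_read(tsdn, &extents_rtree, rtree_ctx, key, true));
    }
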
diff --git a/src/extent.c b/src/extent.c
index 9f3ddd9..0c41c06 100644
--- a/src/extent.c
+++ b/src/extent.c
@@ -259,18 +259,19 @@
 }
 
 static bool
-extent_rtree_acquire(tsdn_t *tsdn, const extent_t *extent, bool dependent,
-    bool init_missing, rtree_elm_t **r_elm_a, rtree_elm_t **r_elm_b)
+extent_rtree_acquire(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx,
+    const extent_t *extent, bool dependent, bool init_missing,
+    rtree_elm_t **r_elm_a, rtree_elm_t **r_elm_b)
 {
 
-	*r_elm_a = rtree_elm_acquire(tsdn, &extents_rtree,
+	*r_elm_a = rtree_elm_acquire(tsdn, &extents_rtree, rtree_ctx,
 	    (uintptr_t)extent_base_get(extent), dependent, init_missing);
 	if (!dependent && *r_elm_a == NULL)
 		return (true);
 	assert(*r_elm_a != NULL);
 
 	if (extent_size_get(extent) > PAGE) {
-		*r_elm_b = rtree_elm_acquire(tsdn, &extents_rtree,
+		*r_elm_b = rtree_elm_acquire(tsdn, &extents_rtree, rtree_ctx,
 		    (uintptr_t)extent_last_get(extent), dependent,
 		    init_missing);
 		if (!dependent && *r_elm_b == NULL)
@@ -302,14 +303,15 @@
 }
 
 static void
-extent_interior_register(tsdn_t *tsdn, const extent_t *extent)
+extent_interior_register(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx,
+    const extent_t *extent)
 {
 	size_t i;
 
 	assert(extent_slab_get(extent));
 
 	for (i = 1; i < (extent_size_get(extent) >> LG_PAGE) - 1; i++) {
-		rtree_write(tsdn, &extents_rtree,
+		rtree_write(tsdn, &extents_rtree, rtree_ctx,
 		    (uintptr_t)extent_base_get(extent) + (uintptr_t)(i <<
 		    LG_PAGE), extent);
 	}
@@ -318,13 +320,16 @@
 static bool
 extent_register(tsdn_t *tsdn, const extent_t *extent)
 {
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 	rtree_elm_t *elm_a, *elm_b;
 
-	if (extent_rtree_acquire(tsdn, extent, false, true, &elm_a, &elm_b))
+	if (extent_rtree_acquire(tsdn, rtree_ctx, extent, false, true, &elm_a,
+	    &elm_b))
 		return (true);
 	extent_rtree_write_acquired(tsdn, elm_a, elm_b, extent);
 	if (extent_slab_get(extent))
-		extent_interior_register(tsdn, extent);
+		extent_interior_register(tsdn, rtree_ctx, extent);
 	extent_rtree_release(tsdn, elm_a, elm_b);
 
 	if (config_prof && opt_prof && extent_active_get(extent)) {
@@ -347,14 +352,15 @@
 }
 
 static void
-extent_interior_deregister(tsdn_t *tsdn, const extent_t *extent)
+extent_interior_deregister(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx,
+    const extent_t *extent)
 {
 	size_t i;
 
 	assert(extent_slab_get(extent));
 
 	for (i = 1; i < (extent_size_get(extent) >> LG_PAGE) - 1; i++) {
-		rtree_clear(tsdn, &extents_rtree,
+		rtree_clear(tsdn, &extents_rtree, rtree_ctx,
 		    (uintptr_t)extent_base_get(extent) + (uintptr_t)(i <<
 		    LG_PAGE));
 	}
@@ -363,12 +369,15 @@
 static void
 extent_deregister(tsdn_t *tsdn, const extent_t *extent)
 {
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 	rtree_elm_t *elm_a, *elm_b;
 
-	extent_rtree_acquire(tsdn, extent, true, false, &elm_a, &elm_b);
+	extent_rtree_acquire(tsdn, rtree_ctx, extent, true, false, &elm_a,
+	    &elm_b);
 	extent_rtree_write_acquired(tsdn, elm_a, elm_b, NULL);
 	if (extent_slab_get(extent))
-		extent_interior_deregister(tsdn, extent);
+		extent_interior_deregister(tsdn, rtree_ctx, extent);
 	extent_rtree_release(tsdn, elm_a, elm_b);
 
 	if (config_prof && opt_prof && extent_active_get(extent)) {
@@ -422,6 +431,8 @@
     bool slab)
 {
 	extent_t *extent;
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 	size_t size, alloc_size, leadsize, trailsize;
 
 	assert(new_addr == NULL || !slab);
@@ -437,7 +448,7 @@
 	if (new_addr != NULL) {
 		rtree_elm_t *elm;
 
-		elm = rtree_elm_acquire(tsdn, &extents_rtree,
+		elm = rtree_elm_acquire(tsdn, &extents_rtree, rtree_ctx,
 		    (uintptr_t)new_addr, false, false);
 		if (elm != NULL) {
 			extent = rtree_elm_read_acquired(tsdn, &extents_rtree,
@@ -515,7 +526,7 @@
 	extent_active_set(extent, true);
 	if (slab) {
 		extent_slab_set(extent, slab);
-		extent_interior_register(tsdn, extent);
+		extent_interior_register(tsdn, rtree_ctx, extent);
 	}
 
 	malloc_mutex_unlock(tsdn, &arena->extents_mtx);
@@ -731,6 +742,8 @@
     extent_heap_t extent_heaps[NPSIZES], bool cache, extent_t *extent)
 {
 	extent_t *prev, *next;
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 
 	assert(!cache || !extent_zeroed_get(extent));
 
@@ -741,7 +754,7 @@
 	extent_active_set(extent, false);
 	extent_zeroed_set(extent, !cache && extent_zeroed_get(extent));
 	if (extent_slab_get(extent)) {
-		extent_interior_deregister(tsdn, extent);
+		extent_interior_deregister(tsdn, rtree_ctx, extent);
 		extent_slab_set(extent, false);
 	}
 
@@ -750,7 +763,7 @@
 	arena_extent_cache_maybe_insert(arena, extent, cache);
 
 	/* Try to coalesce forward. */
-	next = rtree_read(tsdn, &extents_rtree,
+	next = rtree_read(tsdn, &extents_rtree, rtree_ctx,
 	    (uintptr_t)extent_past_get(extent), false);
 	if (next != NULL) {
 		extent_try_coalesce(tsdn, arena, extent_hooks, extent, next,
@@ -758,7 +771,7 @@
 	}
 
 	/* Try to coalesce backward. */
-	prev = rtree_read(tsdn, &extents_rtree,
+	prev = rtree_read(tsdn, &extents_rtree, rtree_ctx,
 	    (uintptr_t)extent_before_get(extent), false);
 	if (prev != NULL) {
 		extent_try_coalesce(tsdn, arena, extent_hooks, prev, extent,
@@ -910,6 +923,8 @@
     size_t usize_b)
 {
 	extent_t *trail;
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 	rtree_elm_t *lead_elm_a, *lead_elm_b, *trail_elm_a, *trail_elm_b;
 
 	assert(extent_size_get(extent) == size_a + size_b);
@@ -928,8 +943,8 @@
 		    extent_zeroed_get(extent), extent_committed_get(extent),
 		    extent_slab_get(extent));
 
-		if (extent_rtree_acquire(tsdn, &lead, false, true, &lead_elm_a,
-		    &lead_elm_b))
+		if (extent_rtree_acquire(tsdn, rtree_ctx, &lead, false, true,
+		    &lead_elm_a, &lead_elm_b))
 			goto label_error_b;
 	}
 
@@ -937,8 +952,8 @@
 	    size_a), size_b, usize_b, extent_active_get(extent),
 	    extent_zeroed_get(extent), extent_committed_get(extent),
 	    extent_slab_get(extent));
-	if (extent_rtree_acquire(tsdn, trail, false, true, &trail_elm_a,
-	    &trail_elm_b))
+	if (extent_rtree_acquire(tsdn, rtree_ctx, trail, false, true,
+	    &trail_elm_a, &trail_elm_b))
 		goto label_error_c;
 
 	if (extent_hooks->split(extent_base_get(extent), size_a + size_b,
@@ -985,6 +1000,8 @@
 extent_merge_wrapper(tsdn_t *tsdn, arena_t *arena, extent_hooks_t *extent_hooks,
     extent_t *a, extent_t *b)
 {
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 	rtree_elm_t *a_elm_a, *a_elm_b, *b_elm_a, *b_elm_b;
 
 	extent_hooks_assure_initialized(tsdn, arena, extent_hooks);
@@ -998,8 +1015,10 @@
 	 * owned, so the following code uses decomposed helper functions rather
 	 * than extent_{,de}register() to do things in the right order.
 	 */
-	extent_rtree_acquire(tsdn, a, true, false, &a_elm_a, &a_elm_b);
-	extent_rtree_acquire(tsdn, b, true, false, &b_elm_a, &b_elm_b);
+	extent_rtree_acquire(tsdn, rtree_ctx, a, true, false, &a_elm_a,
+	    &a_elm_b);
+	extent_rtree_acquire(tsdn, rtree_ctx, b, true, false, &b_elm_a,
+	    &b_elm_b);
 
 	if (a_elm_b != NULL) {
 		rtree_elm_write_acquired(tsdn, &extents_rtree, a_elm_b, NULL);
diff --git a/src/rtree.c b/src/rtree.c
index b602730..421de3e 100644
--- a/src/rtree.c
+++ b/src/rtree.c
@@ -52,11 +52,12 @@
 		rtree->levels[height-1].cumbits = bits;
 	}
 
-	/* Compute lookup table to be used by rtree_start_level(). */
+	/* Compute lookup table to be used by rtree_[ctx_]start_level(). */
 	for (i = 0; i < RTREE_HEIGHT_MAX; i++) {
 		rtree->start_level[i] = hmin(RTREE_HEIGHT_MAX - 1 - i, height -
 		    1);
 	}
+	rtree->start_level[RTREE_HEIGHT_MAX] = 0;
 
 	return (false);
 }
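
The extra start_level[] slot is a sentinel for the new indexing: with
LG_RTREE_BITS_PER_LEVEL == 4 and 64-bit pointers, RTREE_HEIGHT_MAX is
4, and (lg_floor(x) + 1) >> LG_RTREE_BITS_PER_LEVEL reaches
RTREE_HEIGHT_MAX exactly when bit 63 of x is set; that case must
restart traversal at the root.  A hedged arithmetic check of that
boundary case:

    #include <assert.h>

    int
    main(void)
    {
        unsigned lg = 63;                /* lg_floor(x), bit 63 set. */
        unsigned index = (lg + 1) >> 4;  /* LG_RTREE_BITS_PER_LEVEL */

        assert(index == 4);  /* start_level[RTREE_HEIGHT_MAX]... */
        return (0);          /* ...which must map to level 0. */
    }
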
diff --git a/test/unit/rtree.c b/test/unit/rtree.c
index 786cc35..a05834f 100644
--- a/test/unit/rtree.c
+++ b/test/unit/rtree.c
@@ -40,10 +40,11 @@
 
 	for (i = 1; i <= (sizeof(uintptr_t) << 3); i++) {
 		rtree_t rtree;
+		rtree_ctx_t rtree_ctx = RTREE_CTX_INITIALIZER;
 		test_rtree = &rtree;
 		assert_false(rtree_new(&rtree, i),
 		    "Unexpected rtree_new() failure");
-		assert_ptr_null(rtree_read(tsdn, &rtree, 0, false),
+		assert_ptr_null(rtree_read(tsdn, &rtree, &rtree_ctx, 0, false),
 		    "rtree_read() should return NULL for empty tree");
 		rtree_delete(tsdn, &rtree);
 		test_rtree = NULL;
@@ -66,7 +67,8 @@
 thd_start(void *varg)
 {
 	thd_start_arg_t *arg = (thd_start_arg_t *)varg;
-	sfmt_t	*sfmt;
+	rtree_ctx_t rtree_ctx = RTREE_CTX_INITIALIZER;
+	sfmt_t *sfmt;
 	extent_t *extent;
 	tsdn_t *tsdn;
 	unsigned i;
@@ -81,21 +83,22 @@
 		if (i % 2 == 0) {
 			rtree_elm_t *elm;
 
-			elm = rtree_elm_acquire(tsdn, &arg->rtree, key, false,
-			    true);
+			elm = rtree_elm_acquire(tsdn, &arg->rtree, &rtree_ctx,
+			    key, false, true);
 			assert_ptr_not_null(elm,
 			    "Unexpected rtree_elm_acquire() failure");
-			rtree_elm_write_acquired(tsdn, &arg->rtree, elm, extent);
+			rtree_elm_write_acquired(tsdn, &arg->rtree, elm,
+			    extent);
 			rtree_elm_release(tsdn, &arg->rtree, elm);
 
-			elm = rtree_elm_acquire(tsdn, &arg->rtree, key, true,
-			    false);
+			elm = rtree_elm_acquire(tsdn, &arg->rtree, &rtree_ctx,
+			    key, true, false);
 			assert_ptr_not_null(elm,
 			    "Unexpected rtree_elm_acquire() failure");
 			rtree_elm_read_acquired(tsdn, &arg->rtree, elm);
 			rtree_elm_release(tsdn, &arg->rtree, elm);
 		} else
-			rtree_read(tsdn, &arg->rtree, key, false);
+			rtree_read(tsdn, &arg->rtree, &rtree_ctx, key, false);
 	}
 
 	free(extent);
@@ -145,19 +148,22 @@
 
 	for (i = 1; i <= (sizeof(uintptr_t) << 3); i++) {
 		rtree_t rtree;
+		rtree_ctx_t rtree_ctx = RTREE_CTX_INITIALIZER;
 		test_rtree = &rtree;
 		assert_false(rtree_new(&rtree, i),
 		    "Unexpected rtree_new() failure");
 
-		assert_false(rtree_write(tsdn, &rtree, 0, &extent_a),
-		    "Unexpected rtree_write() failure, i=%u", i);
-		assert_ptr_eq(rtree_read(tsdn, &rtree, 0, true), &extent_a,
+		assert_false(rtree_write(tsdn, &rtree, &rtree_ctx, 0,
+		    &extent_a), "Unexpected rtree_write() failure, i=%u", i);
+		assert_ptr_eq(rtree_read(tsdn, &rtree, &rtree_ctx, 0, true),
+		    &extent_a,
 		    "rtree_read() should return previously set value, i=%u", i);
 
-		assert_false(rtree_write(tsdn, &rtree, ~((uintptr_t)0),
-		    &extent_b), "Unexpected rtree_write() failure, i=%u", i);
-		assert_ptr_eq(rtree_read(tsdn, &rtree, ~((uintptr_t)0), true),
-		    &extent_b,
+		assert_false(rtree_write(tsdn, &rtree, &rtree_ctx,
+		    ~((uintptr_t)0), &extent_b),
+		    "Unexpected rtree_write() failure, i=%u", i);
+		assert_ptr_eq(rtree_read(tsdn, &rtree, &rtree_ctx,
+		    ~((uintptr_t)0), true), &extent_b,
 		    "rtree_read() should return previously set value, i=%u", i);
 
 		rtree_delete(tsdn, &rtree);
@@ -178,27 +184,30 @@
 		    (((uintptr_t)1) << (sizeof(uintptr_t)*8-i)) - 1};
 		extent_t extent;
 		rtree_t rtree;
+		rtree_ctx_t rtree_ctx = RTREE_CTX_INITIALIZER;
 
 		test_rtree = &rtree;
 		assert_false(rtree_new(&rtree, i),
 		    "Unexpected rtree_new() failure");
 
 		for (j = 0; j < sizeof(keys)/sizeof(uintptr_t); j++) {
-			assert_false(rtree_write(tsdn, &rtree, keys[j],
-			    &extent), "Unexpected rtree_write() failure");
+			assert_false(rtree_write(tsdn, &rtree, &rtree_ctx,
+			    keys[j], &extent),
+			    "Unexpected rtree_write() failure");
 			for (k = 0; k < sizeof(keys)/sizeof(uintptr_t); k++) {
-				assert_ptr_eq(rtree_read(tsdn, &rtree, keys[k],
-				    true), &extent, "rtree_read() should "
-				    "return previously set value and ignore "
-				    "insignificant key bits; i=%u, j=%u, k=%u, "
-				    "set key=%#"FMTxPTR", get key=%#"FMTxPTR, i,
-				    j, k, keys[j], keys[k]);
+				assert_ptr_eq(rtree_read(tsdn, &rtree,
+				    &rtree_ctx, keys[k], true), &extent,
+				    "rtree_read() should return previously set "
+				    "value and ignore insignificant key bits; "
+				    "i=%u, j=%u, k=%u, set key=%#"FMTxPTR", "
+				    "get key=%#"FMTxPTR, i, j, k, keys[j],
+				    keys[k]);
 			}
-			assert_ptr_null(rtree_read(tsdn, &rtree,
+			assert_ptr_null(rtree_read(tsdn, &rtree, &rtree_ctx,
 			    (((uintptr_t)1) << (sizeof(uintptr_t)*8-i)), false),
 			    "Only leftmost rtree leaf should be set; "
 			    "i=%u, j=%u", i, j);
-			rtree_clear(tsdn, &rtree, keys[j]);
+			rtree_clear(tsdn, &rtree, &rtree_ctx, keys[j]);
 		}
 
 		rtree_delete(tsdn, &rtree);
@@ -222,6 +231,7 @@
 		extent_t extent;
 		unsigned j;
 		rtree_t rtree;
+		rtree_ctx_t rtree_ctx = RTREE_CTX_INITIALIZER;
 		rtree_elm_t *elm;
 
 		test_rtree = &rtree;
@@ -230,29 +240,32 @@
 
 		for (j = 0; j < NSET; j++) {
 			keys[j] = (uintptr_t)gen_rand64(sfmt);
-			elm = rtree_elm_acquire(tsdn, &rtree, keys[j], false,
-			    true);
+			elm = rtree_elm_acquire(tsdn, &rtree, &rtree_ctx,
+			    keys[j], false, true);
 			assert_ptr_not_null(elm,
 			    "Unexpected rtree_elm_acquire() failure");
 			rtree_elm_write_acquired(tsdn, &rtree, elm, &extent);
 			rtree_elm_release(tsdn, &rtree, elm);
-			assert_ptr_eq(rtree_read(tsdn, &rtree, keys[j], true),
-			    &extent,
+			assert_ptr_eq(rtree_read(tsdn, &rtree, &rtree_ctx,
+			    keys[j], true), &extent,
 			    "rtree_read() should return previously set value");
 		}
 		for (j = 0; j < NSET; j++) {
-			assert_ptr_eq(rtree_read(tsdn, &rtree, keys[j], true),
-			    &extent, "rtree_read() should return previously "
-			    "set value, j=%u", j);
+			assert_ptr_eq(rtree_read(tsdn, &rtree, &rtree_ctx,
+			    keys[j], true), &extent,
+			    "rtree_read() should return previously set value, "
+			    "j=%u", j);
 		}
 
 		for (j = 0; j < NSET; j++) {
-			rtree_clear(tsdn, &rtree, keys[j]);
-			assert_ptr_null(rtree_read(tsdn, &rtree, keys[j], true),
+			rtree_clear(tsdn, &rtree, &rtree_ctx, keys[j]);
+			assert_ptr_null(rtree_read(tsdn, &rtree, &rtree_ctx,
+			    keys[j], true),
 			    "rtree_read() should return previously set value");
 		}
 		for (j = 0; j < NSET; j++) {
-			assert_ptr_null(rtree_read(tsdn, &rtree, keys[j], true),
+			assert_ptr_null(rtree_read(tsdn, &rtree, &rtree_ctx,
+			    keys[j], true),
 			    "rtree_read() should return previously set value");
 		}