Fix long spinning in rtree_node_init

rtree_node_init spin-locks the node slot, allocates a node, and then
publishes it.  The slot is under heavy contention at the top of the tree
when many threads start allocating at the same time.

Instead, take a per-rtree sleeping mutex to reduce spinning.  Tested with
both pthreads mutexes and OS X OSSpinLock; both reduce spinning adequately.
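For illustration, the new path is the usual check-under-lock pattern
sketched below.  This is a minimal standalone sketch, not the patched
jemalloc code: it uses a plain pthread mutex and C11 atomics in place of
malloc_mutex_t and jemalloc's atomic wrappers, and the node_t type and
slot count are made up.

/* Sketch: take a sleeping mutex, re-check the slot, allocate and publish
 * only if it is still NULL.  Readers on the fast path still use a plain
 * atomic load and never touch the lock. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

typedef struct node_s node_t;
struct node_s {
	_Atomic(node_t *)	children[256];	/* hypothetical fanout */
};

static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;

static node_t *
node_init(_Atomic(node_t *) *slotp)
{
	node_t *node;

	pthread_mutex_lock(&init_lock);
	node = atomic_load(slotp);
	if (node == NULL) {
		/* Slot still empty; allocate and publish under the lock. */
		node = calloc(1, sizeof(node_t));
		if (node == NULL) {
			pthread_mutex_unlock(&init_lock);
			return NULL;
		}
		atomic_store(slotp, node);
	}
	pthread_mutex_unlock(&init_lock);

	return node;
}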

Previous benchmark time:
./ttest1 500 100
~15s

New benchmark time:
./ttest1 500 100
0.57s

diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h
index fc88dfe..9c6cc22 100644
--- a/include/jemalloc/internal/rtree.h
+++ b/include/jemalloc/internal/rtree.h
@@ -23,9 +23,6 @@
 #define	RTREE_HEIGHT_MAX						\
     ((1U << (LG_SIZEOF_PTR+3)) / RTREE_BITS_PER_LEVEL)
 
-/* Used for two-stage lock-free node initialization. */
-#define	RTREE_NODE_INITIALIZING	((rtree_elm_t *)0x1)
-
 #define	RTREE_CTX_INITIALIZER	{					\
 	false,								\
 	0,								\
@@ -139,6 +136,7 @@
 	 */
 	unsigned		start_level[RTREE_HEIGHT_MAX + 1];
 	rtree_level_t		levels[RTREE_HEIGHT_MAX];
+	malloc_mutex_t		init_lock;
 };
 
 #endif /* JEMALLOC_H_STRUCTS */
@@ -251,7 +249,7 @@
 rtree_node_valid(rtree_elm_t *node)
 {
 
-	return ((uintptr_t)node > (uintptr_t)RTREE_NODE_INITIALIZING);
+	return ((uintptr_t)node != (uintptr_t)0);
 }
 
 JEMALLOC_ALWAYS_INLINE rtree_elm_t *
diff --git a/include/jemalloc/internal/witness.h b/include/jemalloc/internal/witness.h
index 26024ac..86ddb64 100644
--- a/include/jemalloc/internal/witness.h
+++ b/include/jemalloc/internal/witness.h
@@ -28,7 +28,8 @@
 #define	WITNESS_RANK_ARENA_EXTENT_CACHE	10
 
 #define	WITNESS_RANK_RTREE_ELM		11U
-#define	WITNESS_RANK_BASE		12U
+#define	WITNESS_RANK_RTREE		12U
+#define	WITNESS_RANK_BASE		13U
 
 #define	WITNESS_RANK_LEAF		0xffffffffU
 #define	WITNESS_RANK_ARENA_BIN		WITNESS_RANK_LEAF
diff --git a/src/rtree.c b/src/rtree.c
index 0a42a98..b6b9ed7 100644
--- a/src/rtree.c
+++ b/src/rtree.c
@@ -59,6 +59,8 @@
 	}
 	rtree->start_level[RTREE_HEIGHT_MAX] = 0;
 
+	malloc_mutex_init(&rtree->init_lock, "rtree", WITNESS_RANK_RTREE);
+
 	return (false);
 }
 
@@ -135,25 +137,18 @@
 {
 	rtree_elm_t *node;
 
-	if (atomic_cas_p((void **)elmp, NULL, RTREE_NODE_INITIALIZING)) {
-		spin_t spinner;
-
-		/*
-		 * Another thread is already in the process of initializing.
-		 * Spin-wait until initialization is complete.
-		 */
-		spin_init(&spinner);
-		do {
-			spin_adaptive(&spinner);
-			node = atomic_read_p((void **)elmp);
-		} while (node == RTREE_NODE_INITIALIZING);
-	} else {
+	malloc_mutex_lock(tsdn, &rtree->init_lock);
+	node = atomic_read_p((void **)elmp);
+	if (node == NULL) {
 		node = rtree_node_alloc(tsdn, rtree, ZU(1) <<
 		    rtree->levels[level].bits);
-		if (node == NULL)
+		if (node == NULL) {
+			malloc_mutex_unlock(tsdn, &rtree->init_lock);
 			return (NULL);
+		}
 		atomic_write_p((void **)elmp, node);
 	}
+	malloc_mutex_unlock(tsdn, &rtree->init_lock);
 
 	return (node);
 }
diff --git a/test/unit/rtree.c b/test/unit/rtree.c
index a05834f..03f4e26 100644
--- a/test/unit/rtree.c
+++ b/test/unit/rtree.c
@@ -13,8 +13,10 @@
 	if (rtree != test_rtree)
 		return rtree_node_alloc_orig(tsdn, rtree, nelms);
 
+	malloc_mutex_unlock(tsdn, &rtree->init_lock);
 	node = (rtree_elm_t *)calloc(nelms, sizeof(rtree_elm_t));
 	assert_ptr_not_null(node, "Unexpected calloc() failure");
+	malloc_mutex_lock(tsdn, &rtree->init_lock);
 
 	return (node);
 }