Refactor/fix ph.

Refactor ph to support configurable comparison functions.  Use a cpp
macro code generation form equivalent to the rb macros so that pairing
heaps can be used for both run heaps and chunk heaps.

Remove per node parent pointers, and instead use leftmost siblings' prev
pointers to track parents.

Fix multi-pass sibling merging to iterate over intermediate results
using a FIFO, rather than a LIFO.  Use this fixed sibling merging
implementation for both merge phases of the auxiliary twopass algorithm
(first merging the aux list, then replacing the root with its merged
children).  This fixes both degenerate merge behavior and the potential
for deep recursion.

This regression was introduced by
6bafa6678fc36483e638f1c3a0a9bf79fb89bfc9 (Pairing heap).

This resolves #371.
diff --git a/Makefile.in b/Makefile.in
index 7f2d668..480ce1a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -95,7 +95,6 @@
 	$(srcroot)src/mutex.c \
 	$(srcroot)src/nstime.c \
 	$(srcroot)src/pages.c \
-	$(srcroot)src/ph.c \
 	$(srcroot)src/prng.c \
 	$(srcroot)src/prof.c \
 	$(srcroot)src/quarantine.c \
diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
index 09ae689..6f0fa76 100644
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@@ -160,7 +160,7 @@
 	 * 2) arena_run_t conceptually uses this linkage for in-use non-full
 	 *    runs, rather than directly embedding linkage.
 	 */
-	ph_node_t				ph_link;
+	phn(arena_chunk_map_misc_t)		ph_link;
 
 	union {
 		/* Linkage for list of dirty runs. */
@@ -176,6 +176,7 @@
 		arena_run_t			run;
 	};
 };
+typedef ph(arena_chunk_map_misc_t) arena_run_heap_t;
 #endif /* JEMALLOC_ARENA_STRUCTS_A */
 
 #ifdef JEMALLOC_ARENA_STRUCTS_B
@@ -278,7 +279,7 @@
 	 * objects packed well, and it can also help reduce the number of
 	 * almost-empty chunks.
 	 */
-	ph_heap_t		runs;
+	arena_run_heap_t	runs;
 
 	/* Bin statistics. */
 	malloc_bin_stats_t	stats;
@@ -460,7 +461,7 @@
 	 * Quantized address-ordered heaps of this arena's available runs.  The
 	 * heaps are used for first-best-fit run allocation.
 	 */
-	ph_heap_t		runs_avail[1]; /* Dynamically sized. */
+	arena_run_heap_t	runs_avail[1]; /* Dynamically sized. */
 };
 
 /* Used in conjunction with tsd for fast arena-related context lookup. */
@@ -604,7 +605,6 @@
 size_t	arena_miscelm_to_pageind(const arena_chunk_map_misc_t *miscelm);
 void	*arena_miscelm_to_rpages(const arena_chunk_map_misc_t *miscelm);
 arena_chunk_map_misc_t	*arena_rd_to_miscelm(arena_runs_dirty_link_t *rd);
-arena_chunk_map_misc_t	*arena_ph_to_miscelm(ph_node_t *ph);
 arena_chunk_map_misc_t	*arena_run_to_miscelm(arena_run_t *run);
 size_t	*arena_mapbitsp_get_mutable(arena_chunk_t *chunk, size_t pageind);
 const size_t	*arena_mapbitsp_get_const(const arena_chunk_t *chunk,
@@ -735,18 +735,6 @@
 }
 
 JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t *
-arena_ph_to_miscelm(ph_node_t *ph)
-{
-	arena_chunk_map_misc_t *miscelm = (arena_chunk_map_misc_t *)
-	    ((uintptr_t)ph - offsetof(arena_chunk_map_misc_t, ph_link));
-
-	assert(arena_miscelm_to_pageind(miscelm) >= map_bias);
-	assert(arena_miscelm_to_pageind(miscelm) < chunk_npages);
-
-	return (miscelm);
-}
-
-JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t *
 arena_run_to_miscelm(arena_run_t *run)
 {
 	arena_chunk_map_misc_t *miscelm = (arena_chunk_map_misc_t
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index c1cccd6..55ca714 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -161,6 +161,7 @@
 #include <malloc/malloc.h>
 #endif
 
+#include "jemalloc/internal/ph.h"
 #define	RB_COMPACT
 #include "jemalloc/internal/rb.h"
 #include "jemalloc/internal/qr.h"
@@ -371,7 +372,6 @@
 #include "jemalloc/internal/tsd.h"
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/extent.h"
-#include "jemalloc/internal/ph.h"
 #include "jemalloc/internal/arena.h"
 #include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/base.h"
@@ -402,7 +402,6 @@
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/bitmap.h"
-#include "jemalloc/internal/ph.h"
 #define	JEMALLOC_ARENA_STRUCTS_A
 #include "jemalloc/internal/arena.h"
 #undef JEMALLOC_ARENA_STRUCTS_A
@@ -495,7 +494,6 @@
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/extent.h"
-#include "jemalloc/internal/ph.h"
 #include "jemalloc/internal/arena.h"
 #include "jemalloc/internal/base.h"
 #include "jemalloc/internal/rtree.h"
@@ -527,7 +525,6 @@
 #include "jemalloc/internal/tsd.h"
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/extent.h"
-#include "jemalloc/internal/ph.h"
 #include "jemalloc/internal/base.h"
 #include "jemalloc/internal/rtree.h"
 #include "jemalloc/internal/pages.h"
diff --git a/include/jemalloc/internal/ph.h b/include/jemalloc/internal/ph.h
index 519f0dd..70b6e2c 100644
--- a/include/jemalloc/internal/ph.h
+++ b/include/jemalloc/internal/ph.h
@@ -4,257 +4,341 @@
  * "The Pairing Heap: A New Form of Self-Adjusting Heap"
  * https://www.cs.cmu.edu/~sleator/papers/pairing-heaps.pdf
  *
- * With auxiliary list, described in a follow on paper
+ * With auxiliary twopass list, described in a follow on paper.
  *
  * "Pairing Heaps: Experiments and Analysis"
  * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.106.2988&rep=rep1&type=pdf
  *
- * Where search/nsearch/last are not needed, ph.h outperforms rb.h by ~7x fewer
- * cpu cycles, and ~4x fewer memory references.
- *
- * Tagging parent/prev pointers on the next list was also described in the
- * original paper, such that only two pointers are needed.  This is not
- * implemented here, as it substantially increases the memory references
- * needed when ph_remove is called, almost overshadowing the other performance
- * gains.
- *
  *******************************************************************************
  */
-#ifdef JEMALLOC_H_TYPES
 
-typedef struct ph_node_s ph_node_t;
-typedef struct ph_heap_s ph_heap_t;
+#ifndef PH_H_
+#define	PH_H_
 
-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
-
-struct ph_node_s {
-	ph_node_t	*subheaps;
-	ph_node_t	*parent;
-	ph_node_t	*next;
-	ph_node_t	*prev;
-};
-
-struct ph_heap_s {
-	ph_node_t	*root;
-};
-
-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
-
-#ifndef JEMALLOC_ENABLE_INLINE
-ph_node_t	*ph_merge_ordered(ph_node_t *heap1, ph_node_t *heap2);
-ph_node_t	*ph_merge(ph_node_t *heap1, ph_node_t *heap2);
-ph_node_t	*ph_merge_pairs(ph_node_t *subheaps);
-void	ph_merge_aux_list(ph_heap_t *l);
-void	ph_new(ph_heap_t *n);
-ph_node_t	*ph_first(ph_heap_t *l);
-void	ph_insert(ph_heap_t *l, ph_node_t *n);
-ph_node_t	*ph_remove_first(ph_heap_t *l);
-void	ph_remove(ph_heap_t *l, ph_node_t *n);
-#endif
-
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PH_C_))
-
-/* Helper routines ************************************************************/
-
-JEMALLOC_INLINE ph_node_t *
-ph_merge_ordered(ph_node_t *heap1, ph_node_t *heap2)
-{
-
-	assert(heap1 != NULL);
-	assert(heap2 != NULL);
-	assert ((uintptr_t)heap1 <= (uintptr_t)heap2);
-
-	heap2->parent = heap1;
-	heap2->prev = NULL;
-	heap2->next = heap1->subheaps;
-	if (heap1->subheaps != NULL)
-		heap1->subheaps->prev = heap2;
-	heap1->subheaps = heap2;
-	return (heap1);
+/* Node structure. */
+#define	phn(a_type)							\
+struct {								\
+	a_type	*phn_prev;						\
+	a_type	*phn_next;						\
+	a_type	*phn_lchild;						\
 }
 
-JEMALLOC_INLINE ph_node_t *
-ph_merge(ph_node_t *heap1, ph_node_t *heap2)
-{
-
-	if (heap1 == NULL)
-		return (heap2);
-	if (heap2 == NULL)
-		return (heap1);
-	/* Optional: user-settable comparison function */
-	if ((uintptr_t)heap1 < (uintptr_t)heap2)
-		return (ph_merge_ordered(heap1, heap2));
-	else
-		return (ph_merge_ordered(heap2, heap1));
+/* Root structure. */
+#define	ph(a_type)							\
+struct {								\
+	a_type	*ph_root;						\
 }
 
-JEMALLOC_INLINE ph_node_t *
-ph_merge_pairs(ph_node_t *subheaps)
-{
+/* Internal utility macros. */
+#define	phn_lchild_get(a_type, a_field, a_phn)				\
+	(a_phn->a_field.phn_lchild)
+#define	phn_lchild_set(a_type, a_field, a_phn, a_lchild) do {		\
+	a_phn->a_field.phn_lchild = a_lchild;				\
+} while (0)
 
-	if (subheaps == NULL)
-		return (NULL);
-	if (subheaps->next == NULL)
-		return (subheaps);
-	{
-		ph_node_t *l0 = subheaps;
-		ph_node_t *l1 = l0->next;
-		ph_node_t *lrest = l1->next;
+#define	phn_next_get(a_type, a_field, a_phn)				\
+	(a_phn->a_field.phn_next)
+#define	phn_prev_set(a_type, a_field, a_phn, a_prev) do {		\
+	a_phn->a_field.phn_prev = a_prev;				\
+} while (0)
 
-		if (lrest != NULL)
-			lrest->prev = NULL;
-		l1->next = NULL;
-		l1->prev = NULL;
-		l0->next = NULL;
-		l0->prev = NULL;
-		return (ph_merge(ph_merge(l0, l1), ph_merge_pairs(lrest)));
-	}
-}
+#define	phn_prev_get(a_type, a_field, a_phn)				\
+	(a_phn->a_field.phn_prev)
+#define	phn_next_set(a_type, a_field, a_phn, a_next) do {		\
+	a_phn->a_field.phn_next = a_next;				\
+} while (0)
+
+#define	phn_merge_ordered(a_type, a_field, a_phn0, a_phn1, a_cmp) do {	\
+	a_type *phn0child;						\
+									\
+	assert(a_phn0 != NULL);						\
+	assert(a_phn1 != NULL);						\
+	assert(a_cmp(a_phn0, a_phn1) <= 0);				\
+									\
+	phn_prev_set(a_type, a_field, a_phn1, a_phn0);			\
+	phn0child = phn_lchild_get(a_type, a_field, a_phn0);		\
+	phn_next_set(a_type, a_field, a_phn1, phn0child);		\
+	if (phn0child != NULL)						\
+		phn_prev_set(a_type, a_field, phn0child, a_phn1);	\
+	phn_lchild_set(a_type, a_field, a_phn0, a_phn1);		\
+} while (0)
+
+#define	phn_merge(a_type, a_field, a_phn0, a_phn1, a_cmp, r_phn) do {	\
+	if (a_phn0 == NULL)						\
+		r_phn = a_phn1;						\
+	else if (a_phn1 == NULL)					\
+		r_phn = a_phn0;						\
+	else if (a_cmp(a_phn0, a_phn1) < 0) {				\
+		phn_merge_ordered(a_type, a_field, a_phn0, a_phn1,	\
+		    a_cmp);						\
+		r_phn = a_phn0;						\
+	} else {							\
+		phn_merge_ordered(a_type, a_field, a_phn1, a_phn0,	\
+		    a_cmp);						\
+		r_phn = a_phn1;						\
+	}								\
+} while (0)
+
+#define	ph_merge_siblings(a_type, a_field, a_phn, a_cmp, r_phn) do {	\
+	a_type *head = NULL;						\
+	a_type *tail = NULL;						\
+	a_type *phn0 = a_phn;						\
+	a_type *phn1 = phn_next_get(a_type, a_field, phn0);		\
+									\
+	/*								\
+	 * Multipass merge, wherein the first two elements of a FIFO	\
+	 * are repeatedly merged, and each result is appended to the	\
+	 * singly linked FIFO, until the FIFO contains only a single	\
+	 * element.  We start with a sibling list but no reference to	\
+	 * its tail, so we do a single pass over the sibling list to	\
+	 * populate the FIFO.						\
+	 */								\
+	if (phn1 != NULL) {						\
+		a_type *phnrest = phn_next_get(a_type, a_field, phn1);	\
+		if (phnrest != NULL)					\
+			phn_prev_set(a_type, a_field, phnrest, NULL);	\
+		phn_prev_set(a_type, a_field, phn0, NULL);		\
+		phn_next_set(a_type, a_field, phn0, NULL);		\
+		phn_prev_set(a_type, a_field, phn1, NULL);		\
+		phn_next_set(a_type, a_field, phn1, NULL);		\
+		phn_merge(a_type, a_field, phn0, phn1, a_cmp, phn0);	\
+		head = tail = phn0;					\
+		phn0 = phnrest;						\
+		while (phn0 != NULL) {					\
+			phn1 = phn_next_get(a_type, a_field, phn0);	\
+			if (phn1 != NULL) {				\
+				phnrest = phn_next_get(a_type, a_field,	\
+				    phn1);				\
+				if (phnrest != NULL) {			\
+					phn_prev_set(a_type, a_field,	\
+					    phnrest, NULL);		\
+				}					\
+				phn_prev_set(a_type, a_field, phn0,	\
+				    NULL);				\
+				phn_next_set(a_type, a_field, phn0,	\
+				    NULL);				\
+				phn_prev_set(a_type, a_field, phn1,	\
+				    NULL);				\
+				phn_next_set(a_type, a_field, phn1,	\
+				    NULL);				\
+				phn_merge(a_type, a_field, phn0, phn1,	\
+				    a_cmp, phn0);			\
+				phn_next_set(a_type, a_field, tail,	\
+				    phn0);				\
+				tail = phn0;				\
+				phn0 = phnrest;				\
+			} else {					\
+				phn_next_set(a_type, a_field, tail,	\
+				    phn0);				\
+				tail = phn0;				\
+				phn0 = NULL;				\
+			}						\
+		}							\
+		phn0 = head;						\
+		phn1 = phn_next_get(a_type, a_field, phn0);		\
+		if (phn1 != NULL) {					\
+			while (true) {					\
+				head = phn_next_get(a_type, a_field,	\
+				    phn1);				\
+				assert(phn_prev_get(a_type, a_field,	\
+				    phn0) == NULL);			\
+				phn_next_set(a_type, a_field, phn0,	\
+				    NULL);				\
+				assert(phn_prev_get(a_type, a_field,	\
+				    phn1) == NULL);			\
+				phn_next_set(a_type, a_field, phn1,	\
+				    NULL);				\
+				phn_merge(a_type, a_field, phn0, phn1,	\
+				    a_cmp, phn0);			\
+				if (head == NULL)			\
+					break;				\
+				phn_next_set(a_type, a_field, tail,	\
+				    phn0);				\
+				tail = phn0;				\
+				phn0 = head;				\
+				phn1 = phn_next_get(a_type, a_field,	\
+				    phn0);				\
+			}						\
+		}							\
+	}								\
+	r_phn = phn0;							\
+} while (0)
+
+#define	ph_merge_aux(a_type, a_field, a_ph, a_cmp) do {			\
+	a_type *phn = phn_next_get(a_type, a_field, a_ph->ph_root);	\
+	if (phn != NULL) {						\
+		phn_prev_set(a_type, a_field, a_ph->ph_root, NULL);	\
+		phn_next_set(a_type, a_field, a_ph->ph_root, NULL);	\
+		phn_prev_set(a_type, a_field, phn, NULL);		\
+		ph_merge_siblings(a_type, a_field, phn, a_cmp, phn);	\
+		assert(phn_next_get(a_type, a_field, phn) == NULL);	\
+		phn_merge(a_type, a_field, a_ph->ph_root, phn, a_cmp,	\
+		    a_ph->ph_root);					\
+	}								\
+} while (0)
+
+#define	ph_merge_children(a_type, a_field, a_phn, a_cmp, r_phn) do {	\
+	a_type *lchild = phn_lchild_get(a_type, a_field, a_phn);	\
+	if (lchild == NULL)						\
+		r_phn = NULL;						\
+	else {								\
+		ph_merge_siblings(a_type, a_field, lchild, a_cmp,	\
+		    r_phn);						\
+	}								\
+} while (0)
 
 /*
- * Merge the aux list into the root node.
+ * The ph_proto() macro generates function prototypes that correspond to the
+ * functions generated by an equivalently parameterized call to ph_gen().
  */
-JEMALLOC_INLINE void
-ph_merge_aux_list(ph_heap_t *l)
-{
+#define	ph_proto(a_attr, a_prefix, a_ph_type, a_type)			\
+a_attr void	a_prefix##new(a_ph_type *ph);				\
+a_attr bool	a_prefix##empty(a_ph_type *ph);				\
+a_attr a_type	*a_prefix##first(a_ph_type *ph);			\
+a_attr void	a_prefix##insert(a_ph_type *ph, a_type *phn);		\
+a_attr a_type	*a_prefix##remove_first(a_ph_type *ph);			\
+a_attr void	a_prefix##remove(a_ph_type *ph, a_type *phn);
 
-	if (l->root == NULL)
-		return;
-	if (l->root->next != NULL) {
-		ph_node_t *l0 = l->root->next;
-		ph_node_t *l1 = l0->next;
-		ph_node_t *lrest = NULL;
-
-		/* Multipass merge. */
-		while (l1 != NULL) {
-			lrest = l1->next;
-			if (lrest != NULL)
-				lrest->prev = NULL;
-			l1->next = NULL;
-			l1->prev = NULL;
-			l0->next = NULL;
-			l0->prev = NULL;
-			l0 = ph_merge(l0, l1);
-			l1 = lrest;
-		}
-		l->root->next = NULL;
-		l->root = ph_merge(l->root, l0);
-	}
+/*
+ * The ph_gen() macro generates a type-specific pairing heap implementation,
+ * based on the above cpp macros.
+ */
+#define	ph_gen(a_attr, a_prefix, a_ph_type, a_type, a_field, a_cmp)	\
+a_attr void								\
+a_prefix##new(a_ph_type *ph)						\
+{									\
+									\
+	memset(ph, 0, sizeof(ph(a_type)));				\
+}									\
+a_attr bool								\
+a_prefix##empty(a_ph_type *ph) {					\
+									\
+	return (ph->ph_root == NULL);					\
+}									\
+a_attr a_type *								\
+a_prefix##first(a_ph_type *ph)						\
+{									\
+									\
+	if (ph->ph_root == NULL)					\
+		return (NULL);						\
+	ph_merge_aux(a_type, a_field, ph, a_cmp);			\
+	return (ph->ph_root);						\
+}									\
+a_attr void								\
+a_prefix##insert(a_ph_type *ph, a_type *phn)				\
+{									\
+									\
+	memset(&phn->a_field, 0, sizeof(phn(a_type)));			\
+									\
+	/*								\
+	 * Treat the root as an aux list during insertion, and lazily	\
+	 * merge during a_prefix##remove_first().  For elements that	\
+	 * are inserted, then removed via a_prefix##remove() before the	\
+	 * aux list is ever processed, this makes insert/remove		\
+	 * constant-time, whereas eager merging would make insert	\
+	 * O(log n).							\
+	 */								\
+	if (ph->ph_root == NULL)					\
+		ph->ph_root = phn;					\
+	else {								\
+		phn_next_set(a_type, a_field, phn, phn_next_get(a_type,	\
+		    a_field, ph->ph_root));				\
+		if (phn_next_get(a_type, a_field, ph->ph_root) !=	\
+		    NULL) {						\
+			phn_prev_set(a_type, a_field,			\
+			    phn_next_get(a_type, a_field, ph->ph_root),	\
+			    phn);					\
+		}							\
+		phn_prev_set(a_type, a_field, phn, ph->ph_root);	\
+		phn_next_set(a_type, a_field, ph->ph_root, phn);	\
+	}								\
+}									\
+a_attr a_type *								\
+a_prefix##remove_first(a_ph_type *ph)					\
+{									\
+	a_type *ret;							\
+									\
+	if (ph->ph_root == NULL)					\
+		return (NULL);						\
+	ph_merge_aux(a_type, a_field, ph, a_cmp);			\
+									\
+	ret = ph->ph_root;						\
+									\
+	ph_merge_children(a_type, a_field, ph->ph_root, a_cmp,		\
+	    ph->ph_root);						\
+									\
+	return (ret);							\
+}									\
+a_attr void								\
+a_prefix##remove(a_ph_type *ph, a_type *phn)				\
+{									\
+	a_type *replace, *parent;					\
+									\
+	/*								\
+	 * We can delete from aux list without merging it, but we need	\
+	 * to merge if we are dealing with the root node.		\
+	 */								\
+	if (ph->ph_root == phn) {					\
+		ph_merge_aux(a_type, a_field, ph, a_cmp);		\
+		if (ph->ph_root == phn) {				\
+			ph_merge_children(a_type, a_field, ph->ph_root,	\
+			    a_cmp, ph->ph_root);			\
+			return;						\
+		}							\
+	}								\
+									\
+	/* Get parent (if phn is leftmost child) before mutating. */	\
+	if ((parent = phn_prev_get(a_type, a_field, phn)) != NULL) {	\
+		if (phn_lchild_get(a_type, a_field, parent) != phn)	\
+			parent = NULL;					\
+	}								\
+	/* Find a possible replacement node, and link to parent. */	\
+	ph_merge_children(a_type, a_field, phn, a_cmp, replace);	\
+	/* Set next/prev for sibling linked list. */			\
+	if (replace != NULL) {						\
+		if (parent != NULL) {					\
+			phn_prev_set(a_type, a_field, replace, parent);	\
+			phn_lchild_set(a_type, a_field, parent,		\
+			    replace);					\
+		} else {						\
+			phn_prev_set(a_type, a_field, replace,		\
+			    phn_prev_get(a_type, a_field, phn));	\
+			if (phn_prev_get(a_type, a_field, phn) !=	\
+			    NULL) {					\
+				phn_next_set(a_type, a_field,		\
+				    phn_prev_get(a_type, a_field, phn),	\
+				    replace);				\
+			}						\
+		}							\
+		phn_next_set(a_type, a_field, replace,			\
+		    phn_next_get(a_type, a_field, phn));		\
+		if (phn_next_get(a_type, a_field, phn) != NULL) {	\
+			phn_prev_set(a_type, a_field,			\
+			    phn_next_get(a_type, a_field, phn),		\
+			    replace);					\
+		}							\
+	} else {							\
+		if (parent != NULL) {					\
+			a_type *next = phn_next_get(a_type, a_field,	\
+			    phn);					\
+			phn_lchild_set(a_type, a_field, parent, next);	\
+			if (next != NULL) {				\
+				phn_prev_set(a_type, a_field, next,	\
+				    parent);				\
+			}						\
+		} else {						\
+			assert(phn_prev_get(a_type, a_field, phn) !=	\
+			    NULL);					\
+			phn_next_set(a_type, a_field,			\
+			    phn_prev_get(a_type, a_field, phn),		\
+			    phn_next_get(a_type, a_field, phn));	\
+		}							\
+		if (phn_next_get(a_type, a_field, phn) != NULL) {	\
+			phn_prev_set(a_type, a_field,			\
+			    phn_next_get(a_type, a_field, phn),		\
+			    phn_prev_get(a_type, a_field, phn));	\
+		}							\
+	}								\
 }
 
-/* User API *******************************************************************/
-
-JEMALLOC_INLINE void
-ph_new(ph_heap_t *n)
-{
-
-	memset(n, 0, sizeof(ph_heap_t));
-}
-
-JEMALLOC_INLINE ph_node_t *
-ph_first(ph_heap_t *l)
-{
-
-	/*
-	 * For the cost of an extra pointer, a l->min could be stored instead of
-	 * merging the aux list here.  Current users always call ph_remove(l,
-	 * ph_first(l)) though, and the aux list must always be merged for
-	 * delete of the min node anyway.
-	 */
-	ph_merge_aux_list(l);
-	return (l->root);
-}
-
-JEMALLOC_INLINE void
-ph_insert(ph_heap_t *l, ph_node_t *n)
-{
-
-	memset(n, 0, sizeof(ph_node_t));
-
-	/*
-	 * Non-aux list insert:
-	 *
-	 * l->root = ph_merge(l->root, n);
-	 *
-	 * Aux list insert:
-	 */
-	if (l->root == NULL)
-		l->root = n;
-	else {
-		n->next = l->root->next;
-		if (l->root->next != NULL)
-			l->root->next->prev = n;
-		n->prev = l->root;
-		l->root->next = n;
-	}
-}
-
-JEMALLOC_INLINE ph_node_t *
-ph_remove_first(ph_heap_t *l)
-{
-	ph_node_t *ret;
-
-	ph_merge_aux_list(l);
-	if (l->root == NULL)
-		return (NULL);
-
-	ret = l->root;
-
-	l->root = ph_merge_pairs(l->root->subheaps);
-
-	return (ret);
-}
-
-JEMALLOC_INLINE void
-ph_remove(ph_heap_t *l, ph_node_t *n)
-{
-	ph_node_t *replace;
-
-	/*
-	 * We can delete from aux list without merging it, but we need to merge
-	 * if we are dealing with the root node.
-	 */
-	if (l->root == n) {
-		ph_merge_aux_list(l);
-		if (l->root == n) {
-			ph_remove_first(l);
-			return;
-		}
-	}
-
-	/* Find a possible replacement node, and link to parent. */
-	replace = ph_merge_pairs(n->subheaps);
-	if (n->parent != NULL && n->parent->subheaps == n) {
-		if (replace != NULL)
-			n->parent->subheaps = replace;
-		else
-			n->parent->subheaps = n->next;
-	}
-	/* Set next/prev for sibling linked list. */
-	if (replace != NULL) {
-		replace->parent = n->parent;
-		replace->prev = n->prev;
-		if (n->prev != NULL)
-			n->prev->next = replace;
-		replace->next = n->next;
-		if (n->next != NULL)
-			n->next->prev = replace;
-	} else {
-		if (n->prev != NULL)
-			n->prev->next = n->next;
-		if (n->next != NULL)
-			n->next->prev = n->prev;
-	}
-}
-#endif
-
-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
+#endif /* PH_H_ */
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 969c73d..551cb93 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -82,7 +82,6 @@
 arena_nthreads_get
 arena_nthreads_inc
 arena_palloc
-arena_ph_to_miscelm
 arena_postfork_child
 arena_postfork_parent
 arena_prefork
@@ -101,6 +100,12 @@
 arena_ralloc_no_move
 arena_rd_to_miscelm
 arena_redzone_corruption
+arena_run_heap_empty
+arena_run_heap_first
+arena_run_heap_insert
+arena_run_heap_new
+arena_run_heap_remove_first
+arena_run_heap_remove
 arena_run_regind
 arena_run_to_miscelm
 arena_salloc
@@ -381,15 +386,6 @@
 pages_purge
 pages_trim
 pages_unmap
-ph_first
-ph_insert
-ph_merge
-ph_merge_aux_list
-ph_merge_ordered
-ph_merge_pairs
-ph_new
-ph_remove_first
-ph_remove
 pow2_ceil_u32
 pow2_ceil_u64
 pow2_ceil_zu
diff --git a/src/arena.c b/src/arena.c
index 1d30de5..d884dc4 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -59,6 +59,23 @@
 	return (arena_mapbits_size_decode(mapbits));
 }
 
+JEMALLOC_INLINE_C int
+arena_run_addr_comp(const arena_chunk_map_misc_t *a,
+    const arena_chunk_map_misc_t *b)
+{
+	uintptr_t a_miscelm = (uintptr_t)a;
+	uintptr_t b_miscelm = (uintptr_t)b;
+
+	assert(a != NULL);
+	assert(b != NULL);
+
+	return ((a_miscelm > b_miscelm) - (a_miscelm < b_miscelm));
+}
+
+/* Generate pairing heap functions. */
+ph_gen(static UNUSED, arena_run_heap_, arena_run_heap_t, arena_chunk_map_misc_t,
+    ph_link, arena_run_addr_comp)
+
 static size_t
 run_quantize_floor_compute(size_t size)
 {
@@ -182,7 +199,7 @@
 run_quantize_t *run_quantize_ceil = JEMALLOC_N(run_quantize_ceil_impl);
 #endif
 
-static ph_heap_t *
+static arena_run_heap_t *
 arena_runs_avail_get(arena_t *arena, szind_t ind)
 {
 
@@ -200,8 +217,8 @@
 	    arena_miscelm_get_const(chunk, pageind))));
 	assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >>
 	    LG_PAGE));
-	ph_insert(arena_runs_avail_get(arena, ind),
-	    &arena_miscelm_get_mutable(chunk, pageind)->ph_link);
+	arena_run_heap_insert(arena_runs_avail_get(arena, ind),
+	    arena_miscelm_get_mutable(chunk, pageind));
 }
 
 static void
@@ -212,8 +229,8 @@
 	    arena_miscelm_get_const(chunk, pageind))));
 	assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >>
 	    LG_PAGE));
-	ph_remove(arena_runs_avail_get(arena, ind),
-	    &arena_miscelm_get_mutable(chunk, pageind)->ph_link);
+	arena_run_heap_remove(arena_runs_avail_get(arena, ind),
+	    arena_miscelm_get_mutable(chunk, pageind));
 }
 
 static void
@@ -1065,12 +1082,10 @@
 
 	ind = size2index(run_quantize_ceil(size));
 	for (i = ind; i < runs_avail_nclasses + runs_avail_bias; i++) {
-		ph_node_t *node = ph_first(arena_runs_avail_get(arena, i));
-		if (node != NULL) {
-			arena_chunk_map_misc_t *miscelm =
-			    arena_ph_to_miscelm(node);
+		arena_chunk_map_misc_t *miscelm = arena_run_heap_first(
+		    arena_runs_avail_get(arena, i));
+		if (miscelm != NULL)
 			return (&miscelm->run);
-		}
 	}
 
 	return (NULL);
@@ -2052,45 +2067,26 @@
 	    0));
 }
 
-static arena_run_t *
-arena_bin_runs_first(arena_bin_t *bin)
-{
-	ph_node_t *node;
-	arena_chunk_map_misc_t *miscelm;
-
-	node = ph_first(&bin->runs);
-	if (node == NULL)
-		return (NULL);
-	miscelm = arena_ph_to_miscelm(node);
-	return (&miscelm->run);
-}
-
 static void
 arena_bin_runs_insert(arena_bin_t *bin, arena_run_t *run)
 {
 	arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run);
 
-	ph_insert(&bin->runs, &miscelm->ph_link);
-}
-
-static void
-arena_bin_runs_remove(arena_bin_t *bin, arena_run_t *run)
-{
-	arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run);
-
-	ph_remove(&bin->runs, &miscelm->ph_link);
+	arena_run_heap_insert(&bin->runs, miscelm);
 }
 
 static arena_run_t *
 arena_bin_nonfull_run_tryget(arena_bin_t *bin)
 {
-	arena_run_t *run = arena_bin_runs_first(bin);
-	if (run != NULL) {
-		arena_bin_runs_remove(bin, run);
-		if (config_stats)
-			bin->stats.reruns++;
-	}
-	return (run);
+	arena_chunk_map_misc_t *miscelm;
+
+	miscelm = arena_run_heap_remove_first(&bin->runs);
+	if (miscelm == NULL)
+		return (NULL);
+	if (config_stats)
+		bin->stats.reruns++;
+
+	return (&miscelm->run);
 }
 
 static arena_run_t *
@@ -2645,13 +2641,16 @@
 		    &chunk->node), bin);
 		arena_bin_info_t *bin_info = &arena_bin_info[binind];
 
+		/*
+		 * The following block's conditional is necessary because if the
+		 * run only contains one region, then it never gets inserted
+		 * into the non-full runs tree.
+		 */
 		if (bin_info->nregs != 1) {
-			/*
-			 * This block's conditional is necessary because if the
-			 * run only contains one region, then it never gets
-			 * inserted into the non-full runs tree.
-			 */
-			arena_bin_runs_remove(bin, run);
+			arena_chunk_map_misc_t *miscelm =
+			    arena_run_to_miscelm(run);
+
+			arena_run_heap_remove(&bin->runs, miscelm);
 		}
 	}
 }
@@ -3312,7 +3311,7 @@
 	arena_bin_t *bin;
 
 	/* Compute arena size to incorporate sufficient runs_avail elements. */
-	arena_size = offsetof(arena_t, runs_avail) + (sizeof(ph_heap_t) *
+	arena_size = offsetof(arena_t, runs_avail) + (sizeof(arena_run_heap_t) *
 	    runs_avail_nclasses);
 	/*
 	 * Allocate arena, arena->lstats, and arena->hstats contiguously, mainly
@@ -3372,7 +3371,7 @@
 	arena->ndirty = 0;
 
 	for(i = 0; i < runs_avail_nclasses; i++)
-		ph_new(&arena->runs_avail[i]);
+		arena_run_heap_new(&arena->runs_avail[i]);
 	qr_new(&arena->runs_dirty, rd_link);
 	qr_new(&arena->chunks_cache, cc_link);
 
@@ -3401,7 +3400,7 @@
 		if (malloc_mutex_init(&bin->lock))
 			return (NULL);
 		bin->runcur = NULL;
-		ph_new(&bin->runs);
+		arena_run_heap_new(&bin->runs);
 		if (config_stats)
 			memset(&bin->stats, 0, sizeof(malloc_bin_stats_t));
 	}
diff --git a/src/ph.c b/src/ph.c
deleted file mode 100644
index 051a20d..0000000
--- a/src/ph.c
+++ /dev/null
@@ -1,2 +0,0 @@
-#define	JEMALLOC_PH_C_
-#include "jemalloc/internal/jemalloc_internal.h"
diff --git a/test/unit/ph.c b/test/unit/ph.c
index 103475b..da442f0 100644
--- a/test/unit/ph.c
+++ b/test/unit/ph.c
@@ -3,58 +3,275 @@
 typedef struct node_s node_t;
 
 struct node_s {
-	ph_node_t link;
+#define	NODE_MAGIC 0x9823af7e
+	uint32_t magic;
+	phn(node_t) link;
+	uint64_t key;
 };
 
+static int
+node_cmp(const node_t *a, const node_t *b)
+{
+	int ret;
+
+	ret = (a->key > b->key) - (a->key < b->key);
+	if (ret == 0) {
+		/*
+		 * Duplicates are not allowed in the heap, so force an
+		 * arbitrary ordering for non-identical items with equal keys.
+		 */
+		ret = (((uintptr_t)a) > ((uintptr_t)b))
+		    - (((uintptr_t)a) < ((uintptr_t)b));
+	}
+	return (ret);
+}
+
+static int
+node_cmp_magic(const node_t *a, const node_t *b) {
+
+	assert_u32_eq(a->magic, NODE_MAGIC, "Bad magic");
+	assert_u32_eq(b->magic, NODE_MAGIC, "Bad magic");
+
+	return (node_cmp(a, b));
+}
+
+typedef ph(node_t) heap_t;
+ph_gen(static, heap_, heap_t, node_t, link, node_cmp_magic);
+
+static void
+node_print(const node_t *node, unsigned depth)
+{
+	unsigned i;
+	node_t *leftmost_child, *sibling;
+
+	for (i = 0; i < depth; i++)
+		malloc_printf("\t");
+	malloc_printf("%2"FMTu64"\n", node->key);
+
+	leftmost_child = phn_lchild_get(node_t, link, node);
+	if (leftmost_child == NULL)
+		return;
+	node_print(leftmost_child, depth + 1);
+
+	for (sibling = phn_next_get(node_t, link, leftmost_child); sibling !=
+	    NULL; sibling = phn_next_get(node_t, link, sibling)) {
+		node_print(sibling, depth + 1);
+	}
+}
+
+static void
+heap_print(const heap_t *heap)
+{
+	node_t *auxelm;
+
+	malloc_printf("vvv heap %p vvv\n", heap);
+	if (heap->ph_root == NULL)
+		goto label_return;
+
+	node_print(heap->ph_root, 0);
+
+	for (auxelm = phn_next_get(node_t, link, heap->ph_root); auxelm != NULL;
+	    auxelm = phn_next_get(node_t, link, auxelm)) {
+		assert_ptr_eq(phn_next_get(node_t, link, phn_prev_get(node_t,
+		    link, auxelm)), auxelm,
+		    "auxelm's prev doesn't link to auxelm");
+		node_print(auxelm, 0);
+	}
+
+label_return:
+	malloc_printf("^^^ heap %p ^^^\n", heap);
+}
+
+static unsigned
+node_validate(const node_t *node, const node_t *parent)
+{
+	unsigned nnodes = 1;
+	node_t *leftmost_child, *sibling;
+
+	if (parent != NULL) {
+		assert_d_ge(node_cmp_magic(node, parent), 0,
+		    "Child is less than parent");
+	}
+
+	leftmost_child = phn_lchild_get(node_t, link, node);
+	if (leftmost_child == NULL)
+		return (nnodes);
+	assert_ptr_eq((void *)phn_prev_get(node_t, link, leftmost_child),
+	    (void *)node, "Leftmost child does not link to node");
+	nnodes += node_validate(leftmost_child, node);
+
+	for (sibling = phn_next_get(node_t, link, leftmost_child); sibling !=
+	    NULL; sibling = phn_next_get(node_t, link, sibling)) {
+		assert_ptr_eq(phn_next_get(node_t, link, phn_prev_get(node_t,
+		    link, sibling)), sibling,
+		    "sibling's prev doesn't link to sibling");
+		nnodes += node_validate(sibling, node);
+	}
+	return (nnodes);
+}
+
+static unsigned
+heap_validate(const heap_t *heap)
+{
+	unsigned nnodes = 0;
+	node_t *auxelm;
+
+	if (heap->ph_root == NULL)
+		goto label_return;
+
+	nnodes += node_validate(heap->ph_root, NULL);
+
+	for (auxelm = phn_next_get(node_t, link, heap->ph_root); auxelm != NULL;
+	    auxelm = phn_next_get(node_t, link, auxelm)) {
+		assert_ptr_eq(phn_next_get(node_t, link, phn_prev_get(node_t,
+		    link, auxelm)), auxelm,
+		    "auxelm's prev doesn't link to auxelm");
+		nnodes += node_validate(auxelm, NULL);
+	}
+
+label_return:
+	if (false)
+		heap_print(heap);
+	return (nnodes);
+}
+
 TEST_BEGIN(test_ph_empty)
 {
-	ph_heap_t heap;
+	heap_t heap;
 
-	ph_new(&heap);
-
-	assert_ptr_null(ph_first(&heap), "Unexpected node");
+	heap_new(&heap);
+	assert_true(heap_empty(&heap), "Heap should be empty");
+	assert_ptr_null(heap_first(&heap), "Unexpected node");
 }
 TEST_END
 
+static void
+node_remove(heap_t *heap, node_t *node)
+{
+
+	heap_remove(heap, node);
+
+	node->magic = 0;
+}
+
+static node_t *
+node_remove_first(heap_t *heap)
+{
+	node_t *node = heap_remove_first(heap);
+	node->magic = 0;
+	return (node);
+}
+
 TEST_BEGIN(test_ph_random)
 {
 #define	NNODES 25
+#define	NBAGS 250
 #define	SEED 42
 	sfmt_t *sfmt;
-	ph_heap_t heap;
+	uint64_t bag[NNODES];
+	heap_t heap;
 	node_t nodes[NNODES];
 	unsigned i, j, k;
 
 	sfmt = init_gen_rand(SEED);
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < NBAGS; i++) {
+		switch (i) {
+		case 0:
+			/* Insert in order. */
+			for (j = 0; j < NNODES; j++)
+				bag[j] = j;
+			break;
+		case 1:
+			/* Insert in reverse order. */
+			for (j = 0; j < NNODES; j++)
+				bag[j] = NNODES - j - 1;
+			break;
+		default:
+			for (j = 0; j < NNODES; j++)
+				bag[j] = gen_rand64_range(sfmt, NNODES);
+		}
+
 		for (j = 1; j <= NNODES; j++) {
 			/* Initialize heap and nodes. */
-			ph_new(&heap);
+			heap_new(&heap);
+			assert_u_eq(heap_validate(&heap), 0,
+			    "Incorrect node count");
+			for (k = 0; k < j; k++) {
+				nodes[k].magic = NODE_MAGIC;
+				nodes[k].key = bag[k];
+			}
 
 			/* Insert nodes. */
 			for (k = 0; k < j; k++) {
-				ph_insert(&heap, &nodes[k].link);
-
-				assert_ptr_not_null(ph_first(&heap),
-				    "Heap should not be empty");
+				heap_insert(&heap, &nodes[k]);
+				if (i % 13 == 12) {
+					/* Trigger merging. */
+					assert_ptr_not_null(heap_first(&heap),
+					    "Heap should not be empty");
+				}
+				assert_u_eq(heap_validate(&heap), k + 1,
+				    "Incorrect node count");
 			}
 
+			assert_false(heap_empty(&heap),
+			    "Heap should not be empty");
+
 			/* Remove nodes. */
-			switch (i % 2) {
+			switch (i % 4) {
 			case 0:
-				for (k = 0; k < j; k++)
-					ph_remove(&heap, &nodes[k].link);
+				for (k = 0; k < j; k++) {
+					assert_u_eq(heap_validate(&heap), j - k,
+					    "Incorrect node count");
+					node_remove(&heap, &nodes[k]);
+					assert_u_eq(heap_validate(&heap), j - k
+					    - 1, "Incorrect node count");
+				}
 				break;
 			case 1:
-				for (k = j; k > 0; k--)
-					ph_remove(&heap, &nodes[k-1].link);
+				for (k = j; k > 0; k--) {
+					node_remove(&heap, &nodes[k-1]);
+					assert_u_eq(heap_validate(&heap), k - 1,
+					    "Incorrect node count");
+				}
 				break;
-			default:
+			case 2: {
+				node_t *prev = NULL;
+				for (k = 0; k < j; k++) {
+					node_t *node = node_remove_first(&heap);
+					assert_u_eq(heap_validate(&heap), j - k
+					    - 1, "Incorrect node count");
+					if (prev != NULL) {
+						assert_d_ge(node_cmp(node,
+						    prev), 0,
+						    "Bad removal order");
+					}
+					prev = node;
+				}
+				break;
+			} case 3: {
+				node_t *prev = NULL;
+				for (k = 0; k < j; k++) {
+					node_t *node = heap_first(&heap);
+					assert_u_eq(heap_validate(&heap), j - k,
+					    "Incorrect node count");
+					if (prev != NULL) {
+						assert_d_ge(node_cmp(node,
+						    prev), 0,
+						    "Bad removal order");
+					}
+					node_remove(&heap, node);
+					assert_u_eq(heap_validate(&heap), j - k
+					    - 1, "Incorrect node count");
+					prev = node;
+				}
+				break;
+			} default:
 				not_reached();
 			}
 
-			assert_ptr_null(ph_first(&heap),
-			    "Heap should not be empty");
+			assert_ptr_null(heap_first(&heap),
+			    "Heap should be empty");
+			assert_true(heap_empty(&heap), "Heap should be empty");
 		}
 	}
 	fini_gen_rand(sfmt);