Implement tsd.

Implement tsd, a TLS/TSD abstraction that uses TLS, pthreads TSD, or
both internally, depending on configuration.  Modify bootstrapping so
that no TSD is used until allocation is safe.
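
For example, the per-thread allocation counters become (a sketch; the
macros are expanded in jemalloc_internal.h.in and jemalloc.c later in
this diff):

    malloc_tsd_data(, thread_allocated, thread_allocated_t,
        THREAD_ALLOCATED_INITIALIZER)
    malloc_tsd_funcs(JEMALLOC_INLINE, thread_allocated, thread_allocated_t,
        THREAD_ALLOCATED_INITIALIZER, malloc_tsd_no_cleanup)

    /* Roughly equivalent generated interface: */
    bool thread_allocated_tsd_boot(void);
    thread_allocated_t *thread_allocated_tsd_get(void);
    void thread_allocated_tsd_set(thread_allocated_t *val);

    /* Callers then simply do: */
    thread_allocated_tsd_get()->allocated += usize;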

Remove malloc_[v]tprintf(), and use malloc_snprintf() instead.
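
Call sites that used the shared thread-local buffer now format into a
caller-owned stack buffer, e.g. the pattern used by prof_dump_maps()
below:

    char filename[PATH_MAX + 1];

    malloc_snprintf(filename, sizeof(filename), "/proc/%d/maps",
        (int)getpid());
    mfd = open(filename, O_RDONLY);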

Fix %p argument size handling in malloc_vsnprintf().
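
The %p conversion previously pulled its argument off the va_list using
whatever length modifier had been parsed from the format string; it now
always consumes a pointer-sized (uintptr_t) value, so, e.g. on LP64:

    char buf[32];
    void *p = (void *)0x7fffdeadbeef;

    malloc_snprintf(buf, sizeof(buf), "%p", p);	/* "0x7fffdeadbeef" */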

Fix a long-standing statistics-related bug in the "thread.arena"
mallctl that could cause crashes due to linked list corruption.
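
The old code only reassigned tcache->arena, so a tcache whose thread
switched arenas stayed linked into the previous arena's tcache_ql stats
list, which could corrupt the list when the tcache was later destroyed.
The new tcache_arena_dissociate()/tcache_arena_associate() helpers
unlink and relink the tcache; a sketch of the new flow in the
"thread.arena" handler:

    if (config_tcache) {
            tcache_t *tcache;
            if ((tcache = *tcache_tsd_get()) != NULL) {
                    /* Unlink from the old arena's stats list... */
                    tcache_arena_dissociate(tcache);
                    /* ...and relink into the new one. */
                    tcache_arena_associate(tcache, arena);
            }
    }
    arenas_tsd_set(&arena);
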
diff --git a/Makefile.in b/Makefile.in
index 01ed083..494ac9a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -50,7 +50,8 @@
 	@srcroot@src/ckh.c @srcroot@src/ctl.c @srcroot@src/extent.c \
 	@srcroot@src/hash.c @srcroot@src/huge.c @srcroot@src/mb.c \
 	@srcroot@src/mutex.c @srcroot@src/prof.c @srcroot@src/rtree.c \
-	@srcroot@src/stats.c @srcroot@src/tcache.c @srcroot@src/util.c
+	@srcroot@src/stats.c @srcroot@src/tcache.c @srcroot@src/util.c \
+	@srcroot@src/tsd.c
 ifeq (macho, @abi@)
 CSRCS += @srcroot@src/zone.c
 endif
diff --git a/configure.ac b/configure.ac
index 02d4f53..44ff6ee 100644
--- a/configure.ac
+++ b/configure.ac
@@ -763,6 +763,20 @@
 
 CPPFLAGS="$CPPFLAGS -D_REENTRANT"
 
+dnl Check whether the BSD-specific _malloc_thread_cleanup() exists.  If so, use
+dnl it rather than pthreads TSD cleanup functions to support cleanup during
+dnl thread exit, in order to avoid pthreads library recursion during
+dnl bootstrapping.
+force_tls="0"
+AC_CHECK_FUNC([_malloc_thread_cleanup],
+              [have__malloc_thread_cleanup="1"],
+              [have__malloc_thread_cleanup="0"]
+             )
+if test "x$have__malloc_thread_cleanup" = "x1" ; then
+  AC_DEFINE([JEMALLOC_MALLOC_THREAD_CLEANUP], [ ])
+  force_tls="1"
+fi
+
 dnl Disable lazy locking by default.
 AC_ARG_ENABLE([lazy_lock],
   [AS_HELP_STRING([--enable-lazy-lock],
@@ -795,6 +809,10 @@
 ,
 enable_tls="1"
 )
+if test "x${enable_tls}" = "x0" -a "x${force_tls}" = "x1" ; then
+  AC_MSG_RESULT([Forcing TLS to avoid allocator/threading bootstrap issues])
+  enable_tls="1"
+fi
 if test "x${enable_tls}" = "x1" ; then
 AC_MSG_CHECKING([for TLS])
 AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
@@ -812,6 +830,8 @@
 AC_SUBST([enable_tls])
 if test "x${enable_tls}" = "x1" ; then
   AC_DEFINE_UNQUOTED([JEMALLOC_TLS], [ ])
+elif test "x${force_tls}" = "x1" ; then
+  AC_MSG_ERROR([Failed to configure TLS, which is mandatory for correct function])
 fi
 
 dnl ============================================================================
diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
index 1609adc..c521489 100644
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@@ -391,6 +391,7 @@
 prof_ctx_t	*arena_prof_ctx_get(const void *ptr);
 void	arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
 void	*arena_malloc(size_t size, bool zero);
+void	*arena_malloc_prechosen(arena_t *arena, size_t size, bool zero);
 void	arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr);
 #endif
 
@@ -552,7 +553,7 @@
 	tcache_t *tcache;
 
 	assert(size != 0);
-	assert(QUANTUM_CEILING(size) <= arena_maxclass);
+	assert(size <= arena_maxclass);
 
 	if (size <= SMALL_MAXCLASS) {
 		if ((tcache = tcache_get()) != NULL)
@@ -571,6 +572,19 @@
 	}
 }
 
+JEMALLOC_INLINE void *
+arena_malloc_prechosen(arena_t *arena, size_t size, bool zero)
+{
+
+	assert(size != 0);
+	assert(size <= arena_maxclass);
+
+	if (size <= SMALL_MAXCLASS)
+		return (arena_malloc_small(arena, size, zero));
+	else
+		return (arena_malloc_large(arena, size, zero));
+}
+
 JEMALLOC_INLINE void
 arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 {
diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h
index 9a62ba1..8e24e8f 100644
--- a/include/jemalloc/internal/chunk.h
+++ b/include/jemalloc/internal/chunk.h
@@ -44,7 +44,8 @@
 
 void	*chunk_alloc(size_t size, bool base, bool *zero);
 void	chunk_dealloc(void *chunk, size_t size, bool unmap);
-bool	chunk_boot(void);
+bool	chunk_boot0(void);
+bool	chunk_boot1(void);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index dbfd3fc..387aabb 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -289,6 +289,7 @@
 #include "jemalloc/internal/stats.h"
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/tsd.h"
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/extent.h"
 #include "jemalloc/internal/arena.h"
@@ -316,6 +317,7 @@
 #include "jemalloc/internal/stats.h"
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/tsd.h"
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/extent.h"
@@ -335,6 +337,11 @@
 	uint64_t	allocated;
 	uint64_t	deallocated;
 } thread_allocated_t;
+/*
+ * The JEMALLOC_CONCAT() wrapper is necessary to pass {0, 0} via a cpp macro
+ * argument.
+ */
+#define	THREAD_ALLOCATED_INITIALIZER	JEMALLOC_CONCAT({0, 0})
 
 #undef JEMALLOC_H_STRUCTS
 /******************************************************************************/
@@ -356,25 +363,6 @@
 extern unsigned		ncpus;
 
 extern malloc_mutex_t	arenas_lock; /* Protects arenas initialization. */
-extern pthread_key_t	arenas_tsd;
-#ifdef JEMALLOC_TLS
-/*
- * Map of pthread_self() --> arenas[???], used for selecting an arena to use
- * for allocations.
- */
-extern __thread arena_t	*arenas_tls JEMALLOC_ATTR(tls_model("initial-exec"));
-#  define ARENA_GET()	arenas_tls
-#  define ARENA_SET(v)	do {						\
-	arenas_tls = (v);						\
-	pthread_setspecific(arenas_tsd, (void *)(v));			\
-} while (0)
-#else
-#  define ARENA_GET()	((arena_t *)pthread_getspecific(arenas_tsd))
-#  define ARENA_SET(v)	do {						\
-	pthread_setspecific(arenas_tsd, (void *)(v));			\
-} while (0)
-#endif
-
 /*
  * Arenas that are used to service external requests.  Not all elements of the
  * arenas array are necessarily used; arenas are created lazily as needed.
@@ -382,31 +370,8 @@
 extern arena_t		**arenas;
 extern unsigned		narenas;
 
-#ifdef JEMALLOC_TLS
-extern __thread thread_allocated_t	thread_allocated_tls;
-#    define ALLOCATED_GET() (thread_allocated_tls.allocated)
-#    define ALLOCATEDP_GET() (&thread_allocated_tls.allocated)
-#    define DEALLOCATED_GET() (thread_allocated_tls.deallocated)
-#    define DEALLOCATEDP_GET() (&thread_allocated_tls.deallocated)
-#    define ALLOCATED_ADD(a, d) do {					\
-	thread_allocated_tls.allocated += a;				\
-	thread_allocated_tls.deallocated += d;				\
-} while (0)
-#else
-#    define ALLOCATED_GET() (thread_allocated_get()->allocated)
-#    define ALLOCATEDP_GET() (&thread_allocated_get()->allocated)
-#    define DEALLOCATED_GET() (thread_allocated_get()->deallocated)
-#    define DEALLOCATEDP_GET() (&thread_allocated_get()->deallocated)
-#    define ALLOCATED_ADD(a, d) do {					\
-	thread_allocated_t *thread_allocated = thread_allocated_get();	\
-	thread_allocated->allocated += (a);				\
-	thread_allocated->deallocated += (d);				\
-} while (0)
-#endif
-extern pthread_key_t	thread_allocated_tsd;
-thread_allocated_t	*thread_allocated_get_hard(void);
-
 arena_t	*arenas_extend(unsigned ind);
+void	arenas_cleanup(void *arg);
 arena_t	*choose_arena_hard(void);
 void	jemalloc_prefork(void);
 void	jemalloc_postfork_parent(void);
@@ -420,6 +385,7 @@
 #include "jemalloc/internal/stats.h"
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/tsd.h"
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/extent.h"
@@ -447,6 +413,7 @@
 #include "jemalloc/internal/stats.h"
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/tsd.h"
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/extent.h"
 #include "jemalloc/internal/base.h"
@@ -454,14 +421,22 @@
 #include "jemalloc/internal/huge.h"
 
 #ifndef JEMALLOC_ENABLE_INLINE
+malloc_tsd_protos(JEMALLOC_ATTR(unused), arenas, arena_t *)
+
 size_t	s2u(size_t size);
 size_t	sa2u(size_t size, size_t alignment, size_t *run_size_p);
 arena_t	*choose_arena(void);
-thread_allocated_t	*thread_allocated_get(void);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
 /*
+ * Map of pthread_self() --> arenas[???], used for selecting an arena to use
+ * for allocations.
+ */
+malloc_tsd_externs(arenas, arena_t *)
+malloc_tsd_funcs(JEMALLOC_INLINE, arenas, arena_t *, NULL, arenas_cleanup)
+
+/*
  * Compute usable size that would result from allocating an object with the
  * specified size.
  */
@@ -572,25 +547,13 @@
 {
 	arena_t *ret;
 
-	ret = ARENA_GET();
-	if (ret == NULL) {
+	if ((ret = *arenas_tsd_get()) == NULL) {
 		ret = choose_arena_hard();
 		assert(ret != NULL);
 	}
 
 	return (ret);
 }
-
-JEMALLOC_INLINE thread_allocated_t *
-thread_allocated_get(void)
-{
-	thread_allocated_t *thread_allocated = (thread_allocated_t *)
-	    pthread_getspecific(thread_allocated_tsd);
-
-	if (thread_allocated == NULL)
-		return (thread_allocated_get_hard());
-	return (thread_allocated);
-}
 #endif
 
 #include "jemalloc/internal/bitmap.h"
@@ -611,6 +574,7 @@
 void	idalloc(void *ptr);
 void	*iralloc(void *ptr, size_t size, size_t extra, size_t alignment,
     bool zero, bool no_move);
+malloc_tsd_protos(JEMALLOC_ATTR(unused), thread_allocated, thread_allocated_t)
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
@@ -787,6 +751,10 @@
 		}
 	}
 }
+
+malloc_tsd_externs(thread_allocated, thread_allocated_t)
+malloc_tsd_funcs(JEMALLOC_INLINE, thread_allocated, thread_allocated_t,
+    THREAD_ALLOCATED_INITIALIZER, malloc_tsd_no_cleanup)
 #endif
 
 #include "jemalloc/internal/prof.h"
diff --git a/include/jemalloc/internal/private_namespace.h b/include/jemalloc/internal/private_namespace.h
index e7370fe..7103e68 100644
--- a/include/jemalloc/internal/private_namespace.h
+++ b/include/jemalloc/internal/private_namespace.h
@@ -155,10 +155,8 @@
 #define	malloc_mutex_unlock JEMALLOC_N(malloc_mutex_unlock)
 #define	malloc_printf JEMALLOC_N(malloc_printf)
 #define	malloc_snprintf JEMALLOC_N(malloc_snprintf)
-#define	malloc_tprintf JEMALLOC_N(malloc_tprintf)
 #define	malloc_vcprintf JEMALLOC_N(malloc_vcprintf)
 #define	malloc_vsnprintf JEMALLOC_N(malloc_vsnprintf)
-#define	malloc_vtprintf JEMALLOC_N(malloc_vtprintf)
 #define	malloc_write JEMALLOC_N(malloc_write)
 #define	mb_write JEMALLOC_N(mb_write)
 #define	opt_abort JEMALLOC_N(opt_abort)
diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h
index 4823192..231a387 100644
--- a/include/jemalloc/internal/prof.h
+++ b/include/jemalloc/internal/prof.h
@@ -23,10 +23,13 @@
 #define	PROF_TCMAX			1024
 
 /* Initial hash table size. */
-#define	PROF_CKH_MINITEMS	64
+#define	PROF_CKH_MINITEMS		64
 
 /* Size of memory buffer to use when writing dump files. */
-#define	PROF_DUMP_BUF_SIZE	65536
+#define	PROF_DUMP_BUFSIZE		65536
+
+/* Size of stack-allocated buffer used by prof_printf(). */
+#define	PROF_PRINTF_BUFSIZE		128
 
 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
@@ -179,29 +182,6 @@
  */
 extern bool	prof_promote;
 
-/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
-#ifdef JEMALLOC_TLS
-extern __thread prof_tdata_t	*prof_tdata_tls
-    JEMALLOC_ATTR(tls_model("initial-exec"));
-#  define PROF_TCACHE_GET()	prof_tdata_tls
-#  define PROF_TCACHE_SET(v)	do {					\
-	prof_tdata_tls = (v);						\
-	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
-} while (0)
-#else
-#  define PROF_TCACHE_GET()						\
-	((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
-#  define PROF_TCACHE_SET(v)	do {					\
-	pthread_setspecific(prof_tdata_tsd, (void *)(v));		\
-} while (0)
-#endif
-/*
- * Same contents as b2cnt_tls, but initialized such that the TSD destructor is
- * called when a thread exits, so that prof_tdata_tls contents can be merged,
- * unlinked, and deallocated.
- */
-extern pthread_key_t	prof_tdata_tsd;
-
 void	bt_init(prof_bt_t *bt, void **vec);
 void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
 prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
@@ -209,6 +189,7 @@
 bool	prof_mdump(const char *filename);
 void	prof_gdump(void);
 prof_tdata_t	*prof_tdata_init(void);
+void	prof_tdata_cleanup(void *arg);
 void	prof_boot0(void);
 void	prof_boot1(void);
 bool	prof_boot2(void);
@@ -223,7 +204,7 @@
 									\
 	assert(size == s2u(size));					\
 									\
-	prof_tdata = PROF_TCACHE_GET();					\
+	prof_tdata = *prof_tdata_tsd_get();				\
 	if (prof_tdata == NULL) {					\
 		prof_tdata = prof_tdata_init();				\
 		if (prof_tdata == NULL) {				\
@@ -270,6 +251,8 @@
 } while (0)
 
 #ifndef JEMALLOC_ENABLE_INLINE
+malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)
+
 void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
 prof_ctx_t	*prof_ctx_get(const void *ptr);
 void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
@@ -281,6 +264,11 @@
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
+/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
+malloc_tsd_externs(prof_tdata, prof_tdata_t *)
+malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
+    prof_tdata_cleanup)
+
 JEMALLOC_INLINE void
 prof_sample_threshold_update(prof_tdata_t *prof_tdata)
 {
@@ -359,7 +347,7 @@
 	/* Sampling logic is unnecessary if the interval is 1. */
 	assert(opt_lg_prof_sample != 0);
 
-	prof_tdata = PROF_TCACHE_GET();
+	prof_tdata = *prof_tdata_tsd_get();
 	assert(prof_tdata != NULL);
 
 	/* Take care to avoid integer overflow. */
diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h
index ed037cf..30e63a5 100644
--- a/include/jemalloc/internal/tcache.h
+++ b/include/jemalloc/internal/tcache.h
@@ -75,23 +75,6 @@
 
 extern tcache_bin_info_t	*tcache_bin_info;
 
-/* Map of thread-specific caches. */
-#ifdef JEMALLOC_TLS
-extern __thread tcache_t	*tcache_tls
-    JEMALLOC_ATTR(tls_model("initial-exec"));
-#  define TCACHE_GET()	tcache_tls
-#  define TCACHE_SET(v)	do {						\
-	tcache_tls = (tcache_t *)(v);					\
-	pthread_setspecific(tcache_tsd, (void *)(v));			\
-} while (0)
-#else
-#  define TCACHE_GET()	((tcache_t *)pthread_getspecific(tcache_tsd))
-#  define TCACHE_SET(v)	do {						\
-	pthread_setspecific(tcache_tsd, (void *)(v));			\
-} while (0)
-#endif
-extern pthread_key_t		tcache_tsd;
-
 /*
  * Number of tcache bins.  There are NBINS small-object bins, plus 0 or more
  * large-object bins.
@@ -105,18 +88,24 @@
     tcache_t *tcache);
 void	tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
     tcache_t *tcache);
+void	tcache_arena_associate(tcache_t *tcache, arena_t *arena);
+void	tcache_arena_dissociate(tcache_t *tcache);
 tcache_t *tcache_create(arena_t *arena);
 void	*tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin,
     size_t binind);
 void	tcache_destroy(tcache_t *tcache);
+void	tcache_thread_cleanup(void *arg);
 void	tcache_stats_merge(tcache_t *tcache, arena_t *arena);
-bool	tcache_boot(void);
+bool	tcache_boot0(void);
+bool	tcache_boot1(void);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES
 
 #ifndef JEMALLOC_ENABLE_INLINE
+malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache, tcache_t *)
+
 void	tcache_event(tcache_t *tcache);
 tcache_t *tcache_get(void);
 void	*tcache_alloc_easy(tcache_bin_t *tbin);
@@ -127,6 +116,11 @@
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_))
+/* Map of thread-specific caches. */
+malloc_tsd_externs(tcache, tcache_t *)
+malloc_tsd_funcs(JEMALLOC_INLINE, tcache, tcache_t *, NULL,
+    tcache_thread_cleanup)
+
 JEMALLOC_INLINE tcache_t *
 tcache_get(void)
 {
@@ -139,7 +133,7 @@
 	else if (opt_tcache == false)
 		return (NULL);
 
-	tcache = TCACHE_GET();
+	tcache = *tcache_tsd_get();
 	if ((uintptr_t)tcache <= (uintptr_t)2) {
 		if (tcache == NULL) {
 			tcache = tcache_create(choose_arena());
@@ -152,7 +146,8 @@
 				 * called after the tcache_thread_cleanup() was
 				 * called.
 				 */
-				TCACHE_SET((uintptr_t)2);
+				tcache = (tcache_t *)(uintptr_t)2;
+				tcache_tsd_set(&tcache);
 			}
 			return (NULL);
 		}
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
new file mode 100644
index 0000000..5a174ac
--- /dev/null
+++ b/include/jemalloc/internal/tsd.h
@@ -0,0 +1,319 @@
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+/* Maximum number of malloc_tsd users with cleanup functions. */
+#define	MALLOC_TSD_CLEANUPS_MAX	8
+
+typedef struct malloc_tsd_cleanup_s malloc_tsd_cleanup_t;
+struct malloc_tsd_cleanup_s {
+	bool	(*f)(void *);
+	void	*arg;
+};
+
+/*
+ * TLS/TSD-agnostic macro-based implementation of thread-specific data.  There
+ * are four macros that support (at least) three use cases: file-private,
+ * library-private, and library-private inlined.  Following is an example
+ * library-private tsd variable:
+ *
+ * In example.h:
+ *   typedef struct {
+ *           int x;
+ *           int y;
+ *   } example_t;
+ *   #define EX_INITIALIZER JEMALLOC_CONCAT({0, 0})
+ *   malloc_tsd_protos(, example, example_t *)
+ *   malloc_tsd_externs(example, example_t *)
+ * In example.c:
+ *   malloc_tsd_data(, example, example_t *, EX_INITIALIZER)
+ *   malloc_tsd_funcs(, example, example_t *, EX_INITIALIZER,
+ *       example_tsd_cleanup)
+ *
+ * The result is a set of generated functions, e.g.:
+ *
+ *   bool example_tsd_boot(void) {...}
+ *   example_t **example_tsd_get() {...}
+ *   void example_tsd_set(example_t **val) {...}
+ *
+ * Note that all of the functions deal in terms of (a_type *) rather than
+ * (a_type)  so that it is possible to support non-pointer types (unlike
+ * pthreads TSD).  example_tsd_cleanup() is passed an (a_type *) pointer that is
+ * cast to (void *).  This means that the cleanup function needs to cast *and*
+ * dereference the function argument, e.g.:
+ *
+ *   void
+ *   example_tsd_cleanup(void *arg)
+ *   {
+ *           example_t *example = *(example_t **)arg;
+ *
+ *           [...]
+ *           if ([want the cleanup function to be called again]) {
+ *                   example_tsd_set(&example);
+ *           }
+ *   }
+ *
+ * If example_tsd_set() is called within example_tsd_cleanup(), it will be
+ * called again.  This is similar to how pthreads TSD destruction works, except
+ * that pthreads only calls the cleanup function again if the value was set to
+ * non-NULL.
+ */
+
+/* malloc_tsd_protos(). */
+#define	malloc_tsd_protos(a_attr, a_name, a_type)			\
+a_attr bool								\
+a_name##_tsd_boot(void);						\
+a_attr a_type *								\
+a_name##_tsd_get(void);							\
+a_attr void								\
+a_name##_tsd_set(a_type *val);
+
+/* malloc_tsd_externs(). */
+#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
+#define	malloc_tsd_externs(a_name, a_type)				\
+extern __thread a_type	a_name##_tls;					\
+extern __thread bool	a_name##_initialized;				\
+extern bool		a_name##_booted;
+#elif (defined(JEMALLOC_TLS))
+#define	malloc_tsd_externs(a_name, a_type)				\
+extern __thread a_type	a_name##_tls;					\
+extern pthread_key_t	a_name##_tsd;					\
+extern bool		a_name##_booted;
+#else
+#define	malloc_tsd_externs(a_name, a_type)				\
+extern pthread_key_t	a_name##_tsd;					\
+extern bool		a_name##_booted;
+#endif
+
+/* malloc_tsd_data(). */
+#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
+#define	malloc_tsd_data(a_attr, a_name, a_type, a_initializer)		\
+a_attr __thread a_type JEMALLOC_ATTR(tls_model("initial-exec"))		\
+    a_name##_tls = a_initializer;					\
+a_attr __thread bool JEMALLOC_ATTR(tls_model("initial-exec"))		\
+    a_name##_initialized = false;					\
+a_attr bool		a_name##_booted = false;
+#elif (defined(JEMALLOC_TLS))
+#define	malloc_tsd_data(a_attr, a_name, a_type, a_initializer)		\
+a_attr __thread a_type JEMALLOC_ATTR(tls_model("initial-exec"))		\
+    a_name##_tls = a_initializer;					\
+a_attr pthread_key_t	a_name##_tsd;					\
+a_attr bool		a_name##_booted = false;
+#else
+#define	malloc_tsd_data(a_attr, a_name, a_type, a_initializer)		\
+a_attr pthread_key_t	a_name##_tsd;					\
+a_attr bool		a_name##_booted = false;
+#endif
+
+/* malloc_tsd_funcs(). */
+#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
+#define	malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer,		\
+    a_cleanup)								\
+/* Initialization/cleanup. */						\
+a_attr void								\
+a_name##_tsd_cleanup_wrapper(void *arg)					\
+{									\
+									\
+}									\
+a_attr bool								\
+a_name##_tsd_cleanup_pending(void *arg)					\
+{									\
+	void (*cleanup)(void *) = arg;					\
+									\
+	if (a_name##_initialized) {					\
+		a_name##_initialized = false;				\
+		cleanup(&a_name##_tls);					\
+	}								\
+	return (a_name##_initialized);					\
+}									\
+a_attr bool								\
+a_name##_tsd_boot(void)							\
+{									\
+									\
+	if (a_cleanup != malloc_tsd_no_cleanup) {			\
+		malloc_tsd_cleanup_register(				\
+		    &a_name##_tsd_cleanup_pending, a_cleanup);		\
+	}								\
+	a_name##_booted = true;						\
+	return (false);							\
+}									\
+/* Get/set. */								\
+a_attr a_type *								\
+a_name##_tsd_get(void)							\
+{									\
+									\
+	assert(a_name##_booted);					\
+	return (&a_name##_tls);						\
+}									\
+a_attr void								\
+a_name##_tsd_set(a_type *val)						\
+{									\
+									\
+	assert(a_name##_booted);					\
+	a_name##_tls = (*val);						\
+	if (a_cleanup != malloc_tsd_no_cleanup)				\
+		a_name##_initialized = true;				\
+}
+#elif (defined(JEMALLOC_TLS))
+#define	malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer,		\
+    a_cleanup)								\
+/* Initialization/cleanup. */						\
+a_attr void								\
+a_name##_tsd_cleanup_wrapper(void *arg)					\
+{									\
+									\
+}									\
+a_attr bool								\
+a_name##_tsd_boot(void)							\
+{									\
+									\
+	if (a_cleanup != malloc_tsd_no_cleanup) {			\
+		if (pthread_key_create(&a_name##_tsd, a_cleanup) != 0)	\
+			return (true);					\
+	}								\
+	a_name##_booted = true;						\
+	return (false);							\
+}									\
+/* Get/set. */								\
+a_attr a_type *								\
+a_name##_tsd_get(void)							\
+{									\
+									\
+	assert(a_name##_booted);					\
+	return (&a_name##_tls);						\
+}									\
+a_attr void								\
+a_name##_tsd_set(a_type *val)						\
+{									\
+									\
+	assert(a_name##_booted);					\
+	a_name##_tls = (*val);						\
+	if (a_cleanup != malloc_tsd_no_cleanup) {			\
+		if (pthread_setspecific(a_name##_tsd,			\
+		    (void *)(&a_name##_tls))) {				\
+			malloc_write("<jemalloc>: Error"		\
+			    " setting TSD for "#a_name"\n");		\
+			if (opt_abort)					\
+				abort();				\
+		}							\
+	}								\
+}
+#else
+#define	malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer,		\
+    a_cleanup)								\
+/* Data structure. */							\
+typedef struct {							\
+	bool	isstatic;						\
+	bool	initialized;						\
+	a_type	val;							\
+} a_name##_tsd_wrapper_t;						\
+/* Initialization/cleanup. */						\
+a_attr void								\
+a_name##_tsd_cleanup_wrapper(void *arg)					\
+{									\
+	a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)arg;\
+									\
+	if (a_cleanup != malloc_tsd_no_cleanup &&			\
+	    wrapper->initialized) {					\
+		wrapper->initialized = false;				\
+		a_cleanup(&wrapper->val);				\
+		if (wrapper->initialized) {				\
+			/* Trigger another cleanup round. */		\
+			if (pthread_setspecific(a_name##_tsd,		\
+			    (void *)wrapper)) {				\
+				malloc_write("<jemalloc>: Error"	\
+				    " setting TSD for "#a_name"\n");	\
+				if (opt_abort)				\
+					abort();			\
+			}						\
+			return;						\
+		}							\
+	}								\
+	if (wrapper->isstatic == false)					\
+		malloc_tsd_dalloc(wrapper);				\
+}									\
+a_attr bool								\
+a_name##_tsd_boot(void)							\
+{									\
+									\
+	if (pthread_key_create(&a_name##_tsd,				\
+	    a_name##_tsd_cleanup_wrapper) != 0)				\
+		return (true);						\
+	a_name##_booted = true;						\
+	return (false);							\
+}									\
+/* Get/set. */								\
+a_attr a_name##_tsd_wrapper_t *						\
+a_name##_tsd_get_wrapper(void)						\
+{									\
+	a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)	\
+	    pthread_getspecific(a_name##_tsd);				\
+									\
+	if (wrapper == NULL) {						\
+		wrapper = (a_name##_tsd_wrapper_t *)			\
+		    malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t));	\
+		if (wrapper == NULL) {					\
+			static a_name##_tsd_wrapper_t			\
+			    a_name##_tsd_static_data =			\
+			    {true, false, a_initializer};		\
+			malloc_write("<jemalloc>: Error allocating"	\
+			    " TSD for "#a_name"\n");			\
+			if (opt_abort)					\
+				abort();				\
+			wrapper = &a_name##_tsd_static_data;		\
+		} else {						\
+			static a_type tsd_static_data = a_initializer;	\
+			wrapper->isstatic = false;			\
+			wrapper->val = tsd_static_data;			\
+		}							\
+		if (pthread_setspecific(a_name##_tsd,			\
+		    (void *)wrapper)) {					\
+			malloc_write("<jemalloc>: Error setting"	\
+			    " TSD for "#a_name"\n");			\
+			if (opt_abort)					\
+				abort();				\
+		}							\
+	}								\
+	return (wrapper);						\
+}									\
+a_attr a_type *								\
+a_name##_tsd_get(void)							\
+{									\
+	a_name##_tsd_wrapper_t *wrapper;				\
+									\
+	assert(a_name##_booted);					\
+	wrapper = a_name##_tsd_get_wrapper();				\
+	return (&wrapper->val);						\
+}									\
+a_attr void								\
+a_name##_tsd_set(a_type *val)						\
+{									\
+	a_name##_tsd_wrapper_t *wrapper;				\
+									\
+	assert(a_name##_booted);					\
+	wrapper = a_name##_tsd_get_wrapper();				\
+	wrapper->val = *(val);						\
+	if (a_cleanup != malloc_tsd_no_cleanup)				\
+		wrapper->initialized = true;				\
+}
+#endif
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+void	*malloc_tsd_malloc(size_t size);
+void	malloc_tsd_dalloc(void *wrapper);
+void	malloc_tsd_no_cleanup(void *);
+void	malloc_tsd_cleanup_register(bool (*f)(void *), void *arg);
+void	malloc_tsd_boot(void);
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h
index c5f7520..fb354da 100644
--- a/include/jemalloc/internal/util.h
+++ b/include/jemalloc/internal/util.h
@@ -5,12 +5,18 @@
 #define	BUFERROR_BUF		64
 
 /*
- * Size of static buffer used by malloc_[v]{,c,t}printf().  This must be large
- * enough for all possible uses within jemalloc.
+ * Size of stack-allocated buffer used by malloc_{,v,vc}printf().  This must be
+ * large enough for all possible uses within jemalloc.
  */
 #define	MALLOC_PRINTF_BUFSIZE	4096
 
 /*
+ * Wrap a cpp argument that contains commas such that it isn't broken up into
+ * multiple arguments.
+ */
+#define JEMALLOC_CONCAT(...) __VA_ARGS__
+
+/*
  * Define a custom assert() in order to reduce the chances of deadlock during
  * assertion failure.
  */
@@ -77,13 +83,6 @@
     va_list ap);
 int	malloc_snprintf(char *str, size_t size, const char *format, ...)
     JEMALLOC_ATTR(format(printf, 3, 4));
-/*
- * malloc_[v]tprintf() prints to a thread-local string buffer, so the result is
- * overwritten by the next call to malloc_[v]{,c,t}printf().
- */
-const char *	malloc_vtprintf(const char *format, va_list ap);
-const char *	malloc_tprintf(const char *format, ...)
-    JEMALLOC_ATTR(format(printf, 1, 2));
 void	malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque,
     const char *format, va_list ap);
 void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque,
diff --git a/include/jemalloc/jemalloc_defs.h.in b/include/jemalloc/jemalloc_defs.h.in
index 434dd36..838f561 100644
--- a/include/jemalloc/jemalloc_defs.h.in
+++ b/include/jemalloc/jemalloc_defs.h.in
@@ -59,6 +59,15 @@
  */
 #undef JEMALLOC_OSSPIN
 
+/*
+ * Defined if _malloc_thread_cleanup() exists.  At least in the case of
+ * FreeBSD, pthread_key_create() allocates, which if used during malloc
+ * bootstrapping will cause recursion into the pthreads library.  Therefore, if
+ * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in
+ * malloc_tsd.
+ */
+#undef JEMALLOC_MALLOC_THREAD_CLEANUP
+
 /* Defined if __attribute__((...)) syntax is supported. */
 #undef JEMALLOC_HAVE_ATTR
 #ifdef JEMALLOC_HAVE_ATTR
diff --git a/src/chunk.c b/src/chunk.c
index b908650..f50e840 100644
--- a/src/chunk.c
+++ b/src/chunk.c
@@ -100,7 +100,7 @@
 }
 
 bool
-chunk_boot(void)
+chunk_boot0(void)
 {
 
 	/* Set variables according to the value of opt_lg_chunk. */
@@ -114,8 +114,6 @@
 			return (true);
 		memset(&stats_chunks, 0, sizeof(chunk_stats_t));
 	}
-	if (chunk_mmap_boot())
-		return (true);
 	if (config_dss && chunk_dss_boot())
 		return (true);
 	if (config_ivsalloc) {
@@ -127,3 +125,13 @@
 
 	return (false);
 }
+
+bool
+chunk_boot1(void)
+{
+
+	if (chunk_mmap_boot())
+		return (true);
+
+	return (false);
+}
diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c
index 6ea2118..749a2da 100644
--- a/src/chunk_mmap.c
+++ b/src/chunk_mmap.c
@@ -8,20 +8,9 @@
  * Used by chunk_alloc_mmap() to decide whether to attempt the fast path and
  * potentially avoid some system calls.
  */
-#ifdef JEMALLOC_TLS
-static __thread bool	mmap_unaligned_tls
-    JEMALLOC_ATTR(tls_model("initial-exec"));
-#define	MMAP_UNALIGNED_GET()	mmap_unaligned_tls
-#define	MMAP_UNALIGNED_SET(v)	do {					\
-	mmap_unaligned_tls = (v);					\
-} while (0)
-#else
-static pthread_key_t	mmap_unaligned_tsd;
-#define	MMAP_UNALIGNED_GET()	((bool)pthread_getspecific(mmap_unaligned_tsd))
-#define	MMAP_UNALIGNED_SET(v)	do {					\
-	pthread_setspecific(mmap_unaligned_tsd, (void *)(v));		\
-} while (0)
-#endif
+malloc_tsd_data(static, mmap_unaligned, bool, false)
+malloc_tsd_funcs(JEMALLOC_INLINE, mmap_unaligned, bool, false,
+    malloc_tsd_no_cleanup)
 
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
@@ -128,8 +117,10 @@
 	 * the next chunk_alloc_mmap() execution tries the fast allocation
 	 * method.
 	 */
-	if (unaligned == false)
-		MMAP_UNALIGNED_SET(false);
+	if (unaligned == false && mmap_unaligned_booted) {
+		bool mu = false;
+		mmap_unaligned_tsd_set(&mu);
+	}
 
 	return (ret);
 }
@@ -167,7 +158,7 @@
 	 * fast method next time.
 	 */
 
-	if (MMAP_UNALIGNED_GET() == false) {
+	if (mmap_unaligned_booted && *mmap_unaligned_tsd_get() == false) {
 		size_t offset;
 
 		ret = pages_map(NULL, size, noreserve);
@@ -176,7 +167,8 @@
 
 		offset = CHUNK_ADDR2OFFSET(ret);
 		if (offset != 0) {
-			MMAP_UNALIGNED_SET(true);
+			bool mu = true;
+			mmap_unaligned_tsd_set(&mu);
 			/* Try to extend chunk boundary. */
 			if (pages_map((void *)((uintptr_t)ret + size),
 			    chunksize - offset, noreserve) == NULL) {
@@ -225,11 +217,15 @@
 chunk_mmap_boot(void)
 {
 
-#ifndef JEMALLOC_TLS
-	if (pthread_key_create(&mmap_unaligned_tsd, NULL) != 0) {
-		malloc_write("<jemalloc>: Error in pthread_key_create()\n");
+	/*
+	 * XXX For the non-TLS implementation of tsd, the first access from
+	 * each thread causes memory allocation.  The result is a bootstrapping
+	 * problem for this particular use case, so for now just disable it by
+	 * leaving it in an unbooted state.
+	 */
+#ifdef JEMALLOC_TLS
+	if (mmap_unaligned_tsd_boot())
 		return (true);
-	}
 #endif
 
 	return (false);
diff --git a/src/ctl.c b/src/ctl.c
index 1ef84e8..e17e503 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -978,13 +978,13 @@
 
 	VOID();
 
-	tcache = TCACHE_GET();
-	if (tcache == NULL) {
+	if ((tcache = *tcache_tsd_get()) == NULL) {
 		ret = 0;
 		goto RETURN;
 	}
 	tcache_destroy(tcache);
-	TCACHE_SET(NULL);
+	tcache = NULL;
+	tcache_tsd_set(&tcache);
 
 	ret = 0;
 RETURN:
@@ -1012,23 +1012,26 @@
 
 		/* Initialize arena if necessary. */
 		malloc_mutex_lock(&arenas_lock);
-		if ((arena = arenas[newind]) == NULL)
-			arena = arenas_extend(newind);
-		arenas[oldind]->nthreads--;
-		arenas[newind]->nthreads++;
-		malloc_mutex_unlock(&arenas_lock);
-		if (arena == NULL) {
+		if ((arena = arenas[newind]) == NULL && (arena =
+		    arenas_extend(newind)) == NULL) {
+			malloc_mutex_unlock(&arenas_lock);
 			ret = EAGAIN;
 			goto RETURN;
 		}
+		assert(arena == arenas[newind]);
+		arenas[oldind]->nthreads--;
+		arenas[newind]->nthreads++;
+		malloc_mutex_unlock(&arenas_lock);
 
 		/* Set new arena association. */
-		ARENA_SET(arena);
 		if (config_tcache) {
-			tcache_t *tcache = TCACHE_GET();
-			if (tcache != NULL)
-				tcache->arena = arena;
+			tcache_t *tcache;
+			if ((tcache = *tcache_tsd_get()) != NULL) {
+				tcache_arena_dissociate(tcache);
+				tcache_arena_associate(tcache, arena);
+			}
 		}
+		arenas_tsd_set(&arena);
 	}
 
 	ret = 0;
@@ -1036,11 +1039,14 @@
 	return (ret);
 }
 
-CTL_RO_NL_CGEN(config_stats, thread_allocated, ALLOCATED_GET(), uint64_t)
-CTL_RO_NL_CGEN(config_stats, thread_allocatedp, ALLOCATEDP_GET(), uint64_t *)
-CTL_RO_NL_CGEN(config_stats, thread_deallocated, DEALLOCATED_GET(), uint64_t)
-CTL_RO_NL_CGEN(config_stats, thread_deallocatedp, DEALLOCATEDP_GET(),
-    uint64_t *)
+CTL_RO_NL_CGEN(config_stats, thread_allocated,
+    thread_allocated_tsd_get()->allocated, uint64_t)
+CTL_RO_NL_CGEN(config_stats, thread_allocatedp,
+    &thread_allocated_tsd_get()->allocated, uint64_t *)
+CTL_RO_NL_CGEN(config_stats, thread_deallocated,
+    thread_allocated_tsd_get()->deallocated, uint64_t)
+CTL_RO_NL_CGEN(config_stats, thread_deallocatedp,
+    &thread_allocated_tsd_get()->deallocated, uint64_t *)
 
 /******************************************************************************/
 
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 2610452..331e473 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -4,36 +4,9 @@
 /******************************************************************************/
 /* Data. */
 
-malloc_mutex_t		arenas_lock;
-arena_t			**arenas;
-unsigned		narenas;
-
-pthread_key_t		arenas_tsd;
-#ifdef JEMALLOC_TLS
-__thread arena_t	*arenas_tls JEMALLOC_ATTR(tls_model("initial-exec"));
-#endif
-
-#ifdef JEMALLOC_TLS
-__thread thread_allocated_t	thread_allocated_tls;
-#endif
-pthread_key_t		thread_allocated_tsd;
-
-/* Set to true once the allocator has been initialized. */
-static bool		malloc_initialized = false;
-
-/* Used to let the initializing thread recursively allocate. */
-static pthread_t	malloc_initializer = (unsigned long)0;
-
-/* Used to avoid initialization races. */
-static malloc_mutex_t	init_lock = MALLOC_MUTEX_INITIALIZER;
-
-#ifdef DYNAMIC_PAGE_SHIFT
-size_t		pagesize;
-size_t		pagesize_mask;
-size_t		lg_pagesize;
-#endif
-
-unsigned	ncpus;
+malloc_tsd_data(, arenas, arena_t *, NULL)
+malloc_tsd_data(, thread_allocated, thread_allocated_t,
+    THREAD_ALLOCATED_INITIALIZER)
 
 /* Runtime configuration options. */
 const char	*je_malloc_conf JEMALLOC_ATTR(visibility("default"));
@@ -52,15 +25,32 @@
 bool	opt_zero = false;
 size_t	opt_narenas = 0;
 
+#ifdef DYNAMIC_PAGE_SHIFT
+size_t		pagesize;
+size_t		pagesize_mask;
+size_t		lg_pagesize;
+#endif
+
+unsigned	ncpus;
+
+malloc_mutex_t		arenas_lock;
+arena_t			**arenas;
+unsigned		narenas;
+
+/* Set to true once the allocator has been initialized. */
+static bool		malloc_initialized = false;
+
+/* Used to let the initializing thread recursively allocate. */
+static pthread_t	malloc_initializer = (unsigned long)0;
+
+/* Used to avoid initialization races. */
+static malloc_mutex_t	init_lock = MALLOC_MUTEX_INITIALIZER;
+
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
 
 static void	stats_print_atexit(void);
 static unsigned	malloc_ncpus(void);
-static void	arenas_cleanup(void *arg);
-#ifndef JEMALLOC_TLS
-static void	thread_allocated_cleanup(void *arg);
-#endif
 static bool	malloc_conf_next(char const **opts_p, char const **k_p,
     size_t *klen_p, char const **v_p, size_t *vlen_p);
 static void	malloc_conf_error(const char *msg, const char *k, size_t klen,
@@ -156,7 +146,7 @@
 		malloc_mutex_unlock(&arenas_lock);
 	}
 
-	ARENA_SET(ret);
+	arenas_tsd_set(&ret);
 
 	return (ret);
 }
@@ -197,26 +187,6 @@
 	je_malloc_stats_print(NULL, NULL, NULL);
 }
 
-thread_allocated_t *
-thread_allocated_get_hard(void)
-{
-	thread_allocated_t *thread_allocated = (thread_allocated_t *)
-	    imalloc(sizeof(thread_allocated_t));
-	if (thread_allocated == NULL) {
-		static thread_allocated_t static_thread_allocated = {0, 0};
-		malloc_write("<jemalloc>: Error allocating TSD;"
-		    " mallctl(\"thread.{de,}allocated[p]\", ...)"
-		    " will be inaccurate\n");
-		if (opt_abort)
-			abort();
-		return (&static_thread_allocated);
-	}
-	pthread_setspecific(thread_allocated_tsd, thread_allocated);
-	thread_allocated->allocated = 0;
-	thread_allocated->deallocated = 0;
-	return (thread_allocated);
-}
-
 /*
  * End miscellaneous support functions.
  */
@@ -241,32 +211,16 @@
 	return (ret);
 }
 
-static void
+void
 arenas_cleanup(void *arg)
 {
-	arena_t *arena = (arena_t *)arg;
+	arena_t *arena = *(arena_t **)arg;
 
 	malloc_mutex_lock(&arenas_lock);
 	arena->nthreads--;
 	malloc_mutex_unlock(&arenas_lock);
 }
 
-#ifndef JEMALLOC_TLS
-static void
-thread_allocated_cleanup(void *arg)
-{
-	uint64_t *allocated = (uint64_t *)arg;
-
-	if (allocated != NULL)
-		idalloc(allocated);
-}
-#endif
-
-/*
- * FreeBSD's pthreads implementation calls malloc(3), so the malloc
- * implementation has to take pains to avoid infinite recursion during
- * initialization.
- */
 static inline bool
 malloc_init(void)
 {
@@ -604,6 +558,7 @@
 	}
 #endif
 
+	malloc_tsd_boot();
 	if (config_prof)
 		prof_boot0();
 
@@ -631,7 +586,7 @@
 		}
 	}
 
-	if (chunk_boot()) {
+	if (chunk_boot0()) {
 		malloc_mutex_unlock(&init_lock);
 		return (true);
 	}
@@ -646,7 +601,7 @@
 
 	arena_boot();
 
-	if (config_tcache && tcache_boot()) {
+	if (config_tcache && tcache_boot0()) {
 		malloc_mutex_unlock(&init_lock);
 		return (true);
 	}
@@ -656,23 +611,9 @@
 		return (true);
 	}
 
-#ifndef JEMALLOC_TLS
-	/* Initialize allocation counters before any allocations can occur. */
-	if (config_stats && pthread_key_create(&thread_allocated_tsd,
-	    thread_allocated_cleanup) != 0) {
-		malloc_mutex_unlock(&init_lock);
-		return (true);
-	}
-#endif
-
 	if (malloc_mutex_init(&arenas_lock))
 		return (true);
 
-	if (pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) {
-		malloc_mutex_unlock(&init_lock);
-		return (true);
-	}
-
 	/*
 	 * Create enough scaffolding to allow recursive allocation in
 	 * malloc_ncpus().
@@ -691,25 +632,38 @@
 		return (true);
 	}
 
-	/*
-	 * Assign the initial arena to the initial thread, in order to avoid
-	 * spurious creation of an extra arena if the application switches to
-	 * threaded mode.
-	 */
-	ARENA_SET(arenas[0]);
-	arenas[0]->nthreads++;
+	/* Initialize allocation counters before any allocations can occur. */
+	if (config_stats && thread_allocated_tsd_boot()) {
+		malloc_mutex_unlock(&init_lock);
+		return (true);
+	}
 
 	if (config_prof && prof_boot2()) {
 		malloc_mutex_unlock(&init_lock);
 		return (true);
 	}
 
+	if (arenas_tsd_boot()) {
+		malloc_mutex_unlock(&init_lock);
+		return (true);
+	}
+
+	if (config_tcache && tcache_boot1()) {
+		malloc_mutex_unlock(&init_lock);
+		return (true);
+	}
+
 	/* Get number of CPUs. */
 	malloc_initializer = pthread_self();
 	malloc_mutex_unlock(&init_lock);
 	ncpus = malloc_ncpus();
 	malloc_mutex_lock(&init_lock);
 
+	if (chunk_boot1()) {
+		malloc_mutex_unlock(&init_lock);
+		return (true);
+	}
+
 	if (opt_narenas == 0) {
 		/*
 		 * For SMP systems, create more than one arena per CPU by
@@ -844,7 +798,7 @@
 		prof_malloc(ret, usize, cnt);
 	if (config_stats && ret != NULL) {
 		assert(usize == isalloc(ret));
-		ALLOCATED_ADD(usize, 0);
+		thread_allocated_tsd_get()->allocated += usize;
 	}
 	return (ret);
 }
@@ -939,7 +893,7 @@
 RETURN:
 	if (config_stats && result != NULL) {
 		assert(usize == isalloc(result));
-		ALLOCATED_ADD(usize, 0);
+		thread_allocated_tsd_get()->allocated += usize;
 	}
 	if (config_prof && opt_prof && result != NULL)
 		prof_malloc(result, usize, cnt);
@@ -1044,7 +998,7 @@
 		prof_malloc(ret, usize, cnt);
 	if (config_stats && ret != NULL) {
 		assert(usize == isalloc(ret));
-		ALLOCATED_ADD(usize, 0);
+		thread_allocated_tsd_get()->allocated += usize;
 	}
 	return (ret);
 }
@@ -1173,8 +1127,11 @@
 	if (config_prof && opt_prof)
 		prof_realloc(ret, usize, cnt, old_size, old_ctx);
 	if (config_stats && ret != NULL) {
+		thread_allocated_t *ta;
 		assert(usize == isalloc(ret));
-		ALLOCATED_ADD(usize, old_size);
+		ta = thread_allocated_tsd_get();
+		ta->allocated += usize;
+		ta->deallocated += old_size;
 	}
 	return (ret);
 }
@@ -1197,7 +1154,7 @@
 			usize = isalloc(ptr);
 		}
 		if (config_stats)
-			ALLOCATED_ADD(0, usize);
+			thread_allocated_tsd_get()->deallocated += usize;
 		idalloc(ptr);
 	}
 }
@@ -1412,7 +1369,7 @@
 	*ptr = p;
 	if (config_stats) {
 		assert(usize == isalloc(p));
-		ALLOCATED_ADD(usize, 0);
+		thread_allocated_tsd_get()->allocated += usize;
 	}
 	return (ALLOCM_SUCCESS);
 OOM:
@@ -1502,8 +1459,12 @@
 	}
 
 	*ptr = q;
-	if (config_stats)
-		ALLOCATED_ADD(usize, old_size);
+	if (config_stats) {
+		thread_allocated_t *ta;
+		ta = thread_allocated_tsd_get();
+		ta->allocated += usize;
+		ta->deallocated += old_size;
+	}
 	return (ALLOCM_SUCCESS);
 ERR:
 	if (no_move)
@@ -1556,7 +1517,7 @@
 		prof_free(ptr, usize);
 	}
 	if (config_stats)
-		ALLOCATED_ADD(0, usize);
+		thread_allocated_tsd_get()->deallocated += usize;
 	idalloc(ptr);
 
 	return (ALLOCM_SUCCESS);
diff --git a/src/prof.c b/src/prof.c
index 9c32737..ba0b64e 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -14,6 +14,8 @@
 /******************************************************************************/
 /* Data. */
 
+malloc_tsd_data(, prof_tdata, prof_tdata_t *, NULL)
+
 bool		opt_prof = false;
 bool		opt_prof_active = true;
 size_t		opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT;
@@ -26,12 +28,6 @@
 uint64_t	prof_interval;
 bool		prof_promote;
 
-#ifdef JEMALLOC_TLS
-__thread prof_tdata_t	*prof_tdata_tls
-    JEMALLOC_ATTR(tls_model("initial-exec"));
-#endif
-pthread_key_t	prof_tdata_tsd;
-
 /*
  * Global hash of (prof_bt_t *)-->(prof_ctx_t *).  This is the master data
  * structure that knows about all backtraces currently captured.
@@ -50,7 +46,7 @@
  * all profile dumps.  The buffer is implicitly protected by bt2ctx_mtx, since
  * it must be locked anyway during dumping.
  */
-static char		prof_dump_buf[PROF_DUMP_BUF_SIZE];
+static char		prof_dump_buf[PROF_DUMP_BUFSIZE];
 static unsigned		prof_dump_buf_end;
 static int		prof_dump_fd;
 
@@ -91,7 +87,6 @@
 static void	prof_bt_hash(const void *key, unsigned minbits, size_t *hash1,
     size_t *hash2);
 static bool	prof_bt_keycomp(const void *k1, const void *k2);
-static void	prof_tdata_cleanup(void *arg);
 
 /******************************************************************************/
 
@@ -439,7 +434,7 @@
 
 	cassert(config_prof);
 
-	prof_tdata = PROF_TCACHE_GET();
+	prof_tdata = *prof_tdata_tsd_get();
 	if (prof_tdata == NULL) {
 		prof_tdata = prof_tdata_init();
 		if (prof_tdata == NULL)
@@ -599,16 +594,16 @@
 	slen = strlen(s);
 	while (i < slen) {
 		/* Flush the buffer if it is full. */
-		if (prof_dump_buf_end == PROF_DUMP_BUF_SIZE)
+		if (prof_dump_buf_end == PROF_DUMP_BUFSIZE)
 			if (prof_flush(propagate_err) && propagate_err)
 				return (true);
 
-		if (prof_dump_buf_end + slen <= PROF_DUMP_BUF_SIZE) {
+		if (prof_dump_buf_end + slen <= PROF_DUMP_BUFSIZE) {
 			/* Finish writing. */
 			n = slen - i;
 		} else {
 			/* Write as much of s as will fit. */
-			n = PROF_DUMP_BUF_SIZE - prof_dump_buf_end;
+			n = PROF_DUMP_BUFSIZE - prof_dump_buf_end;
 		}
 		memcpy(&prof_dump_buf[prof_dump_buf_end], &s[i], n);
 		prof_dump_buf_end += n;
@@ -624,10 +619,12 @@
 {
 	bool ret;
 	va_list ap;
+	char buf[PROF_PRINTF_BUFSIZE];
 
 	va_start(ap, format);
-	ret = prof_write(propagate_err, malloc_vtprintf(format, ap));
+	malloc_vsnprintf(buf, sizeof(buf), format, ap);
 	va_end(ap);
+	ret = prof_write(propagate_err, buf);
 
 	return (ret);
 }
@@ -795,11 +792,13 @@
 prof_dump_maps(bool propagate_err)
 {
 	int mfd;
+	char filename[PATH_MAX + 1];
 
 	cassert(config_prof);
 
-	mfd = open(malloc_tprintf("/proc/%d/maps", (int)getpid()),
-	    O_RDONLY);
+	malloc_snprintf(filename, sizeof(filename), "/proc/%d/maps",
+	    (int)getpid());
+	mfd = open(filename, O_RDONLY);
 	if (mfd != -1) {
 		ssize_t nread;
 
@@ -809,13 +808,13 @@
 		nread = 0;
 		do {
 			prof_dump_buf_end += nread;
-			if (prof_dump_buf_end == PROF_DUMP_BUF_SIZE) {
+			if (prof_dump_buf_end == PROF_DUMP_BUFSIZE) {
 				/* Make space in prof_dump_buf before read(). */
 				if (prof_flush(propagate_err) && propagate_err)
 					return (true);
 			}
 			nread = read(mfd, &prof_dump_buf[prof_dump_buf_end],
-			    PROF_DUMP_BUF_SIZE - prof_dump_buf_end);
+			    PROF_DUMP_BUFSIZE - prof_dump_buf_end);
 		} while (nread > 0);
 		close(mfd);
 	} else
@@ -1098,16 +1097,16 @@
 	prof_tdata->threshold = 0;
 	prof_tdata->accum = 0;
 
-	PROF_TCACHE_SET(prof_tdata);
+	prof_tdata_tsd_set(&prof_tdata);
 
 	return (prof_tdata);
 }
 
-static void
+void
 prof_tdata_cleanup(void *arg)
 {
 	prof_thr_cnt_t *cnt;
-	prof_tdata_t *prof_tdata = (prof_tdata_t *)arg;
+	prof_tdata_t *prof_tdata = *(prof_tdata_t **)arg;
 
 	cassert(config_prof);
 
@@ -1127,7 +1126,8 @@
 	idalloc(prof_tdata->vec);
 
 	idalloc(prof_tdata);
-	PROF_TCACHE_SET(NULL);
+	prof_tdata = NULL;
+	prof_tdata_tsd_set(&prof_tdata);
 }
 
 void
@@ -1182,8 +1182,7 @@
 			return (true);
 		if (malloc_mutex_init(&bt2ctx_mtx))
 			return (true);
-		if (pthread_key_create(&prof_tdata_tsd, prof_tdata_cleanup)
-		    != 0) {
+		if (prof_tdata_tsd_boot()) {
 			malloc_write(
 			    "<jemalloc>: Error in pthread_key_create()\n");
 			abort();
diff --git a/src/tcache.c b/src/tcache.c
index f90308c..3442406 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -4,30 +4,16 @@
 /******************************************************************************/
 /* Data. */
 
+malloc_tsd_data(, tcache, tcache_t *, NULL)
+
 bool	opt_tcache = true;
 ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
 
 tcache_bin_info_t	*tcache_bin_info;
 static unsigned		stack_nelms; /* Total stack elms per tcache. */
 
-/* Map of thread-specific caches. */
-#ifdef JEMALLOC_TLS
-__thread tcache_t	*tcache_tls JEMALLOC_ATTR(tls_model("initial-exec"));
-#endif
-
-/*
- * Same contents as tcache, but initialized such that the TSD destructor is
- * called when a thread exits, so that the cache can be cleaned up.
- */
-pthread_key_t		tcache_tsd;
-
-size_t				nhbins;
-size_t				tcache_maxclass;
-
-/******************************************************************************/
-/* Function prototypes for non-inline static functions. */
-
-static void	tcache_thread_cleanup(void *arg);
+size_t			nhbins;
+size_t			tcache_maxclass;
 
 /******************************************************************************/
 
@@ -196,6 +182,33 @@
 		tbin->low_water = tbin->ncached;
 }
 
+void
+tcache_arena_associate(tcache_t *tcache, arena_t *arena)
+{
+
+	if (config_stats) {
+		/* Link into list of extant tcaches. */
+		malloc_mutex_lock(&arena->lock);
+		ql_elm_new(tcache, link);
+		ql_tail_insert(&arena->tcache_ql, tcache, link);
+		malloc_mutex_unlock(&arena->lock);
+	}
+	tcache->arena = arena;
+}
+
+void
+tcache_arena_dissociate(tcache_t *tcache)
+{
+
+	if (config_stats) {
+		/* Unlink from list of extant tcaches. */
+		malloc_mutex_lock(&tcache->arena->lock);
+		ql_remove(&tcache->arena->tcache_ql, tcache, link);
+		malloc_mutex_unlock(&tcache->arena->lock);
+		tcache_stats_merge(tcache, tcache->arena);
+	}
+}
+
 tcache_t *
 tcache_create(arena_t *arena)
 {
@@ -228,15 +241,8 @@
 	if (tcache == NULL)
 		return (NULL);
 
-	if (config_stats) {
-		/* Link into list of extant tcaches. */
-		malloc_mutex_lock(&arena->lock);
-		ql_elm_new(tcache, link);
-		ql_tail_insert(&arena->tcache_ql, tcache, link);
-		malloc_mutex_unlock(&arena->lock);
-	}
+	tcache_arena_associate(tcache, arena);
 
-	tcache->arena = arena;
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
 	for (i = 0; i < nhbins; i++) {
 		tcache->tbins[i].lg_fill_div = 1;
@@ -245,7 +251,7 @@
 		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
 	}
 
-	TCACHE_SET(tcache);
+	tcache_tsd_set(&tcache);
 
 	return (tcache);
 }
@@ -256,13 +262,7 @@
 	unsigned i;
 	size_t tcache_size;
 
-	if (config_stats) {
-		/* Unlink from list of extant tcaches. */
-		malloc_mutex_lock(&tcache->arena->lock);
-		ql_remove(&tcache->arena->tcache_ql, tcache, link);
-		malloc_mutex_unlock(&tcache->arena->lock);
-		tcache_stats_merge(tcache, tcache->arena);
-	}
+	tcache_arena_dissociate(tcache);
 
 	for (i = 0; i < NBINS; i++) {
 		tcache_bin_t *tbin = &tcache->tbins[i];
@@ -323,10 +323,10 @@
 		idalloc(tcache);
 }
 
-static void
+void
 tcache_thread_cleanup(void *arg)
 {
-	tcache_t *tcache = (tcache_t *)arg;
+	tcache_t *tcache = *(tcache_t **)arg;
 
 	if (tcache == (void *)(uintptr_t)1) {
 		/*
@@ -341,11 +341,13 @@
 		 * destructor was called.  Reset tcache to 1 in order to
 		 * receive another callback.
 		 */
-		TCACHE_SET((uintptr_t)1);
+		tcache = (tcache_t *)(uintptr_t)1;
+		tcache_tsd_set(&tcache);
 	} else if (tcache != NULL) {
 		assert(tcache != (void *)(uintptr_t)1);
 		tcache_destroy(tcache);
-		TCACHE_SET((uintptr_t)1);
+		tcache = (tcache_t *)(uintptr_t)1;
+		tcache_tsd_set(&tcache);
 	}
 }
 
@@ -374,7 +376,7 @@
 }
 
 bool
-tcache_boot(void)
+tcache_boot0(void)
 {
 
 	if (opt_tcache) {
@@ -385,8 +387,8 @@
 		 * SMALL_MAXCLASS and arena_maxclass are known.
 		 * XXX Can this be done earlier?
 		 */
-		if (opt_lg_tcache_max < 0 || (1U <<
-		    opt_lg_tcache_max) < SMALL_MAXCLASS)
+		if (opt_lg_tcache_max < 0 || (1U << opt_lg_tcache_max) <
+		    SMALL_MAXCLASS)
 			tcache_maxclass = SMALL_MAXCLASS;
 		else if ((1U << opt_lg_tcache_max) > arena_maxclass)
 			tcache_maxclass = arena_maxclass;
@@ -416,13 +418,18 @@
 			tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
 			stack_nelms += tcache_bin_info[i].ncached_max;
 		}
+	}
 
-		if (pthread_key_create(&tcache_tsd, tcache_thread_cleanup) !=
-		    0) {
-			malloc_write(
-			    "<jemalloc>: Error in pthread_key_create()\n");
-			abort();
-		}
+	return (false);
+}
+
+bool
+tcache_boot1(void)
+{
+
+	if (opt_tcache) {
+		if (tcache_tsd_boot())
+			return (true);
 	}
 
 	return (false);
diff --git a/src/tsd.c b/src/tsd.c
new file mode 100644
index 0000000..669ea8f
--- /dev/null
+++ b/src/tsd.c
@@ -0,0 +1,72 @@
+#define	JEMALLOC_TSD_C_
+#include "jemalloc/internal/jemalloc_internal.h"
+
+/******************************************************************************/
+/* Data. */
+
+static unsigned ncleanups;
+static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX];
+
+/******************************************************************************/
+
+void *
+malloc_tsd_malloc(size_t size)
+{
+
+	/* Avoid choose_arena() in order to dodge bootstrapping issues. */
+	return arena_malloc_prechosen(arenas[0], size, false);
+}
+
+void
+malloc_tsd_dalloc(void *wrapper)
+{
+
+	idalloc(wrapper);
+}
+
+void
+malloc_tsd_no_cleanup(void *arg)
+{
+
+	not_reached();
+}
+
+#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
+void
+_malloc_thread_cleanup(void)
+{
+	bool pending[ncleanups], again;
+	unsigned i;
+
+	for (i = 0; i < ncleanups; i++)
+		pending[i] = true;
+
+	do {
+		again = false;
+		for (i = 0; i < ncleanups; i++) {
+			if (pending[i]) {
+				pending[i] = cleanups[i].f(cleanups[i].arg);
+				if (pending[i])
+					again = true;
+			}
+		}
+	} while (again);
+}
+#endif
+
+void
+malloc_tsd_cleanup_register(bool (*f)(void *), void *arg)
+{
+
+	assert(ncleanups < MALLOC_TSD_CLEANUPS_MAX);
+	cleanups[ncleanups].f = f;
+	cleanups[ncleanups].arg = arg;
+	ncleanups++;
+}
+
+void
+malloc_tsd_boot(void)
+{
+
+	ncleanups = 0;
+}
diff --git a/src/util.c b/src/util.c
index 47e7b66..96c87f7 100644
--- a/src/util.c
+++ b/src/util.c
@@ -222,6 +222,9 @@
 	case 'z':							\
 		val = va_arg(ap, size_t);				\
 		break;							\
+	case 'p': /* Synthetic; used for %p. */				\
+		val = va_arg(ap, uintptr_t);				\
+		break;							\
 	default: not_reached();						\
 	}								\
 } while (0)
@@ -410,7 +413,7 @@
 				uintmax_t val;
 				char buf[X2S_BUFSIZE];
 
-				GET_ARG_NUMERIC(val, len);
+				GET_ARG_NUMERIC(val, 'p');
 				s = x2s(val, true, false, buf, &slen);
 				APPEND_PADDED_S(s, slen, width, left_justify);
 				f++;
@@ -466,34 +469,11 @@
 	return (ret);
 }
 
-const char *
-malloc_vtprintf(const char *format, va_list ap)
-{
-	static __thread char buf[MALLOC_PRINTF_BUFSIZE];
-
-	malloc_vsnprintf(buf, sizeof(buf), format, ap);
-
-	return (buf);
-}
-
-JEMALLOC_ATTR(format(printf, 1, 2))
-const char *
-malloc_tprintf(const char *format, ...)
-{
-	const char *ret;
-	va_list ap;
-
-	va_start(ap, format);
-	ret = malloc_vtprintf(format, ap);
-	va_end(ap);
-
-	return (ret);
-}
-
 void
 malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque,
     const char *format, va_list ap)
 {
+	char buf[MALLOC_PRINTF_BUFSIZE];
 
 	if (write_cb == NULL) {
 		/*
@@ -505,7 +485,8 @@
 		cbopaque = NULL;
 	}
 
-	write_cb(cbopaque, malloc_vtprintf(format, ap));
+	malloc_vsnprintf(buf, sizeof(buf), format, ap);
+	write_cb(cbopaque, buf);
 }
 
 /*