Port to FreeBSD.

Use FreeBSD-specific functions (_pthread_mutex_init_calloc_cb(),
_malloc_{pre,post}fork()) to avoid bootstrapping issues due to
allocation in libc and libthr.

Add malloc_strtoumax() and use it instead of strtoul().  Disable
validation code in malloc_vsnprintf() and malloc_strtoumax() until
jemalloc is initialized.  This is necessary because locale
initialization causes allocation for both vsnprintf() and strtoumax().

Force the lazy-lock feature on in order to avoid calling pthread_self(),
which causes allocation.

Use syscall(SYS_write, ...) rather than write(...), because libthr wraps
write() and causes allocation.  Without this workaround, it would not be
possible to print error messages in malloc_conf_init() without
substantially reworking bootstrapping.

Fix choose_arena_hard() to look at how many threads are assigned to the
candidate choice, rather than checking whether the arena is
uninitialized.  This bug potentially caused more arenas to be
initialized than necessary.
diff --git a/README b/README
index 4d7b552..a7864f3 100644
--- a/README
+++ b/README
@@ -1,10 +1,10 @@
 jemalloc is a general-purpose scalable concurrent malloc(3) implementation.
 This distribution is a stand-alone "portable" implementation that currently
-targets Linux and Apple OS X.  jemalloc is included as the default allocator in
-the FreeBSD and NetBSD operating systems, and it is used by the Mozilla Firefox
-web browser on Microsoft Windows-related platforms.  Depending on your needs,
-one of the other divergent versions may suit your needs better than this
-distribution.
+targets FreeBSD, Linux and Apple OS X.  jemalloc is included as the default
+allocator in the FreeBSD and NetBSD operating systems, and it is used by the
+Mozilla Firefox web browser on Microsoft Windows-related platforms.  Depending
+on your needs, one of the other divergent versions may suit you better than
+this distribution.
 
 The COPYING file contains copyright and licensing information.
 
diff --git a/configure.ac b/configure.ac
index 7e4f221..478ae9d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -777,6 +777,17 @@
   force_tls="1"
 fi
 
+dnl Check whether the BSD-specific _pthread_mutex_init_calloc_cb() exists.  If
+dnl so, mutex initialization causes allocation, and we need to implement this
+dnl callback function in order to prevent recursive allocation.
+AC_CHECK_FUNC([_pthread_mutex_init_calloc_cb],
+              [have__pthread_mutex_init_calloc_cb="1"],
+              [have__pthread_mutex_init_calloc_cb="0"]
+             )
+if test "x$have__pthread_mutex_init_calloc_cb" = "x1" ; then
+  AC_DEFINE([JEMALLOC_MUTEX_INIT_CB])
+fi
+
 dnl Disable lazy locking by default.
 AC_ARG_ENABLE([lazy_lock],
   [AS_HELP_STRING([--enable-lazy-lock],
diff --git a/include/jemalloc/internal/base.h b/include/jemalloc/internal/base.h
index 796a283..9cf75ff 100644
--- a/include/jemalloc/internal/base.h
+++ b/include/jemalloc/internal/base.h
@@ -10,6 +10,7 @@
 #ifdef JEMALLOC_H_EXTERNS
 
 void	*base_alloc(size_t size);
+void	*base_calloc(size_t number, size_t size);
 extent_node_t *base_node_alloc(void);
 void	base_node_dealloc(extent_node_t *node);
 bool	base_boot(void);
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index e055814..4f55779 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -1,5 +1,6 @@
 #include <sys/mman.h>
 #include <sys/param.h>
+#include <sys/syscall.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/uio.h>
@@ -370,6 +371,8 @@
 extern arena_t		**arenas;
 extern unsigned		narenas;
 
+extern bool		malloc_initialized;
+
 arena_t	*arenas_extend(unsigned ind);
 void	arenas_cleanup(void *arg);
 arena_t	*choose_arena_hard(void);
diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h
index 10637e9..98f2cba 100644
--- a/include/jemalloc/internal/mutex.h
+++ b/include/jemalloc/internal/mutex.h
@@ -6,9 +6,12 @@
 #define	MALLOC_MUTEX_INITIALIZER 0
 #else
 typedef pthread_mutex_t malloc_mutex_t;
-#  ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+#  if (defined(PTHREAD_MUTEX_ADAPTIVE_NP) &&				\
+       defined(PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP))
+#    define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_ADAPTIVE_NP
 #    define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
 #  else
+#    define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
 #    define MALLOC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
 #  endif
 #endif
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 5a174ac..0e32c61 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -71,7 +71,7 @@
 #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
 #define	malloc_tsd_externs(a_name, a_type)				\
 extern __thread a_type	a_name##_tls;					\
-extern __thread bool	*a_name##_initialized;				\
+extern __thread bool	a_name##_initialized;				\
 extern bool		a_name##_booted;
 #elif (defined(JEMALLOC_TLS))
 #define	malloc_tsd_externs(a_name, a_type)				\
diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h
index 5156399..3d3ea3a 100644
--- a/include/jemalloc/internal/util.h
+++ b/include/jemalloc/internal/util.h
@@ -85,6 +85,7 @@
 extern void	(*je_malloc_message)(void *wcbopaque, const char *s);
 
 int	buferror(int errnum, char *buf, size_t buflen);
+uintmax_t	malloc_strtoumax(const char *nptr, char **endptr, int base);
 
 /*
  * malloc_vsnprintf() supports a subset of snprintf(3) that avoids floating
diff --git a/include/jemalloc/jemalloc_defs.h.in b/include/jemalloc/jemalloc_defs.h.in
index 838f561..f150413 100644
--- a/include/jemalloc/jemalloc_defs.h.in
+++ b/include/jemalloc/jemalloc_defs.h.in
@@ -68,6 +68,20 @@
  */
 #undef JEMALLOC_MALLOC_THREAD_CLEANUP
 
+/*
+ * Defined if threaded initialization is known to be safe on this platform.
+ * Among other things, it must be possible to initialize a mutex without
+ * triggering allocation in order for threaded allocation to be safe.
+ */
+#undef JEMALLOC_THREADED_INIT
+
+/*
+ * Defined if the pthreads implementation defines
+ * _pthread_mutex_init_calloc_cb(), in which case the function is used in order
+ * to avoid recursive allocation during mutex initialization.
+ */
+#undef JEMALLOC_MUTEX_INIT_CB
+
 /* Defined if __attribute__((...)) syntax is supported. */
 #undef JEMALLOC_HAVE_ATTR
 #ifdef JEMALLOC_HAVE_ATTR
diff --git a/src/base.c b/src/base.c
index eb68334..696c362 100644
--- a/src/base.c
+++ b/src/base.c
@@ -66,6 +66,26 @@
 	return (ret);
 }
 
+/*
+ * calloc(3)-style front end to base_alloc(): allocates number * size bytes and
+ * zeroes them.  Returns NULL if base_alloc() fails or if the byte count would
+ * overflow size_t.
+ */
+void *
+base_calloc(size_t number, size_t size)
+{
+	void *ret;
+
+	if (size != 0 && number > SIZE_T_MAX / size)
+		return (NULL);
+
+	ret = base_alloc(number * size);
+	if (ret != NULL)
+		memset(ret, 0, number * size);
+
+	return (ret);
+}
+
 extent_node_t *
 base_node_alloc(void)
 {
diff --git a/src/ctl.c b/src/ctl.c
index e17e503..943c292 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -615,19 +615,19 @@
 				goto RETURN;
 			}
 		} else {
-			unsigned long index;
+			uintmax_t index;
 			const ctl_node_t *inode;
 
 			/* Children are indexed. */
-			index = strtoul(elm, NULL, 10);
-			if (index == ULONG_MAX) {
+			index = malloc_strtoumax(elm, NULL, 10);
+			if (index == UINTMAX_MAX || index > SIZE_T_MAX) {
 				ret = ENOENT;
 				goto RETURN;
 			}
 
 			inode = &node->u.named.children[0];
 			node = inode->u.indexed.index(mibp, *depthp,
-			    index);
+			    (size_t)index);
 			if (node == NULL) {
 				ret = ENOENT;
 				goto RETURN;
diff --git a/src/jemalloc.c b/src/jemalloc.c
index b3e898c..3e168fd 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -38,10 +38,18 @@
 unsigned		narenas;
 
 /* Set to true once the allocator has been initialized. */
-static bool		malloc_initialized = false;
+bool			malloc_initialized = false;
 
+#ifdef JEMALLOC_THREADED_INIT
 /* Used to let the initializing thread recursively allocate. */
 static pthread_t	malloc_initializer = (unsigned long)0;
+#  define INITIALIZER		pthread_self()
+#  define IS_INITIALIZER	(malloc_initializer == pthread_self())
+#else
+static bool		malloc_initializer = false;
+#  define INITIALIZER		true
+#  define IS_INITIALIZER	malloc_initializer
+#endif
 
 /* Used to avoid initialization races. */
 static malloc_mutex_t	init_lock = MALLOC_MUTEX_INITIALIZER;
@@ -127,7 +135,7 @@
 			}
 		}
 
-		if (arenas[choose] == 0 || first_null == narenas) {
+		if (arenas[choose]->nthreads == 0 || first_null == narenas) {
 			/*
 			 * Use an unloaded arena, or the least loaded arena if
 			 * all arenas are already initialized.
@@ -413,22 +421,22 @@
 #define	CONF_HANDLE_SIZE_T(o, n, min, max)				\
 			if (sizeof(#n)-1 == klen && strncmp(#n, k,	\
 			    klen) == 0) {				\
-				unsigned long ul;			\
+				uintmax_t um;			\
 				char *end;				\
 									\
 				errno = 0;				\
-				ul = strtoul(v, &end, 0);		\
+				um = malloc_strtoumax(v, &end, 0);	\
 				if (errno != 0 || (uintptr_t)end -	\
 				    (uintptr_t)v != vlen) {		\
 					malloc_conf_error(		\
 					    "Invalid conf value",	\
 					    k, klen, v, vlen);		\
-				} else if (ul < min || ul > max) {	\
+				} else if (um < min || um > max) {	\
 					malloc_conf_error(		\
 					    "Out-of-range conf value",	\
 					    k, klen, v, vlen);		\
 				} else					\
-					o = ul;				\
+					o = um;				\
 				continue;				\
 			}
 #define	CONF_HANDLE_SSIZE_T(o, n, min, max)				\
@@ -519,7 +527,7 @@
 	arena_t *init_arenas[1];
 
 	malloc_mutex_lock(&init_lock);
-	if (malloc_initialized || malloc_initializer == pthread_self()) {
+	if (malloc_initialized || IS_INITIALIZER) {
 		/*
 		 * Another thread initialized the allocator before this one
 		 * acquired init_lock, or this thread is the initializing
@@ -528,7 +536,8 @@
 		malloc_mutex_unlock(&init_lock);
 		return (false);
 	}
-	if (malloc_initializer != (unsigned long)0) {
+#ifdef JEMALLOC_THREADED_INIT
+	if (malloc_initializer != (unsigned long)0) {
 		/* Busy-wait until the initializing thread completes. */
 		do {
 			malloc_mutex_unlock(&init_lock);
@@ -538,6 +547,8 @@
 		malloc_mutex_unlock(&init_lock);
 		return (false);
 	}
+#endif
+	malloc_initializer = INITIALIZER;
 
 #ifdef DYNAMIC_PAGE_SHIFT
 	/* Get page size. */
@@ -564,6 +575,7 @@
 
 	malloc_conf_init();
 
+#ifndef JEMALLOC_MUTEX_INIT_CB
 	/* Register fork handlers. */
 	if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent,
 	    jemalloc_postfork_child) != 0) {
@@ -571,11 +583,7 @@
 		if (opt_abort)
 			abort();
 	}
-
-	if (ctl_boot()) {
-		malloc_mutex_unlock(&init_lock);
-		return (true);
-	}
+#endif
 
 	if (opt_stats_print) {
 		/* Print statistics at exit. */
@@ -596,6 +604,11 @@
 		return (true);
 	}
 
+	if (ctl_boot()) {
+		malloc_mutex_unlock(&init_lock);
+		return (true);
+	}
+
 	if (config_prof)
 		prof_boot1();
 
@@ -654,7 +667,6 @@
 	}
 
 	/* Get number of CPUs. */
-	malloc_initializer = pthread_self();
 	malloc_mutex_unlock(&init_lock);
 	ncpus = malloc_ncpus();
 	malloc_mutex_lock(&init_lock);
@@ -1018,8 +1030,7 @@
 	}
 
 	if (ptr != NULL) {
-		assert(malloc_initialized || malloc_initializer ==
-		    pthread_self());
+		assert(malloc_initialized || IS_INITIALIZER);
 
 		if (config_prof || config_stats)
 			old_size = isalloc(ptr);
@@ -1124,8 +1135,7 @@
 	if (ptr != NULL) {
 		size_t usize;
 
-		assert(malloc_initialized || malloc_initializer ==
-		    pthread_self());
+		assert(malloc_initialized || IS_INITIALIZER);
 
 		if (config_prof && opt_prof) {
 			usize = isalloc(ptr);
@@ -1208,7 +1218,7 @@
 {
 	size_t ret;
 
-	assert(malloc_initialized || malloc_initializer == pthread_self());
+	assert(malloc_initialized || IS_INITIALIZER);
 
 	if (config_ivsalloc)
 		ret = ivsalloc(ptr);
@@ -1372,7 +1382,7 @@
 	assert(*ptr != NULL);
 	assert(size != 0);
 	assert(SIZE_T_MAX - size >= extra);
-	assert(malloc_initialized || malloc_initializer == pthread_self());
+	assert(malloc_initialized || IS_INITIALIZER);
 
 	p = *ptr;
 	if (config_prof && opt_prof) {
@@ -1457,7 +1467,7 @@
 {
 	size_t sz;
 
-	assert(malloc_initialized || malloc_initializer == pthread_self());
+	assert(malloc_initialized || IS_INITIALIZER);
 
 	if (config_ivsalloc)
 		sz = ivsalloc(ptr);
@@ -1479,7 +1489,7 @@
 	size_t usize;
 
 	assert(ptr != NULL);
-	assert(malloc_initialized || malloc_initializer == pthread_self());
+	assert(malloc_initialized || IS_INITIALIZER);
 
 	if (config_stats)
 		usize = isalloc(ptr);
@@ -1528,8 +1538,13 @@
  * malloc during fork().
  */
 
+#ifndef JEMALLOC_MUTEX_INIT_CB
 void
 jemalloc_prefork(void)
+#else
+void
+_malloc_prefork(void)
+#endif
 {
 	unsigned i;
 
@@ -1544,8 +1559,13 @@
 	chunk_dss_prefork();
 }
 
+#ifndef JEMALLOC_MUTEX_INIT_CB
 void
 jemalloc_postfork_parent(void)
+#else
+void
+_malloc_postfork(void)
+#endif
 {
 	unsigned i;
 
diff --git a/src/mutex.c b/src/mutex.c
index 07d2a03..0b20bbf 100644
--- a/src/mutex.c
+++ b/src/mutex.c
@@ -56,21 +56,25 @@
 
 /******************************************************************************/
 
+#ifdef JEMALLOC_MUTEX_INIT_CB
+int	_pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
+    void *(calloc_cb)(size_t, size_t));
+#endif
+
 bool
 malloc_mutex_init(malloc_mutex_t *mutex)
 {
 #ifdef JEMALLOC_OSSPIN
 	*mutex = 0;
+#elif (defined(JEMALLOC_MUTEX_INIT_CB))
+	if (_pthread_mutex_init_calloc_cb(mutex, base_calloc) != 0)
+		return (true);
 #else
 	pthread_mutexattr_t attr;
 
 	if (pthread_mutexattr_init(&attr) != 0)
 		return (true);
-#ifdef PTHREAD_MUTEX_ADAPTIVE_NP
-	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
-#else
-	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_DEFAULT);
-#endif
+	pthread_mutexattr_settype(&attr, MALLOC_MUTEX_TYPE);
 	if (pthread_mutex_init(mutex, &attr) != 0) {
 		pthread_mutexattr_destroy(&attr);
 		return (true);
@@ -99,10 +103,14 @@
 malloc_mutex_postfork_child(malloc_mutex_t *mutex)
 {
 
+#ifdef JEMALLOC_MUTEX_INIT_CB
+	malloc_mutex_unlock(mutex);
+#else
 	if (malloc_mutex_init(mutex)) {
 		malloc_printf("<jemalloc>: Error re-initializing mutex in "
 		    "child\n");
 		if (opt_abort)
 			abort();
 	}
+#endif
 }
diff --git a/src/util.c b/src/util.c
index 698b53a..090e1f0 100644
--- a/src/util.c
+++ b/src/util.c
@@ -44,7 +44,7 @@
 void
 wrtmessage(void *cbopaque, const char *s)
 {
-	UNUSED int result = write(STDERR_FILENO, s, strlen(s));
+	UNUSED int result = syscall(SYS_write, STDERR_FILENO, s, strlen(s));
 }
 
 void	(*je_malloc_message)(void *, const char *s)
@@ -69,6 +69,123 @@
 #endif
 }
 
+/*
+ * Hand-rolled strtoumax(3) work-alike.  For an invalid base or on overflow,
+ * errno is set (EINVAL or ERANGE, respectively) and UINTMAX_MAX is returned
+ * without storing to *endptr.
+ */
+uintmax_t
+malloc_strtoumax(const char *nptr, char **endptr, int base)
+{
+	uintmax_t ret, digit;
+	int b;
+	bool neg;
+	const char *p, *ns;
+
+	if (base < 0 || base == 1 || base > 36) {
+		errno = EINVAL;
+		return (UINTMAX_MAX);
+	}
+	b = base;
+
+	/* Swallow leading whitespace and get sign, if any. */
+	neg = false;
+	p = nptr;
+	while (true) {
+		switch (*p) {
+		case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
+			p++;
+			break;
+		case '-':
+			neg = true;
+			/* Fall through. */
+		case '+':
+			p++;
+			/* Fall through. */
+		default:
+			goto PREFIX;
+		}
+	}
+
+	/* Get prefix, if any. */
+	PREFIX:
+	/*
+	 * Note where the first non-whitespace/sign character is so that it is
+	 * possible to tell whether any digits are consumed (e.g., "  0" vs.
+	 * "  -x").
+	 */
+	ns = p;
+	if (*p == '0') {
+		switch (p[1]) {
+		case '0': case '1': case '2': case '3': case '4': case '5':
+		case '6': case '7':
+			if (b == 0)
+				b = 8;
+			if (b == 8)
+				p++;
+			break;
+		case 'X': case 'x':
+			/* Only a prefix if a hex digit follows. */
+			if ((b == 0 || b == 16) &&
+			    ((p[2] >= '0' && p[2] <= '9') ||
+			    (p[2] >= 'A' && p[2] <= 'F') ||
+			    (p[2] >= 'a' && p[2] <= 'f'))) {
+				b = 16;
+				p += 2;
+			}
+			break;
+		default:
+			break;
+		}
+		/* For base 0, any other leading '0' selects octal. */
+		if (b == 0)
+			b = 8;
+	}
+	if (b == 0)
+		b = 10;
+
+	/* Convert. */
+	ret = 0;
+	while ((*p >= '0' && *p <= '9' && (digit = *p - '0') < b)
+	    || (*p >= 'A' && *p <= 'Z' && (digit = 10 + *p - 'A') < b)
+	    || (*p >= 'a' && *p <= 'z' && (digit = 10 + *p - 'a') < b)) {
+		/* Check first; a wrapped product is not always smaller. */
+		if (ret > (UINTMAX_MAX - digit) / b) {
+			/* Overflow. */
+			errno = ERANGE;
+			return (UINTMAX_MAX);
+		}
+		ret = ret * b + digit;
+		p++;
+	}
+	if (neg)
+		ret = -ret;
+
+	if (endptr != NULL) {
+		if (p == ns) {
+			/* No characters were converted. */
+			*endptr = (char *)nptr;
+		} else
+			*endptr = (char *)p;
+	}
+
+	if (config_debug && malloc_initialized) {
+		uintmax_t tret;
+		int perrno;
+		char *pend;
+
+		perrno = errno;
+		if (endptr != NULL)
+			pend = *endptr;
+		tret = strtoumax(nptr, endptr, base);
+		assert(tret == ret);
+		assert(errno == perrno);
+		assert(endptr == NULL || *endptr == pend);
+	}
+
+	return (ret);
+}
+
 static char *
 u2s(uintmax_t x, unsigned base, bool uppercase, char *s, size_t *slen_p)
 {
@@ -220,7 +337,7 @@
 		val = va_arg(ap, ptrdiff_t);				\
 		break;							\
 	case 'z':							\
-		val = va_arg(ap, size_t);				\
+		val = va_arg(ap, ssize_t);				\
 		break;							\
 	case 'p': /* Synthetic; used for %p. */				\
 		val = va_arg(ap, uintptr_t);				\
@@ -289,10 +406,11 @@
 				break;
 			case '0': case '1': case '2': case '3': case '4':
 			case '5': case '6': case '7': case '8': case '9': {
-				unsigned long uwidth;
+				uintmax_t uwidth;
 				errno = 0;
-				uwidth = strtoul(f, (char **)&f, 10);
-				assert(uwidth != ULONG_MAX || errno != ERANGE);
+				uwidth = malloc_strtoumax(f, (char **)&f, 10);
+				assert(uwidth != UINTMAX_MAX || errno !=
+				    ERANGE);
 				width = (int)uwidth;
 				if (*f == '.') {
 					f++;
@@ -314,10 +432,10 @@
 				break;
 			case '0': case '1': case '2': case '3': case '4':
 			case '5': case '6': case '7': case '8': case '9': {
-				unsigned long uprec;
+				uintmax_t uprec;
 				errno = 0;
-				uprec = strtoul(f, (char **)&f, 10);
-				assert(uprec != ULONG_MAX || errno != ERANGE);
+				uprec = malloc_strtoumax(f, (char **)&f, 10);
+				assert(uprec != UINTMAX_MAX || errno != ERANGE);
 				prec = (int)uprec;
 				break;
 			}
@@ -435,7 +553,7 @@
 		str[size - 1] = '\0';
 	ret = i;
 
-	if (config_debug) {
+	if (config_debug && malloc_initialized) {
 		char buf[MALLOC_PRINTF_BUFSIZE];
 		int tret;