Clean up the manpage and conditionalize various portions according to how
jemalloc is configured.

Modify arena_malloc() API to avoid unnecessary choose_arena() calls.  Remove
unnecessary code from choose_arena().

Enable lazy-lock by default, now that choose_arena() is both faster and out of
the critical path.

Implement objdir support in the build system.
diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL
new file mode 100644
index 0000000..1320ba9
--- /dev/null
+++ b/jemalloc/INSTALL
@@ -0,0 +1,172 @@
+Building and installing jemalloc can be as simple as typing the following while
+in the root directory of the source tree:
+
+    ./configure
+    make
+    make install
+
+=== Advanced configuration =====================================================
+
+The 'configure' script supports numerous options that allow control of which
+functionality is enabled, where jemalloc is installed, etc.  Optionally, pass
+any of the following arguments (not a definitive list) to 'configure':
+
+--help
+    Print a definitive list of options.
+
+--prefix=<install-root-dir>
+    Set the base directory in which to install.  For example:
+
+        ./configure --prefix=/usr/local
+
+    will cause files to be installed into /usr/local/include, /usr/local/lib,
+    and /usr/local/man.
+
+--with-rpath=<colon-separated-rpath>
+    Embed one or more library paths, so that jemalloc's shared library can
+    find the libraries it is linked to.  This works only on ELF-based systems.
+
+--enable-debug
+    Enable assertions and validation code.  This incurs a substantial
+    performance hit, but is very useful during application development.
+
+--enable-stats
+    Enable statistics gathering functionality.  Use the 'P' option to print
+    detailed allocation statistics at exit, and/or the 'U' option to print a
+    detailed allocation trace log.
+
+--disable-tiny
+    Disable tiny (sub-quantum-sized) object support.  Technically it is not
+    legal for a malloc implementation to allocate objects with less than
+    quantum alignment (8 or 16 bytes, depending on architecture), but in
+    practice it never causes any problems if, for example, 4-byte allocations
+    are 4-byte-aligned.
+
+--disable-mag
+    Disable thread-specific caches for sub-page-sized objects.  Objects are
+    cached and released in bulk using "magazines" -- a term coined by the
+    developers of Solaris's umem allocator.
+
+--disable-balance
+    Disable dynamic rebalancing of thread-->arena assignments.
+
+--enable-dss
+    Enable support for page allocation/deallocation via sbrk(2), in addition to
+    mmap(2).
+
+--enable-fill
+    Enable support for junk/zero filling of memory.  Use the 'J' option to
+    control junk filling, or the 'Z' option to control zero filling.
+
+--enable-xmalloc
+    Enable support for optional immediate termination due to out-of-memory
+    errors, as is commonly implemented by "xmalloc" wrapper functions for malloc.
+    Use the 'X' option to control termination behavior.
+
+--enable-sysv
+    Enable support for System V semantics, wherein malloc(0) returns NULL
+    rather than a minimal allocation.  Use the 'V' option to control System V
+    compatibility.
+
+--enable-dynamic-page-shift
+    Under most conditions, the system page size never changes (usually 4KiB or
+    8KiB, depending on architecture and configuration), and unless this option
+    is enabled, jemalloc assumes that page size can safely be determined during
+    configuration and hard-coded.  Enabling dynamic page size determination has
+    a measurable impact on performance, since the compiler is forced to load
+    the page size from memory rather than embedding immediate values.
+
+--disable-lazy-lock
+    Disable code that wraps pthread_create() to detect when an application
+    switches from single-threaded to multi-threaded mode, so that it can avoid
+    mutex locking/unlocking operations while in single-threaded mode.  In
+    practice, this feature usually has little impact on performance unless
+    magazines are disabled.
+
+The following environment variables (not a definitive list) impact configure's
+behavior:
+
+CFLAGS="?"
+    Pass these flags to the compiler.  You probably shouldn't define this unless
+    you know what you are doing.  (Use EXTRA_CFLAGS instead.)
+
+EXTRA_CFLAGS="?"
+    Append these flags to CFLAGS.  This makes it possible to add flags such as
+    -Werror, while allowing the configure script to determine what other flags
+    are appropriate for the specified configuration.
+
+    The configure script specifically checks whether an optimization flag (-O*)
+    is specified in EXTRA_CFLAGS, and refrains from specifying an optimization
+    level if it finds that one has already been specified.
+
+CPPFLAGS="?"
+    Pass these flags to the C preprocessor.  Note that CFLAGS is not passed to
+    'cpp' when 'configure' is looking for include files, so you must use
+    CPPFLAGS instead if you need to help 'configure' find header files.
+
+LD_LIBRARY_PATH="?"
+    'ld' uses this colon-separated list to find libraries.
+
+LDFLAGS="?"
+    Pass these flags when linking.
+
+PATH="?"
+    'configure' uses this to find programs.
+
+=== Advanced compilation =======================================================
+
+To run integrated regression tests, type:
+
+    make check
+
+To clean up build results to varying degrees, use the following make targets:
+
+    clean
+    distclean
+    relclean
+
+=== Advanced installation ======================================================
+
+Optionally, define make variables when invoking make, including (not
+exclusively):
+
+INCLUDEDIR="?"
+    Use this as the installation prefix for header files.
+
+LIBDIR="?"
+    Use this as the installation prefix for libraries.
+
+MANDIR="?"
+    Use this as the installation prefix for man pages.
+
+CC="?"
+    Use this to invoke the C compiler.
+
+CFLAGS="?"
+    Pass these flags to the compiler.
+
+CPPFLAGS="?"
+    Pass these flags to the C preprocessor.
+
+LDFLAGS="?"
+    Pass these flags when linking.
+
+PATH="?"
+    Use this to search for programs used during configuration and building.
+
+=== Development ================================================================
+
+If you intend to make non-trivial changes to jemalloc, use the 'autogen.sh'
+script rather than 'configure'.  This re-generates 'configure', enables
+configuration dependency rules, and enables re-generation of automatically
+generated source files.
+
+The build system supports using an object directory separate from the source
+tree.  For example, you can create an 'obj' directory, and from within that
+directory, issue configuration and build commands:
+
+    autoconf
+    mkdir obj
+    cd obj
+    ../configure --enable-autogen
+    make
diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in
index 1652ec9..e8d212e 100644
--- a/jemalloc/Makefile.in
+++ b/jemalloc/Makefile.in
@@ -11,10 +11,8 @@
 CC := @CC@
 
 # Configuration parameters.
-BINDIR := @BINDIR@
 INCLUDEDIR := @INCLUDEDIR@
 LIBDIR := @LIBDIR@
-DATADIR := @DATADIR@
 MANDIR := @MANDIR@
 
 # Build parameters.
@@ -34,20 +32,20 @@
 REV := 0
 
 # File lists.
-CHDRS := src/jemalloc.h
-CSRCS := src/jemalloc.c
-DSO := lib/libjemalloc.so.$(REV)
-MAN3 := doc/jemalloc.3
+CHDRS := @srcroot@src/jemalloc.h @objroot@src/jemalloc_defs.h
+CSRCS := @srcroot@src/jemalloc.c
+DSOS := @objroot@lib/libjemalloc.so.$(REV) @objroot@lib/libjemalloc.so
+MAN3 := @objroot@doc/jemalloc.3
 
 .PHONY: all dist install check clean distclean relclean
 
 # Default target.
-all: $(DSO)
+all: $(DSOS)
 
-src/%.o: src/%.c
+@objroot@src/%.o: @srcroot@src/%.c
 	$(CC) $(CFLAGS) -c $(CPPFLAGS) -o $@ $+
 
-$(DSO): $(CSRCS:%.c=%.o)
+$(DSOS): $(CSRCS:@srcroot@%.c=@objroot@%.o)
 	@mkdir -p $(@D)
 	gcc -shared -o $@ $+ $(LDFLAGS) $(LIBS)
 	ln -sf libjemalloc.so.$(REV) lib/libjemalloc.so
@@ -59,7 +57,10 @@
 	install -m 644 $$h $(INCLUDEDIR); \
 done
 	install -d $(LIBDIR)
-	install -m 755 $(DSO) $(LIBDIR)
+	@for s in $(DSOS); do \
+	echo "install -m 755 $$s $(LIBDIR)"; \
+	install -m 755 $$s $(LIBDIR); \
+done
 	install -d $(MANDIR)
 	@for m in $(MAN3); do \
 	echo "install -m 644 $$m $(MANDIR)/man3"; \
@@ -69,9 +70,9 @@
 check:
 
 clean:
-	rm -f src/*.o
-	rm -f lib/libjemalloc.so
-	rm -f lib/libjemalloc.so.$(REV)
+	rm -f @objroot@src/*.o
+	rm -f @objroot@lib/libjemalloc.so
+	rm -f @objroot@lib/libjemalloc.so.$(REV)
 
 distclean: clean
 	rm -f @objroot@config.log
diff --git a/jemalloc/README b/jemalloc/README
new file mode 100644
index 0000000..de3a0a8
--- /dev/null
+++ b/jemalloc/README
@@ -0,0 +1,4 @@
+jemalloc is a general-purpose scalable concurrent malloc(3) implementation.
+
+The INSTALL file contains information on how to configure, build, and install
+jemalloc.
diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac
index fa0c1bc..e0bf8f5 100644
--- a/jemalloc/configure.ac
+++ b/jemalloc/configure.ac
@@ -41,7 +41,7 @@
 MANDIR=`eval echo $MANDIR`
 AC_SUBST([MANDIR])
 
-cfgoutputs="Makefile"
+cfgoutputs="Makefile doc/jemalloc.3"
 cfghdrs="src/jemalloc_defs.h"
 
 dnl If CFLAGS isn't defined and using gcc, set CFLAGS to something reasonable.
@@ -219,6 +219,12 @@
   AC_DEFINE([JEMALLOC_STATS], [ ])
 fi
 AC_SUBST([enable_stats])
+if test "x$enable_stats" = "x0" ; then
+  roff_stats=".\\\" "
+else
+  roff_stats=""
+fi
+AC_SUBST([roff_stats])
 
 dnl Enable tiny allocations by default.
 AC_ARG_ENABLE([tiny],
@@ -235,6 +241,15 @@
   AC_DEFINE([JEMALLOC_TINY], [ ])
 fi
 AC_SUBST([enable_tiny])
+if test "x$enable_tiny" = "x0" ; then
+  roff_tiny=".\\\" "
+  roff_no_tiny=""
+else
+  roff_tiny=""
+  roff_no_tiny=".\\\" "
+fi
+AC_SUBST([roff_tiny])
+AC_SUBST([roff_no_tiny])
 
 dnl Enable magazines by default.
 AC_ARG_ENABLE([mag],
@@ -251,6 +266,12 @@
   AC_DEFINE([JEMALLOC_MAG], [ ])
 fi
 AC_SUBST([enable_mag])
+if test "x$enable_mag" = "x0" ; then
+  roff_mag=".\\\" "
+else
+  roff_mag=""
+fi
+AC_SUBST([roff_mag])
 
 dnl Enable dynamic arena load balancing by default.
 AC_ARG_ENABLE([balance],
@@ -267,6 +288,12 @@
   AC_DEFINE([JEMALLOC_BALANCE], [ ])
 fi
 AC_SUBST([enable_balance])
+if test "x$enable_balance" = "x0" ; then
+  roff_balance=".\\\" "
+else
+  roff_balance=""
+fi
+AC_SUBST([roff_balance])
 
 dnl Do not enable allocation from DSS by default.
 AC_ARG_ENABLE([dss],
@@ -283,6 +310,12 @@
   AC_DEFINE([JEMALLOC_DSS], [ ])
 fi
 AC_SUBST([enable_dss])
+if test "x$enable_dss" = "x0" ; then
+  roff_dss=".\\\" "
+else
+  roff_dss=""
+fi
+AC_SUBST([roff_dss])
 
 dnl Do not support the junk/zero filling option by default.
 AC_ARG_ENABLE([fill],
@@ -299,6 +332,12 @@
   AC_DEFINE([JEMALLOC_FILL], [ ])
 fi
 AC_SUBST([enable_fill])
+if test "x$enable_fill" = "x0" ; then
+  roff_fill=".\\\" "
+else
+  roff_fill=""
+fi
+AC_SUBST([roff_fill])
 
 dnl Do not support the xmalloc option by default.
 AC_ARG_ENABLE([xmalloc],
@@ -315,6 +354,12 @@
   AC_DEFINE([JEMALLOC_XMALLOC], [ ])
 fi
 AC_SUBST([enable_xmalloc])
+if test "x$enable_xmalloc" = "x0" ; then
+  roff_xmalloc=".\\\" "
+else
+  roff_xmalloc=""
+fi
+AC_SUBST([roff_xmalloc])
 
 dnl Do not support the SYSV option by default.
 AC_ARG_ENABLE([sysv],
@@ -331,6 +376,12 @@
   AC_DEFINE([JEMALLOC_SYSV], [ ])
 fi
 AC_SUBST([enable_sysv])
+if test "x$enable_sysv" = "x0" ; then
+  roff_sysv=".\\\" "
+else
+  roff_sysv=""
+fi
+AC_SUBST([roff_sysv])
 
 dnl Do not determine page shift at run time by default.
 AC_ARG_ENABLE([dynamic_page_shift],
@@ -380,6 +431,7 @@
 dnl jemalloc configuration.
 dnl 
 jemalloc_version=`cat ${srcroot}VERSION`
+AC_DEFINE_UNQUOTED([JEMALLOC_VERSION], ["$jemalloc_version"])
 AC_SUBST([jemalloc_version])
 
 dnl ============================================================================
@@ -400,21 +452,24 @@
 
     return 0;
 ]])],
-              AC_MSG_RESULT([yes]),
+              AC_MSG_RESULT([yes])
+              roff_tls="",
               AC_MSG_RESULT([no])
+              roff_tls=".\\\" "
               AC_DEFINE_UNQUOTED([NO_TLS], [ ]))
+AC_SUBST([roff_tls])
 
-dnl Do not enable lazy locking by default.
+dnl Enable lazy locking by default.
 AC_ARG_ENABLE([lazy_lock],
-  [AS_HELP_STRING([--enable-lazy-lock],
-  [Enable lazy locking (avoid locking unless multiple threads)])],
+  [AS_HELP_STRING([--disable-lazy-lock],
+  [Disable lazy locking (always lock, even when single-threaded)])],
 [if test "x$enable_lazy_lock" = "xno" ; then
   enable_lazy_lock="0"
 else
   enable_lazy_lock="1"
 fi
 ],
-[enable_lazy_lock="0"]
+[enable_lazy_lock="1"]
 )
 if test "x$enable_lazy_lock" = "x1" ; then
   AC_CHECK_HEADERS([dlfcn.h], , [AC_MSG_ERROR([dlfcn.h is missing])])
diff --git a/jemalloc/doc/jemalloc.3 b/jemalloc/doc/jemalloc.3.in
similarity index 64%
rename from jemalloc/doc/jemalloc.3
rename to jemalloc/doc/jemalloc.3.in
index b26ec39..8d10b39 100644
--- a/jemalloc/doc/jemalloc.3
+++ b/jemalloc/doc/jemalloc.3.in
@@ -1,5 +1,5 @@
-.\" Copyright (c) 2006-2008 Jason Evans <jasone@canonware.com>.
 .\" Copyright (c) 2009 Facebook, Inc.  All rights reserved.
+.\" Copyright (c) 2006-2008 Jason Evans <jasone@canonware.com>.
 .\" All rights reserved.
 .\" Copyright (c) 1980, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
@@ -42,7 +42,7 @@
 .Nm malloc , calloc , posix_memalign , realloc , free , malloc_usable_size
 .Nd general purpose memory allocation functions
 .Sh LIBRARY
-.Lb libc
+.Lb libjemalloc
 .Sh SYNOPSIS
 .In stdlib.h
 .Ft void *
@@ -55,22 +55,23 @@
 .Fn realloc "void *ptr" "size_t size"
 .Ft void
 .Fn free "void *ptr"
+.In jemalloc.h
+.Ft size_t
+.Fn malloc_usable_size "const void *ptr"
 .Ft const char *
 .Va jemalloc_options ;
 .Ft void
 .Fo \*(lp*jemalloc_message\*(rp
 .Fa "const char *p1" "const char *p2" "const char *p3" "const char *p4"
 .Fc
-.In malloc_np.h
-.Ft size_t
-.Fn malloc_usable_size "const void *ptr"
 .Sh DESCRIPTION
 The
 .Fn malloc
 function allocates
 .Fa size
 bytes of uninitialized memory.
-The allocated space is suitably aligned (after possible pointer coercion)
+The allocated space is suitably aligned
+@roff_tiny@(after possible pointer coercion)
 for storage of any type of object.
 .Pp
 The
@@ -187,31 +188,32 @@
 The process will call
 .Xr abort 3
 in these cases.
-.It B
-Double/halve the per-arena lock contention threshold at which a thread is
-randomly re-assigned to an arena.
-This dynamic load balancing tends to push threads away from highly contended
-arenas, which avoids worst case contention scenarios in which threads
-disproportionately utilize arenas.
-However, due to the highly dynamic load that applications may place on the
-allocator, it is impossible for the allocator to know in advance how sensitive
-it should be to contention over arenas.
-Therefore, some applications may benefit from increasing or decreasing this
-threshold parameter.
-This option is not available for some configurations (non-PIC).
+@roff_balance@@roff_tls@.It B
+@roff_balance@@roff_tls@Double/halve the per-arena lock contention threshold at
+@roff_balance@@roff_tls@which a thread is randomly re-assigned to an arena.
+@roff_balance@@roff_tls@This dynamic load balancing tends to push threads away
+@roff_balance@@roff_tls@from highly contended arenas, which avoids worst case
+@roff_balance@@roff_tls@contention scenarios in which threads disproportionately
+@roff_balance@@roff_tls@utilize arenas.
+@roff_balance@@roff_tls@However, due to the highly dynamic load that
+@roff_balance@@roff_tls@applications may place on the allocator, it is
+@roff_balance@@roff_tls@impossible for the allocator to know in advance how
+@roff_balance@@roff_tls@sensitive it should be to contention over arenas.
+@roff_balance@@roff_tls@Therefore, some applications may benefit from increasing
+@roff_balance@@roff_tls@or decreasing this threshold parameter.
 .It C
 Double/halve the size of the maximum size class that is a multiple of the
 cacheline size (64).
 Above this size, subpage spacing (256 bytes) is used for size classes.
 The default value is 512 bytes.
-.It D
-Use
-.Xr sbrk 2
-to acquire memory in the data storage segment (DSS).
-This option is enabled by default.
-See the
-.Dq M
-option for related information and interactions.
+@roff_dss@.It D
+@roff_dss@Use
+@roff_dss@.Xr sbrk 2
+@roff_dss@to acquire memory in the data storage segment (DSS).
+@roff_dss@This option is enabled by default.
+@roff_dss@See the
+@roff_dss@.Dq M
+@roff_dss@option for related information and interactions.
 .It F
 Double/halve the per-arena maximum number of dirty unused pages that are
 allowed to accumulate before informing the kernel about at least half of those
@@ -222,46 +224,48 @@
 The default is 512 pages per arena;
 .Ev JEMALLOC_OPTIONS=10f
 will prevent any dirty unused pages from accumulating.
-.It G
-When there are multiple threads, use thread-specific caching for objects that
-are smaller than one page.
-This option is enabled by default.
-Thread-specific caching allows many allocations to be satisfied without
-performing any thread synchronization, at the cost of increased memory use.
-See the
-.Dq R
-option for related tuning information.
-This option is not available for some configurations (non-PIC).
-.It J
-Each byte of new memory allocated by
-.Fn malloc
-or
-.Fn realloc
-will be initialized to 0xa5.
-All memory returned by
-.Fn free
-or
-.Fn realloc
-will be initialized to 0x5a.
-This is intended for debugging and will impact performance negatively.
+@roff_mag@@roff_tls@.It G
+@roff_mag@@roff_tls@When there are multiple threads, use thread-specific caching
+@roff_mag@@roff_tls@for objects that are smaller than one page.
+@roff_mag@@roff_tls@This option is enabled by default.
+@roff_mag@@roff_tls@Thread-specific caching allows many allocations to be
+@roff_mag@@roff_tls@satisfied without performing any thread synchronization, at
+@roff_mag@@roff_tls@the cost of increased memory use.
+@roff_mag@@roff_tls@See the
+@roff_mag@@roff_tls@.Dq R
+@roff_mag@@roff_tls@option for related tuning information.
+@roff_fill@.It J
+@roff_fill@Each byte of new memory allocated by
+@roff_fill@.Fn malloc
+@roff_fill@or
+@roff_fill@.Fn realloc
+@roff_fill@will be initialized to 0xa5.
+@roff_fill@All memory returned by
+@roff_fill@.Fn free
+@roff_fill@or
+@roff_fill@.Fn realloc
+@roff_fill@will be initialized to 0x5a.
+@roff_fill@This is intended for debugging and will impact performance
+@roff_fill@negatively.
 .It K
 Double/halve the virtual memory chunk size.
 The default chunk size is 1 MB.
-.It M
-Use
-.Xr mmap 2
-to acquire anonymously mapped memory.
-This option is enabled by default.
-If both the
-.Dq D
-and
-.Dq M
-options are enabled, the allocator prefers the DSS over anonymous mappings,
-but allocation only fails if memory cannot be acquired via either method.
-If neither option is enabled, then the
-.Dq M
-option is implicitly enabled in order to assure that there is a method for
-acquiring memory.
+@roff_dss@.It M
+@roff_dss@Use
+@roff_dss@.Xr mmap 2
+@roff_dss@to acquire anonymously mapped memory.
+@roff_dss@This option is enabled by default.
+@roff_dss@If both the
+@roff_dss@.Dq D
+@roff_dss@and
+@roff_dss@.Dq M
+@roff_dss@options are enabled, the allocator prefers the DSS over anonymous
+@roff_dss@mappings, but allocation only fails if memory cannot be acquired via
+@roff_dss@either method.
+@roff_dss@If neither option is enabled, then the
+@roff_dss@.Dq M
+@roff_dss@option is implicitly enabled in order to assure that there is a method
+@roff_dss@for acquiring memory.
 .It N
 Double/halve the number of arenas.
 The default number of arenas is two times the number of CPUs, or one if there
@@ -279,88 +283,70 @@
 quantum (8 or 16 bytes, depending on architecture).
 Above this size, cacheline spacing is used for size classes.
 The default value is 128 bytes.
-.It R
-Double/halve magazine size, which approximately doubles/halves the number of
-rounds in each magazine.
-Magazines are used by the thread-specific caching machinery to acquire and
-release objects in bulk.
-Increasing the magazine size decreases locking overhead, at the expense of
-increased memory usage.
-This option is not available for some configurations (non-PIC).
-.It U
-Generate
-.Dq utrace
-entries for
-.Xr ktrace 1 ,
-for all operations.
-Consult the source for details on this option.
-.It V
-Attempting to allocate zero bytes will return a
-.Dv NULL
-pointer instead of
-a valid pointer.
-(The default behavior is to make a minimal allocation and return a
-pointer to it.)
-This option is provided for System V compatibility.
-This option is incompatible with the
-.Dq X
-option.
-.It X
-Rather than return failure for any allocation function,
-display a diagnostic message on
-.Dv stderr
-and cause the program to drop
-core (using
-.Xr abort 3 ) .
-This option should be set at compile time by including the following in
-the source code:
-.Bd -literal -offset indent
-jemalloc_options = "X";
-.Ed
-.It Z
-Each byte of new memory allocated by
-.Fn malloc
-or
-.Fn realloc
-will be initialized to 0.
-Note that this initialization only happens once for each byte, so
-.Fn realloc
-calls do not zero memory that was previously allocated.
-This is intended for debugging and will impact performance negatively.
+@roff_mag@@roff_tls@.It R
+@roff_mag@@roff_tls@Double/halve magazine size, which approximately
+@roff_mag@@roff_tls@doubles/halves the number of rounds in each magazine.
+@roff_mag@@roff_tls@Magazines are used by the thread-specific caching machinery
+@roff_mag@@roff_tls@to acquire and release objects in bulk.
+@roff_mag@@roff_tls@Increasing the magazine size decreases locking overhead, at
+@roff_mag@@roff_tls@the expense of increased memory usage.
+@roff_stats@.It U
+@roff_stats@Generate a verbose trace log via
+@roff_stats@.Fn jemalloc_message
+@roff_stats@for all allocation operations.
+@roff_sysv@.It V
+@roff_sysv@Attempting to allocate zero bytes will return a
+@roff_sysv@.Dv NULL
+@roff_sysv@pointer instead of a valid pointer.
+@roff_sysv@(The default behavior is to make a minimal allocation and return a
+@roff_sysv@pointer to it.)
+@roff_sysv@This option is provided for System V compatibility.
+@roff_sysv@@roff_xmalloc@This option is incompatible with the
+@roff_sysv@@roff_xmalloc@.Dq X
+@roff_sysv@@roff_xmalloc@option.
+@roff_xmalloc@.It X
+@roff_xmalloc@Rather than return failure for any allocation function, display a
+@roff_xmalloc@diagnostic message on
+@roff_xmalloc@.Dv stderr
+@roff_xmalloc@and cause the program to drop core (using
+@roff_xmalloc@.Xr abort 3 ) .
+@roff_xmalloc@This option should be set at compile time by including the
+@roff_xmalloc@following in the source code:
+@roff_xmalloc@.Bd -literal -offset indent
+@roff_xmalloc@jemalloc_options = "X";
+@roff_xmalloc@.Ed
+@roff_fill@.It Z
+@roff_fill@Each byte of new memory allocated by
+@roff_fill@.Fn malloc
+@roff_fill@or
+@roff_fill@.Fn realloc
+@roff_fill@will be initialized to 0.
+@roff_fill@Note that this initialization only happens once for each byte, so
+@roff_fill@.Fn realloc
+@roff_fill@calls do not zero memory that was previously allocated.
+@roff_fill@This is intended for debugging and will impact performance
+@roff_fill@negatively.
 .El
 .Pp
-The
-.Dq J
-and
-.Dq Z
-options are intended for testing and debugging.
-An application which changes its behavior when these options are used
-is flawed.
+@roff_fill@The
+@roff_fill@.Dq J
+@roff_fill@and
+@roff_fill@.Dq Z
+@roff_fill@options are intended for testing and debugging.
+@roff_fill@An application which changes its behavior when these options are used
+@roff_fill@is flawed.
 .Sh IMPLEMENTATION NOTES
-Traditionally, allocators have used
-.Xr sbrk 2
-to obtain memory, which is suboptimal for several reasons, including race
-conditions, increased fragmentation, and artificial limitations on maximum
-usable memory.
-This allocator uses both
-.Xr sbrk 2
-and
-.Xr mmap 2
-by default, but it can be configured at run time to use only one or the other.
-If resource limits are not a primary concern, the preferred configuration is
-.Ev JEMALLOC_OPTIONS=dM
-or
-.Ev JEMALLOC_OPTIONS=DM .
-When so configured, the
-.Ar datasize
-resource limit has little practical effect for typical applications; use
-.Ev JEMALLOC_OPTIONS=Dm
-if that is a concern.
-Regardless of allocator configuration, the
-.Ar vmemoryuse
-resource limit can be used to bound the total virtual memory used by a
-process, as described in
-.Xr limits 1 .
+@roff_dss@Traditionally, allocators have used
+@roff_dss@.Xr sbrk 2
+@roff_dss@to obtain memory, which is suboptimal for several reasons, including
+@roff_dss@race conditions, increased fragmentation, and artificial limitations
+@roff_dss@on maximum usable memory.
+@roff_dss@This allocator uses both
+@roff_dss@.Xr sbrk 2
+@roff_dss@and
+@roff_dss@.Xr mmap 2
+@roff_dss@by default, but it can be configured at run time to use only one or
+@roff_dss@the other.
 .Pp
 This allocator uses multiple arenas in order to reduce lock contention for
 threaded programs on multi-processor systems.
@@ -375,13 +361,14 @@
 However, it may make sense to reduce the number of arenas if an application
 does not make much use of the allocation functions.
 .Pp
-In addition to multiple arenas, this allocator supports thread-specific
-caching for small objects (smaller than one page), in order to make it
-possible to completely avoid synchronization for most small allocation requests.
-Such caching allows very fast allocation in the common case, but it increases
-memory usage and fragmentation, since a bounded number of objects can remain
-allocated in each thread cache.
-.Pp
+@roff_mag@In addition to multiple arenas, this allocator supports
+@roff_mag@thread-specific caching for small objects (smaller than one page), in
+@roff_mag@order to make it possible to completely avoid synchronization for most
+@roff_mag@small allocation requests.
+@roff_mag@Such caching allows very fast allocation in the common case, but it
+@roff_mag@increases memory usage and fragmentation, since a bounded number of
+@roff_mag@objects can remain allocated in each thread cache.
+@roff_mag@.Pp
 Memory is conceptually broken into equal-sized chunks, where the chunk size is
 a power of two that is greater than the page size.
 Chunks are always aligned to multiples of the chunk size.
@@ -406,12 +393,16 @@
 .Pp
 Small objects are managed in groups by page runs.
 Each run maintains a bitmap that tracks which regions are in use.
-Allocation requests that are no more than half the quantum (8 or 16, depending
-on architecture) are rounded up to the nearest power of two.
-Allocation requests that are more than half the quantum, but no more than the
-minimum cacheline-multiple size class (see the
+@roff_tiny@Allocation requests that are no more than half the quantum (8 or 16,
+@roff_tiny@depending on architecture) are rounded up to the nearest power of
+@roff_tiny@two.
+Allocation requests that are
+@roff_tiny@more than half the quantum, but
+no more than the minimum cacheline-multiple size class (see the
 .Dq Q
-option) are rounded up to the nearest multiple of the quantum.
+option) are rounded up to the nearest multiple of the
+@roff_tiny@quantum.
+@roff_no_tiny@quantum (8 or 16, depending on architecture).
-Allocation requests that are more than the minumum cacheline-multiple size
+Allocation requests that are more than the minimum cacheline-multiple size
 class, but no more than the minimum subpage-multiple size class (see the
 .Dq C
@@ -440,26 +431,26 @@
 It is probably also a good idea to recompile the program with suitable
 options and symbols for debugger support.
 .Pp
-If the program starts to give unusual results, coredump or generally behave
-differently without emitting any of the messages mentioned in the next
-section, it is likely because it depends on the storage being filled with
-zero bytes.
-Try running it with the
-.Dq Z
-option set;
-if that improves the situation, this diagnosis has been confirmed.
-If the program still misbehaves,
-the likely problem is accessing memory outside the allocated area.
-.Pp
-Alternatively, if the symptoms are not easy to reproduce, setting the
-.Dq J
-option may help provoke the problem.
-.Pp
-In truly difficult cases, the
-.Dq U
-option, if supported by the kernel, can provide a detailed trace of
-all calls made to these functions.
-.Pp
+@roff_fill@If the program starts to give unusual results, coredump or generally
+@roff_fill@behave differently without emitting any of the messages mentioned in
+@roff_fill@the next section, it is likely because it depends on the storage
+@roff_fill@being filled with zero bytes.
+@roff_fill@Try running it with the
+@roff_fill@.Dq Z
+@roff_fill@option set;
+@roff_fill@if that improves the situation, this diagnosis has been confirmed.
+@roff_fill@If the program still misbehaves,
+@roff_fill@the likely problem is accessing memory outside the allocated area.
+@roff_fill@.Pp
+@roff_fill@Alternatively, if the symptoms are not easy to reproduce, setting the
+@roff_fill@.Dq J
+@roff_fill@option may help provoke the problem.
+@roff_fill@.Pp
+@roff_stats@In truly difficult cases, the
+@roff_stats@.Dq U
+@roff_stats@option can provide a detailed trace of all calls made to these
+@roff_stats@functions.
+@roff_stats@.Pp
 Unfortunately this implementation does not provide much detail about
 the problems it detects; the performance impact for storing such information
 would be prohibitive.
@@ -476,7 +467,7 @@
 option is set, all warnings are treated as errors.
 .Pp
 The
-.Va _malloc_message
+.Va jemalloc_message
 variable allows the programmer to override the function which emits
 the text strings forming the errors and warnings if for some reason
 the
@@ -486,7 +477,7 @@
 this function is likely to result in a crash or deadlock.
 .Pp
 All messages are prefixed by
-.Dq Ao Ar progname Ac Ns Li : (malloc) .
+.Dq <jemalloc>: .
 .Sh RETURN VALUES
 The
 .Fn malloc
@@ -564,15 +555,12 @@
 jemalloc_options = "X";
 .Ed
 .Sh SEE ALSO
-.Xr limits 1 ,
 .Xr madvise 2 ,
 .Xr mmap 2 ,
 .Xr sbrk 2 ,
 .Xr alloca 3 ,
 .Xr atexit 3 ,
-.Xr getpagesize 3 ,
-.Xr memory 3 ,
-.Xr posix_memalign 3
+.Xr getpagesize 3
 .Sh STANDARDS
 The
 .Fn malloc ,
diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c
index 71b09c4..65ce18e 100644
--- a/jemalloc/src/jemalloc.c
+++ b/jemalloc/src/jemalloc.c
@@ -1178,8 +1178,8 @@
 static unsigned	malloc_ncpus(void);
 static bool	malloc_init_hard(void);
 static void	thread_cleanup(void *arg);
-void		jemalloc_prefork(void);
-void		jemalloc_postfork(void);
+static void	jemalloc_prefork(void);
+static void	jemalloc_postfork(void);
 
 /*
  * End function prototypes.
@@ -1231,9 +1231,10 @@
 #  define assert(e) do {						\
 	if (!(e)) {							\
 		char line_buf[UMAX2S_BUFSIZE];				\
-		jemalloc_message(__FILE__, ":", umax2s(__LINE__,	\
-		    line_buf), ": Failed assertion: ");			\
-		jemalloc_message("\"", #e, "\"\n", "");			\
+		jemalloc_message("<jemalloc>: ", __FILE__, ":",		\
+		    umax2s(__LINE__, line_buf));			\
+		jemalloc_message(": Failed assertion: ", "\"", #e,	\
+		    "\"\n");						\
 		abort();						\
 	}								\
 } while (0)
@@ -1250,15 +1251,17 @@
 	assert(len == sizeof(malloc_utrace_t));
 
 	if (ut->p == NULL && ut->s == 0 && ut->r == NULL)
-		malloc_printf("%d x USER malloc_init()\n", getpid());
+		malloc_printf("<jemalloc>:utrace: %d malloc_init()\n",
+		    getpid());
 	else if (ut->p == NULL && ut->r != NULL) {
-		malloc_printf("%d x USER %p = malloc(%zu)\n", getpid(), ut->r,
-		    ut->s);
+		malloc_printf("<jemalloc>:utrace: %d %p = malloc(%zu)\n",
+		    getpid(), ut->r, ut->s);
 	} else if (ut->p != NULL && ut->r != NULL) {
-		malloc_printf("%d x USER %p = realloc(%p, %zu)\n", getpid(),
-		    ut->r, ut->p, ut->s);
+		malloc_printf("<jemalloc>:utrace: %d %p = realloc(%p, %zu)\n",
+		    getpid(), ut->r, ut->p, ut->s);
 	} else
-		malloc_printf("%d x USER free(%p)\n", getpid(), ut->p);
+		malloc_printf("<jemalloc>:utrace: %d free(%p)\n", getpid(),
+		    ut->p);
 
 	return (0);
 }
@@ -2247,11 +2250,6 @@
 	 * introduces a bootstrapping issue.
 	 */
 #ifndef NO_TLS
-	if (isthreaded == false) {
-	    /* Avoid the overhead of TLS for single-threaded operation. */
-	    return (arenas[0]);
-	}
-
 	ret = arenas_map;
 	if (ret == NULL) {
 		ret = choose_arena_hard();
@@ -3405,11 +3403,9 @@
 }
 
 static inline void *
-arena_malloc(arena_t *arena, size_t size, bool zero)
+arena_malloc(size_t size, bool zero)
 {
 
-	assert(arena != NULL);
-	assert(arena->magic == ARENA_MAGIC);
 	assert(size != 0);
 	assert(QUANTUM_CEILING(size) <= arena_maxclass);
 
@@ -3418,7 +3414,7 @@
 		if (opt_mag) {
 			mag_rack_t *rack = mag_rack;
 			if (rack == NULL) {
-				rack = mag_rack_create(arena);
+				rack = mag_rack_create(choose_arena());
 				if (rack == NULL)
 					return (NULL);
 				mag_rack = rack;
@@ -3427,9 +3423,9 @@
 			return (mag_rack_alloc(rack, size, zero));
 		} else
 #endif
-			return (arena_malloc_small(arena, size, zero));
+			return (arena_malloc_small(choose_arena(), size, zero));
 	} else
-		return (arena_malloc_large(arena, size, zero));
+		return (arena_malloc_large(choose_arena(), size, zero));
 }
 
 static inline void *
@@ -3439,7 +3435,7 @@
 	assert(size != 0);
 
 	if (size <= arena_maxclass)
-		return (arena_malloc(choose_arena(), size, false));
+		return (arena_malloc(size, false));
 	else
 		return (huge_malloc(size, false));
 }
@@ -3449,7 +3445,7 @@
 {
 
 	if (size <= arena_maxclass)
-		return (arena_malloc(choose_arena(), size, true));
+		return (arena_malloc(size, true));
 	else
 		return (huge_malloc(size, true));
 }
@@ -3553,7 +3549,7 @@
 
 	if (ceil_size <= PAGE_SIZE || (alignment <= PAGE_SIZE
 	    && ceil_size <= arena_maxclass))
-		ret = arena_malloc(choose_arena(), ceil_size, false);
+		ret = arena_malloc(ceil_size, false);
 	else {
 		size_t run_size;
 
@@ -4113,7 +4109,7 @@
 	 * need to move the object.  In that case, fall back to allocating new
 	 * space and copying.
 	 */
-	ret = arena_malloc(choose_arena(), size, false);
+	ret = arena_malloc(size, false);
 	if (ret == NULL)
 		return (NULL);
 
@@ -5725,7 +5721,7 @@
  * is threaded here.
  */
 
-void
+static void
 jemalloc_prefork(void)
 {
 	bool again;
@@ -5773,7 +5769,7 @@
 #endif
 }
 
-void
+static void
 jemalloc_postfork(void)
 {
 	unsigned i;
diff --git a/jemalloc/src/jemalloc.h b/jemalloc/src/jemalloc.h
index dbff468..21b8de5 100644
--- a/jemalloc/src/jemalloc.h
+++ b/jemalloc/src/jemalloc.h
@@ -28,10 +28,24 @@
  *******************************************************************************
  */
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef JEMALLOC_H_
+#define	JEMALLOC_H_
+
+#include "jemalloc_defs.h"
+
+size_t	malloc_usable_size(const void *ptr);
+
 extern const char	*jemalloc_options;
 extern void		(*jemalloc_message)(const char *p1, const char *p2,
     const char *p3, const char *p4);
 
-void	jemalloc_thread_cleanup(void);
-void	jemalloc_prefork(void);
-void	jemalloc_postfork(void);
+#endif /* JEMALLOC_H_ */
+
+#ifdef __cplusplus
+};
+#endif
+
diff --git a/jemalloc/src/jemalloc_defs.h.in b/jemalloc/src/jemalloc_defs.h.in
index 6ca6018..eae3d0a 100644
--- a/jemalloc/src/jemalloc_defs.h.in
+++ b/jemalloc/src/jemalloc_defs.h.in
@@ -28,6 +28,14 @@
  *******************************************************************************
  */
 
+#ifndef JEMALLOC_DEFS_H_
+#define	JEMALLOC_DEFS_H_
+
+/*
+ * jemalloc version string.
+ */
+#undef JEMALLOC_VERSION
+
 /*
  * Hyper-threaded CPUs may need a special instruction inside spin loops in
  * order to yield to another virtual CPU.
@@ -92,3 +100,5 @@
 
 /* sizeof(void *) == 2^SIZEOF_PTR_2POW. */
 #undef SIZEOF_PTR_2POW
+
+#endif /* JEMALLOC_DEFS_H_ */