Add support for sized deallocation.

This adds a new `sdallocx` function to the external API, allowing the
size to be passed by the caller.  It avoids some extra reads in the
thread cache fast path.  In the case where stats are enabled, this
avoids the work of calculating the size from the pointer.

An assertion validates the size that's passed in, so enabling debugging
will allow users of the API to debug cases where an incorrect size is
passed in.

The performance win for a contrived microbenchmark doing an allocation
and immediately freeing it is ~10%.  It may have a different impact on a
real workload.

Closes #28
diff --git a/Makefile.in b/Makefile.in
index 1446dbe..ac56d8f 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -136,6 +136,7 @@
 	$(srcroot)test/unit/prof_accum_b.c
 TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \
 	$(srcroot)test/integration/allocated.c \
+	$(srcroot)test/integration/sdallocx.c \
 	$(srcroot)test/integration/mallocx.c \
 	$(srcroot)test/integration/MALLOCX_ARENA.c \
 	$(srcroot)test/integration/posix_memalign.c \
diff --git a/configure.ac b/configure.ac
index ce4af21..d221876 100644
--- a/configure.ac
+++ b/configure.ac
@@ -452,7 +452,7 @@
 AC_PATH_PROG([LD], [ld], [false], [$PATH])
 AC_PATH_PROG([AUTOCONF], [autoconf], [false], [$PATH])
 
-public_syms="malloc_conf malloc_message malloc calloc posix_memalign aligned_alloc realloc free mallocx rallocx xallocx sallocx dallocx nallocx mallctl mallctlnametomib mallctlbymib malloc_stats_print malloc_usable_size"
+public_syms="malloc_conf malloc_message malloc calloc posix_memalign aligned_alloc realloc free mallocx rallocx xallocx sallocx dallocx sdallocx nallocx mallctl mallctlnametomib mallctlbymib malloc_stats_print malloc_usable_size"
 
 dnl Check for allocator-related functions that should be wrapped.
 AC_CHECK_FUNC([memalign],
diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in
index 8f4327f..e5c229f 100644
--- a/doc/jemalloc.xml.in
+++ b/doc/jemalloc.xml.in
@@ -38,6 +38,7 @@
     <refname>xallocx</refname>
     <refname>sallocx</refname>
     <refname>dallocx</refname>
+    <refname>sdallocx</refname>
     <refname>nallocx</refname>
     <refname>mallctl</refname>
     <refname>mallctlnametomib</refname>
@@ -121,6 +122,12 @@
           <paramdef>int <parameter>flags</parameter></paramdef>
         </funcprototype>
         <funcprototype>
+          <funcdef>void <function>sdallocx</function></funcdef>
+          <paramdef>void *<parameter>ptr</parameter></paramdef>
+          <paramdef>size_t <parameter>size</parameter></paramdef>
+          <paramdef>int <parameter>flags</parameter></paramdef>
+        </funcprototype>
+        <funcprototype>
           <funcdef>size_t <function>nallocx</function></funcdef>
           <paramdef>size_t <parameter>size</parameter></paramdef>
           <paramdef>int <parameter>flags</parameter></paramdef>
@@ -228,7 +235,8 @@
       <function>rallocx<parameter/></function>,
       <function>xallocx<parameter/></function>,
       <function>sallocx<parameter/></function>,
-      <function>dallocx<parameter/></function>, and
+      <function>dallocx<parameter/></function>,
+      <function>sdallocx<parameter/></function>, and
       <function>nallocx<parameter/></function> functions all have a
       <parameter>flags</parameter> argument that can be used to specify
       options.  The functions only check the options that are contextually
@@ -312,6 +320,15 @@
       memory referenced by <parameter>ptr</parameter> to be made available for
       future allocations.</para>
 
+      <para>The <function>sdallocx<parameter/></function> function is an
+      extension of <function>dallocx<parameter/></function> with a
+      <parameter>size</parameter> parameter to allow the caller to pass in the
+      allocation size as an optimization.  The minimum valid input size is the
+      original requested size of the allocation, and the maximum valid input
+      size is the corresponding value returned by
+      <function>nallocx<parameter/></function> or
+      <function>sallocx<parameter/></function>.</para>
+
       <para>The <function>nallocx<parameter/></function> function allocates no
       memory, but it performs the same size computation as the
       <function>mallocx<parameter/></function> function, and returns the real
diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
index 166d052..6ab0ae7 100644
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@@ -488,6 +488,7 @@
 void	*arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache);
 size_t	arena_salloc(const void *ptr, bool demote);
 void	arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache);
+void	arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_))
@@ -1139,9 +1140,7 @@
 	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
 		/* Small allocation. */
 		if (try_tcache && (tcache = tcache_get(false)) != NULL) {
-			size_t binind;
-
-			binind = arena_ptr_small_binind_get(ptr, mapbits);
+			size_t binind = arena_ptr_small_binind_get(ptr, mapbits);
 			tcache_dalloc_small(tcache, ptr, binind);
 		} else
 			arena_dalloc_small(chunk->arena, chunk, ptr, pageind);
@@ -1157,6 +1156,34 @@
 			arena_dalloc_large(chunk->arena, chunk, ptr);
 	}
 }
+
+JEMALLOC_ALWAYS_INLINE void
+arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache)
+{
+	tcache_t *tcache;
+
+	assert(ptr != NULL);
+	assert(CHUNK_ADDR2BASE(ptr) != ptr);
+
+	if (size < PAGE) {
+		/* Small allocation. */
+		if (try_tcache && (tcache = tcache_get(false)) != NULL) {
+			size_t binind = small_size2bin(size);
+			tcache_dalloc_small(tcache, ptr, binind);
+		} else {
+			size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
+			arena_dalloc_small(chunk->arena, chunk, ptr, pageind);
+		}
+	} else {
+		assert(((uintptr_t)ptr & PAGE_MASK) == 0);
+
+		if (try_tcache && size <= tcache_maxclass && (tcache =
+		    tcache_get(false)) != NULL) {
+			tcache_dalloc_large(tcache, ptr, size);
+		} else
+			arena_dalloc_large(chunk->arena, chunk, ptr);
+	}
+}
 #  endif /* JEMALLOC_ARENA_INLINE_C */
 #endif
 
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 59ae8d5..c0e326d 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -634,8 +634,10 @@
 size_t	u2rz(size_t usize);
 size_t	p2rz(const void *ptr);
 void	idalloct(void *ptr, bool try_tcache);
+void	isdalloct(void *ptr, size_t size, bool try_tcache);
 void	idalloc(void *ptr);
 void	iqalloc(void *ptr, bool try_tcache);
+void	isqalloc(void *ptr, size_t size, bool try_tcache);
 void	*iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra,
     size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc,
     arena_t *arena);
@@ -788,6 +790,20 @@
 }
 
 JEMALLOC_ALWAYS_INLINE void
+isdalloct(void *ptr, size_t size, bool try_tcache)
+{
+	arena_chunk_t *chunk;
+
+	assert(ptr != NULL);
+
+	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	if (chunk != ptr)
+		arena_sdalloc(chunk, ptr, size, try_tcache);
+	else
+		huge_dalloc(ptr);
+}
+
+JEMALLOC_ALWAYS_INLINE void
 idalloc(void *ptr)
 {
 
@@ -804,6 +820,16 @@
 		idalloct(ptr, try_tcache);
 }
 
+JEMALLOC_ALWAYS_INLINE void
+isqalloc(void *ptr, size_t size, bool try_tcache)
+{
+
+	if (config_fill && opt_quarantine)
+		quarantine(ptr);
+	else
+		isdalloct(ptr, size, try_tcache);
+}
+
 JEMALLOC_ALWAYS_INLINE void *
 iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra,
     size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc,
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 84f0591..3b990b0 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -61,6 +61,7 @@
 arena_redzone_corruption
 arena_run_regind
 arena_salloc
+arena_sdalloc
 arena_stats_merge
 arena_tcache_fill_small
 arenas
@@ -228,7 +229,9 @@
 iralloct
 iralloct_realign
 isalloc
+isdalloct
+isqalloc
 isthreaded
 ivsalloc
 ixalloc
 jemalloc_postfork_child
diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in
index b365eb4..f81adc1 100644
--- a/include/jemalloc/jemalloc_protos.h.in
+++ b/include/jemalloc/jemalloc_protos.h.in
@@ -25,6 +25,7 @@
 JEMALLOC_EXPORT size_t	@je_@sallocx(const void *ptr, int flags)
     JEMALLOC_ATTR(pure);
 JEMALLOC_EXPORT void	@je_@dallocx(void *ptr, int flags);
+JEMALLOC_EXPORT void	@je_@sdallocx(void *ptr, size_t size, int flags);
 JEMALLOC_EXPORT size_t	@je_@nallocx(size_t size, int flags)
     JEMALLOC_ATTR(pure);
 
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 71e921b..527782e 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1223,6 +1223,24 @@
 	JEMALLOC_VALGRIND_FREE(ptr, rzsize);
 }
 
+JEMALLOC_INLINE_C void
+isfree(void *ptr, size_t usize, bool try_tcache)
+{
+	UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0);
+
+	assert(ptr != NULL);
+	assert(malloc_initialized || IS_INITIALIZER);
+
+	if (config_prof && opt_prof)
+		prof_free(ptr, usize);
+	if (config_stats)
+		thread_allocated_tsd_get()->deallocated += usize;
+	if (config_valgrind && in_valgrind)
+		rzsize = p2rz(ptr);
+	isqalloc(ptr, usize, try_tcache);
+	JEMALLOC_VALGRIND_FREE(ptr, rzsize);
+}
+
 void *
 je_realloc(void *ptr, size_t size)
 {
@@ -1820,6 +1838,32 @@
 	ifree(ptr, try_tcache);
 }
 
+void
+je_sdallocx(void *ptr, size_t size, int flags)
+{
+	bool try_tcache;
+
+	assert(ptr != NULL);
+	assert(malloc_initialized || IS_INITIALIZER);
+
+	if ((flags & MALLOCX_LG_ALIGN_MASK) == 0)
+		size = s2u(size);
+	else
+		size = sa2u(size, MALLOCX_ALIGN_GET_SPECIFIED(flags));
+	assert(size == isalloc(ptr, config_prof));
+
+	if ((flags & MALLOCX_ARENA_MASK) != 0) {
+		unsigned arena_ind = MALLOCX_ARENA_GET(flags);
+		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+		try_tcache = (chunk == ptr || chunk->arena !=
+		    arenas[arena_ind]);
+	} else
+		try_tcache = true;
+
+	UTRACE(ptr, 0, 0);
+	isfree(ptr, size, try_tcache);
+}
+
 size_t
 je_nallocx(size_t size, int flags)
 {
diff --git a/test/integration/sdallocx.c b/test/integration/sdallocx.c
new file mode 100644
index 0000000..b84817d
--- /dev/null
+++ b/test/integration/sdallocx.c
@@ -0,0 +1,57 @@
+#include "test/jemalloc_test.h"
+
+#define	MAXALIGN (((size_t)1) << 25)
+#define	NITER 4
+
+TEST_BEGIN(test_basic)
+{
+	void *ptr = mallocx(64, 0);
+	sdallocx(ptr, 64, 0);
+}
+TEST_END
+
+TEST_BEGIN(test_alignment_and_size)
+{
+	size_t nsz, sz, alignment, total;
+	unsigned i;
+	void *ps[NITER];
+
+	for (i = 0; i < NITER; i++)
+		ps[i] = NULL;
+
+	for (alignment = 8;
+	    alignment <= MAXALIGN;
+	    alignment <<= 1) {
+		total = 0;
+		for (sz = 1;
+		    sz < 3 * alignment && sz < (1U << 31);
+		    sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
+			for (i = 0; i < NITER; i++) {
+				nsz = nallocx(sz, MALLOCX_ALIGN(alignment) |
+				    MALLOCX_ZERO);
+				ps[i] = mallocx(sz, MALLOCX_ALIGN(alignment) |
+				    MALLOCX_ZERO);
+				total += nsz;
+				if (total >= (MAXALIGN << 1))
+					break;
+			}
+			for (i = 0; i < NITER; i++) {
+				if (ps[i] != NULL) {
+					sdallocx(ps[i], sz,
+					    MALLOCX_ALIGN(alignment));
+					ps[i] = NULL;
+				}
+			}
+		}
+	}
+}
+TEST_END
+
+int
+main(void)
+{
+
+	return (test(
+	    test_basic,
+	    test_alignment_and_size));
+}
diff --git a/test/stress/microbench.c b/test/stress/microbench.c
index 60c02db..a8267c3 100644
--- a/test/stress/microbench.c
+++ b/test/stress/microbench.c
@@ -72,6 +72,17 @@
 	dallocx(p, 0);
 }
 
+static void
+malloc_sdallocx(void)
+{
+	void *p = malloc(1);
+	if (p == NULL) {
+		test_fail("Unexpected malloc() failure");
+		return;
+	}
+	sdallocx(p, 1, 0);
+}
+
 TEST_BEGIN(test_free_vs_dallocx)
 {
 
@@ -80,6 +91,14 @@
 }
 TEST_END
 
+TEST_BEGIN(test_dallocx_vs_sdallocx)
+{
+
+	compare_funcs(10*1000*1000, 100*1000*1000, "dallocx", malloc_dallocx,
+	    "sdallocx", malloc_sdallocx);
+}
+TEST_END
+
 static void
 malloc_mus_free(void)
 {
@@ -135,6 +154,7 @@
 	return (test(
 	    test_malloc_vs_mallocx,
 	    test_free_vs_dallocx,
+	    test_dallocx_vs_sdallocx,
 	    test_mus_vs_sallocx,
 	    test_sallocx_vs_nallocx));
 }