Fix off-by-one backtracing issues.

Rewrite prof_alloc_prep() as a C preprocessor macro,
PROF_ALLOC_PREP(), in order to remove any doubt as to whether an
additional stack frame is created.  Prior to this change, it was
assumed that inlining would reduce the total number of frames in the
backtrace, but in practice the behavior wasn't completely predictable.

Create imemalign() and call it from posix_memalign(), memalign(), and
valloc(), so that all entry points require the same number of stack
frames to be ignored during backtracing.
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 4d10e90..14a0c7c 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -84,6 +84,7 @@
     const char *v, size_t vlen);
 static void	malloc_conf_init(void);
 static bool	malloc_init_hard(void);
+static int	imemalign(void **memptr, size_t alignment, size_t size);
 
 /******************************************************************************/
 /* malloc_message() setup. */
@@ -939,7 +940,8 @@
 #ifdef JEMALLOC_PROF
 	if (opt_prof) {
 		usize = s2u(size);
-		if ((cnt = prof_alloc_prep(usize)) == NULL) {
+		PROF_ALLOC_PREP(1, usize, cnt);
+		if (cnt == NULL) {
 			ret = NULL;
 			goto OOM;
 		}
@@ -988,9 +990,15 @@
 }
 
 JEMALLOC_ATTR(nonnull(1))
-JEMALLOC_ATTR(visibility("default"))
-int
-JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size)
+#ifdef JEMALLOC_PROF
+/*
+ * Avoid any uncertainty as to how many backtrace frames to ignore in 
+ * PROF_ALLOC_PREP().
+ */
+JEMALLOC_ATTR(noinline)
+#endif
+static int
+imemalign(void **memptr, size_t alignment, size_t size)
 {
 	int ret;
 	size_t usize
@@ -1057,7 +1065,8 @@
 
 #ifdef JEMALLOC_PROF
 		if (opt_prof) {
-			if ((cnt = prof_alloc_prep(usize)) == NULL) {
+			PROF_ALLOC_PREP(2, usize, cnt);
+			if (cnt == NULL) {
 				result = NULL;
 				ret = EINVAL;
 			} else {
@@ -1110,6 +1119,15 @@
 	return (ret);
 }
 
+JEMALLOC_ATTR(nonnull(1))
+JEMALLOC_ATTR(visibility("default"))
+int
+JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size)
+{
+
+	return imemalign(memptr, alignment, size);
+}
+
 JEMALLOC_ATTR(malloc)
 JEMALLOC_ATTR(visibility("default"))
 void *
@@ -1165,7 +1183,8 @@
 #ifdef JEMALLOC_PROF
 	if (opt_prof) {
 		usize = s2u(num_size);
-		if ((cnt = prof_alloc_prep(usize)) == NULL) {
+		PROF_ALLOC_PREP(1, usize, cnt);
+		if (cnt == NULL) {
 			ret = NULL;
 			goto RETURN;
 		}
@@ -1278,7 +1297,8 @@
 		if (opt_prof) {
 			usize = s2u(size);
 			old_ctx = prof_ctx_get(ptr);
-			if ((cnt = prof_alloc_prep(usize)) == NULL) {
+			PROF_ALLOC_PREP(1, usize, cnt);
+			if (cnt == NULL) {
 				ret = NULL;
 				goto OOM;
 			}
@@ -1327,7 +1347,8 @@
 #ifdef JEMALLOC_PROF
 			if (opt_prof) {
 				usize = s2u(size);
-				if ((cnt = prof_alloc_prep(usize)) == NULL)
+				PROF_ALLOC_PREP(1, usize, cnt);
+				if (cnt == NULL)
 					ret = NULL;
 				else {
 					if (prof_promote && (uintptr_t)cnt !=
@@ -1432,7 +1453,7 @@
 #ifdef JEMALLOC_CC_SILENCE
 	int result =
 #endif
-	    JEMALLOC_P(posix_memalign)(&ret, alignment, size);
+	    imemalign(&ret, alignment, size);
 #ifdef JEMALLOC_CC_SILENCE
 	if (result != 0)
 		return (NULL);
@@ -1451,7 +1472,7 @@
 #ifdef JEMALLOC_CC_SILENCE
 	int result =
 #endif
-	    JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size);
+	    imemalign(&ret, PAGE_SIZE, size);
 #ifdef JEMALLOC_CC_SILENCE
 	if (result != 0)
 		return (NULL);
@@ -1573,7 +1594,8 @@
 
 #ifdef JEMALLOC_PROF
 	if (opt_prof) {
-		if ((cnt = prof_alloc_prep(usize)) == NULL)
+		PROF_ALLOC_PREP(1, usize, cnt);
+		if (cnt == NULL)
 			goto OOM;
 		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <=
 		    small_maxclass) {
@@ -1660,7 +1682,7 @@
 		/*
 		 * usize isn't knowable before iralloc() returns when extra is
 		 * non-zero.  Therefore, compute its maximum possible value and
-		 * use that in prof_alloc_prep() to decide whether to capture a
+		 * use that in PROF_ALLOC_PREP() to decide whether to capture a
 		 * backtrace.  prof_realloc() will use the actual usize to
 		 * decide whether to sample.
 		 */
@@ -1668,7 +1690,8 @@
 		    sa2u(size+extra, alignment, NULL);
 		old_size = isalloc(p);
 		old_ctx = prof_ctx_get(p);
-		if ((cnt = prof_alloc_prep(max_usize)) == NULL)
+		PROF_ALLOC_PREP(1, max_usize, cnt);
+		if (cnt == NULL)
 			goto OOM;
 		/*
 		 * Use minimum usize to determine whether promotion may happen.