Various heap profiling improvements.

Add the --disable-prof-libgcc configure option, and add backtracing
based on libgcc, which is used by default.

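Fix a bug in hash(): the trailing (len & 7) bytes were read from the start
of the key instead of from the position just past the last full 8-byte
word.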

Fix various configuration-dependent compilation errors.
diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL
index 057d8d5..c02e252 100644
--- a/jemalloc/INSTALL
+++ b/jemalloc/INSTALL
@@ -48,12 +48,16 @@
     Enable heap profiling and leak detection functionality.  Use the 'B', 'F',
     'I', 'L', and 'U' options to control these features.
 
+--disable-prof-libgcc
+    Disable the use of libgcc's backtracing functionality.  Ordinarily, libgcc's
+    backtracing functionality is superior to the alternatives, but it may fail
+    to capture backtraces on some systems.
+
 --enable-prof-libunwind
     Use the libunwind library (http://www.nongnu.org/libunwind/) for stack
-    backtracing, rather than frame pointers.  libunwind is quite slow in
-    comparison to frame pointer-based backtracing, but it has the advantage of
-    working on applications/libraries that were compiled with
-    -fomit-frame-pointer.
+    backtracing.  libunwind is quite slow, but it tends to work across a wider
+    variety of system configurations than the default backtracing code, which is
+    based on libgcc functionality or gcc intrinsics.
 
 --disable-tiny
     Disable tiny (sub-quantum-sized) object support.  Technically it is not
diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac
index 1c4e335..17c7aa6 100644
--- a/jemalloc/configure.ac
+++ b/jemalloc/configure.ac
@@ -373,6 +373,17 @@
 ],
 [enable_prof="0"]
 )
+AC_ARG_ENABLE([prof-libgcc],
+  [AS_HELP_STRING([--disable-prof-libgcc],
+  [Do not use libgcc for backtracing])],
+[if test "x$enable_prof_libgcc" = "xno" ; then
+  enable_prof_libgcc="0"
+else
+  enable_prof_libgcc="1"
+fi
+],
+[enable_prof_libgcc="1"]
+)
 AC_ARG_ENABLE([prof-libunwind],
   [AS_HELP_STRING([--enable-prof-libunwind], [Use libunwind for backtracing])],
 [if test "x$enable_prof_libunwind" = "xno" ; then
@@ -682,6 +693,18 @@
 fi
 AC_SUBST([roff_prof])
 
+dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics
+dnl for backtracing.
+if test "x$enable_prof" = "x1" -a "x$enable_prof_libunwind" = "x0" \
+ -a "x$GCC" = "xyes" -a "x$enable_prof_libgcc" = "x1" ; then
+  enable_prof_libgcc="1"
+  AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"])
+  AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"])
+  if test "x${enable_prof_libgcc}" = "x1" ; then
+    AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ])
+  fi
+fi
+
 dnl ============================================================================
 dnl Configure libgd for mtrgraph.
 bins="${objroot}bin/jemtr2mtr${install_suffix}"
diff --git a/jemalloc/src/internal/hash.h b/jemalloc/src/internal/hash.h
index 182babd..d12cdb8 100644
--- a/jemalloc/src/internal/hash.h
+++ b/jemalloc/src/internal/hash.h
@@ -30,8 +30,10 @@
 	const int r = 47;
 	uint64_t h = seed ^ (len * m);
 	const uint64_t *data = (const uint64_t *)key;
-	const unsigned char *data2 = (const unsigned char*)data;
 	const uint64_t *end = data + (len/8);
+	const unsigned char *data2;
+
+	assert(((uintptr_t)key & 0x7) == 0);
 
 	while(data != end) {
 		uint64_t k = *data++;
@@ -44,6 +46,7 @@
 		h *= m;
 	}
 
+	data2 = (const unsigned char *)data;
 	switch(len & 7) {
 		case 7: h ^= ((uint64_t)(data2[6])) << 48;
 		case 6: h ^= ((uint64_t)(data2[5])) << 40;
diff --git a/jemalloc/src/internal/jemalloc_chunk.h b/jemalloc/src/internal/jemalloc_chunk.h
index 40541e7..00b2e1d 100644
--- a/jemalloc/src/internal/jemalloc_chunk.h
+++ b/jemalloc/src/internal/jemalloc_chunk.h
@@ -32,7 +32,7 @@
 extern bool		opt_overcommit;
 #endif
 
-#ifdef JEMALLOC_STATS
+#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
 /* Protects stats_chunks; currently not used for any other purpose. */
 extern malloc_mutex_t	chunks_mtx;
 /* Chunk statistics. */
diff --git a/jemalloc/src/internal/jemalloc_internal.h.in b/jemalloc/src/internal/jemalloc_internal.h.in
index 88e33e3..8f52fa3 100644
--- a/jemalloc/src/internal/jemalloc_internal.h.in
+++ b/jemalloc/src/internal/jemalloc_internal.h.in
@@ -32,10 +32,8 @@
 #endif
 
 #include "internal/rb.h"
-#if (defined(JEMALLOC_TCACHE) && defined(JEMALLOC_STATS))
 #include "internal/qr.h"
 #include "internal/ql.h"
-#endif
 
 extern void	(*JEMALLOC_P(malloc_message))(void *w4opaque, const char *p1,
     const char *p2, const char *p3, const char *p4);
@@ -106,7 +104,7 @@
 #ifdef __sparc64__
 #  define LG_QUANTUM		4
 #endif
-#ifdef __amd64__
+#if (defined(__amd64__) || defined(__x86_64__))
 #  define LG_QUANTUM		4
 #endif
 #ifdef __arm__
@@ -172,7 +170,6 @@
 	(((s) + PAGE_MASK) & ~PAGE_MASK)
 
 #include "internal/prn.h"
-#include "internal/hash.h"
 #include "internal/mb.h"
 #include "internal/ckh.h"
 #include "internal/jemalloc_stats.h"
@@ -185,6 +182,7 @@
 #include "internal/jemalloc_huge.h"
 #include "internal/jemalloc_tcache.h"
 #include "internal/jemalloc_trace.h"
+#include "internal/hash.h"
 #include "internal/prof.h"
 
 #undef JEMALLOC_H_TYPES
@@ -192,7 +190,6 @@
 #define JEMALLOC_H_STRUCTS
 
 #include "internal/prn.h"
-#include "internal/hash.h"
 #include "internal/mb.h"
 #include "internal/ckh.h"
 #include "internal/jemalloc_stats.h"
@@ -205,6 +202,7 @@
 #include "internal/jemalloc_huge.h"
 #include "internal/jemalloc_tcache.h"
 #include "internal/jemalloc_trace.h"
+#include "internal/hash.h"
 #include "internal/prof.h"
 
 #undef JEMALLOC_H_STRUCTS
@@ -255,7 +253,6 @@
 #endif
 
 #include "internal/prn.h"
-#include "internal/hash.h"
 #include "internal/mb.h"
 #include "internal/ckh.h"
 #include "internal/jemalloc_stats.h"
@@ -268,6 +265,7 @@
 #include "internal/jemalloc_huge.h"
 #include "internal/jemalloc_tcache.h"
 #include "internal/jemalloc_trace.h"
+#include "internal/hash.h"
 #include "internal/prof.h"
 
 #undef JEMALLOC_H_EXTERNS
@@ -275,7 +273,6 @@
 #define JEMALLOC_H_INLINES
 
 #include "internal/prn.h"
-#include "internal/hash.h"
 #include "internal/mb.h"
 #include "internal/ckh.h"
 #include "internal/jemalloc_stats.h"
@@ -376,6 +373,7 @@
 #include "internal/jemalloc_tcache.h"
 #include "internal/jemalloc_arena.h"
 #include "internal/jemalloc_trace.h"
+#include "internal/hash.h"
 #include "internal/prof.h"
 
 #ifndef JEMALLOC_ENABLE_INLINE
diff --git a/jemalloc/src/internal/jemalloc_stats.h b/jemalloc/src/internal/jemalloc_stats.h
index 12f0676..36dc5fe 100644
--- a/jemalloc/src/internal/jemalloc_stats.h
+++ b/jemalloc/src/internal/jemalloc_stats.h
@@ -8,6 +8,8 @@
 typedef struct malloc_bin_stats_s malloc_bin_stats_t;
 typedef struct malloc_large_stats_s malloc_large_stats_t;
 typedef struct arena_stats_s arena_stats_t;
+#endif
+#if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
 typedef struct chunk_stats_s chunk_stats_t;
 #endif
 
diff --git a/jemalloc/src/internal/mb.h b/jemalloc/src/internal/mb.h
index 0a272e7..1707aa9 100644
--- a/jemalloc/src/internal/mb.h
+++ b/jemalloc/src/internal/mb.h
@@ -54,7 +54,7 @@
 	    );
 #endif
 }
-#elif defined(__amd64_)
+#elif (defined(__amd64__) || defined(__x86_64__))
 JEMALLOC_INLINE void
 mb_write(void)
 {
diff --git a/jemalloc/src/internal/prof.h b/jemalloc/src/internal/prof.h
index 326d558..1721ad8 100644
--- a/jemalloc/src/internal/prof.h
+++ b/jemalloc/src/internal/prof.h
@@ -35,6 +35,15 @@
 	unsigned		len;
 };
 
+#ifdef JEMALLOC_PROF_LIBGCC
+/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
+typedef struct {
+	prof_bt_t *bt;
+	unsigned nignore;
+	unsigned max;
+} prof_unwind_data_t;
+#endif
+
 struct prof_cnt_s {
 	/*
 	 * Profiling counters.  An allocation/deallocation pair can operate on
diff --git a/jemalloc/src/jemalloc_defs.h.in b/jemalloc/src/jemalloc_defs.h.in
index 247b596..942694f 100644
--- a/jemalloc/src/jemalloc_defs.h.in
+++ b/jemalloc/src/jemalloc_defs.h.in
@@ -54,6 +54,9 @@
 /* Use libunwind for profile backtracing if defined. */
 #undef JEMALLOC_PROF_LIBUNWIND
 
+/* Use libgcc for profile backtracing if defined. */
+#undef JEMALLOC_PROF_LIBGCC
+
 /*
  * JEMALLOC_TINY enables support for tiny objects, which are smaller than one
  * quantum.
diff --git a/jemalloc/src/jemalloc_stats.c b/jemalloc/src/jemalloc_stats.c
index 580f0fe..b0efe74 100644
--- a/jemalloc/src/jemalloc_stats.c
+++ b/jemalloc/src/jemalloc_stats.c
@@ -219,8 +219,8 @@
 			    size_t);
 			if (config_tcache) {
 				malloc_cprintf(write4, w4opaque,
-				    "%13u %1s %5u %4u %3u %10"PRIu64" %9"PRIu64
-				    " %9"PRIu64" %9"PRIu64""
+				    "%13u %1s %5zu %4u %3zu %10"PRIu64
+				    " %9"PRIu64" %9"PRIu64" %9"PRIu64""
 				    " %9"PRIu64" %7zu %7zu\n",
 				    j,
 				    j < ntbins_ ? "T" : j < ntbins_ + nqbins ?
@@ -232,8 +232,8 @@
 				    highruns, curruns);
 			} else {
 				malloc_cprintf(write4, w4opaque,
-				    "%13u %1s %5u %4u %3u %10"PRIu64" %9"PRIu64
-				    " %9"PRIu64" %7zu %7zu\n",
+				    "%13u %1s %5zu %4u %3zu %10"PRIu64
+				    " %9"PRIu64" %9"PRIu64" %7zu %7zu\n",
 				    j,
 				    j < ntbins_ ? "T" : j < ntbins_ + nqbins ?
 				    "Q" : j < ntbins_ + nqbins + ncbins ? "C" :
diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c
index a7d9cc3..db56659 100644
--- a/jemalloc/src/prof.c
+++ b/jemalloc/src/prof.c
@@ -3,6 +3,10 @@
 #ifdef JEMALLOC_PROF
 /******************************************************************************/
 
+#ifdef JEMALLOC_PROF_LIBGCC
+#include <unwind.h>
+#endif
+
 #ifdef JEMALLOC_PROF_LIBUNWIND
 #define	UNW_LOCAL_ONLY
 #include <libunwind.h>
@@ -82,7 +86,13 @@
 
 static prof_bt_t	*bt_dup(prof_bt_t *bt);
 static void	bt_init(prof_bt_t *bt, void **vec);
-static bool	prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
+#ifdef JEMALLOC_PROF_LIBGCC
+static _Unwind_Reason_Code	prof_unwind_init_callback(
+    struct _Unwind_Context *context, void *arg);
+static _Unwind_Reason_Code	prof_unwind_callback(
+    struct _Unwind_Context *context, void *arg);
+#endif
+static void	prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
 static prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
 static void	prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt);
 static void	prof_flush(void);
@@ -160,8 +170,40 @@
 		prof_udump();
 }
 
-#ifdef JEMALLOC_PROF_LIBUNWIND
-static bool
+#ifdef JEMALLOC_PROF_LIBGCC
+static _Unwind_Reason_Code
+prof_unwind_init_callback(struct _Unwind_Context *context, void *arg)
+{
+
+	return (_URC_NO_REASON);
+}
+
+static _Unwind_Reason_Code
+prof_unwind_callback(struct _Unwind_Context *context, void *arg)
+{
+	prof_unwind_data_t *data = (prof_unwind_data_t *)arg;
+
+	if (data->nignore > 0)
+		data->nignore--;
+	else {
+		data->bt->vec[data->bt->len] = (void *)_Unwind_GetIP(context);
+		data->bt->len++;
+		if (data->bt->len == data->max)
+			return (_URC_END_OF_STACK);
+	}
+
+	return (_URC_NO_REASON);
+}
+
+static void
+prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
+{
+	prof_unwind_data_t data = {bt, nignore, max};
+
+	_Unwind_Backtrace(prof_unwind_callback, &data);
+}
+#elif defined(JEMALLOC_PROF_LIBUNWIND)
+static void
 prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
 {
 	unw_context_t uc;
@@ -180,7 +222,7 @@
 	for (i = 0; i < nignore + 1; i++) {
 		err = unw_step(&cursor);
 		if (err <= 0)
-			return (false);
+			return;
 	}
 
 	/*
@@ -195,11 +237,9 @@
 			break;
 		}
 	}
-
-	return (false);
 }
 #else
-static bool
+static void
 prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
 {
 #define	NIGNORE	3
@@ -207,16 +247,16 @@
 	if ((i) < NIGNORE + max) {					\
 		void *p;						\
 		if (__builtin_frame_address(i) == 0)			\
-			return (false);					\
+			return;						\
 		p = __builtin_return_address(i);			\
 		if (p == NULL)						\
-			return (false);					\
+			return;						\
 		if (i >= NIGNORE) {					\
 			bt->vec[(i) - NIGNORE] = p;			\
 			bt->len = (i) - NIGNORE + 1;			\
 		}							\
 	} else								\
-		return (false);
+		return;
 
 	assert(max <= (1U << opt_lg_prof_bt_max));
 
@@ -376,9 +416,7 @@
 	BT_FRAME(128)
 	BT_FRAME(129)
 	BT_FRAME(130)
-
 #undef BT_FRAME
-	return (false);
 }
 #endif
 
@@ -1039,6 +1077,14 @@
 		}
 	}
 
+#ifdef JEMALLOC_PROF_LIBGCC
+	/*
+	 * Cause the backtracing machinery to allocate its internal state
+	 * before enabling profiling.
+	 */
+	_Unwind_Backtrace(prof_unwind_init_callback, NULL);
+#endif
+
 	prof_booted = true;
 
 	return (false);