Optimize Valgrind integration.

Forcefully disable tcache if running inside Valgrind, and remove
Valgrind calls in tcache-specific code.

Restructure Valgrind-related code to move most Valgrind calls out of the
fast-path functions.

Take advantage of static knowledge to elide some branches in
JEMALLOC_VALGRIND_REALLOC().
diff --git a/Makefile.in b/Makefile.in
index f7aa7d8..e411804 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -48,6 +48,7 @@
 cfgoutputs_out := @cfgoutputs_out@
 enable_autogen := @enable_autogen@
 enable_code_coverage := @enable_code_coverage@
+enable_valgrind := @enable_valgrind@
 enable_zone_allocator := @enable_zone_allocator@
 DSO_LDFLAGS = @DSO_LDFLAGS@
 SOREV = @SOREV@
@@ -82,6 +83,9 @@
 	$(srcroot)src/mb.c $(srcroot)src/mutex.c $(srcroot)src/prof.c \
 	$(srcroot)src/quarantine.c $(srcroot)src/rtree.c $(srcroot)src/stats.c \
 	$(srcroot)src/tcache.c $(srcroot)src/util.c $(srcroot)src/tsd.c
+ifeq ($(enable_valgrind), 1)
+C_SRCS += $(srcroot)src/valgrind.c
+endif
 ifeq ($(enable_zone_allocator), 1)
 C_SRCS += $(srcroot)src/zone.c
 endif