diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am
index 94587ff..4a3ecf8 100644
--- a/coregrind/Makefile.am
+++ b/coregrind/Makefile.am
@@ -39,7 +39,6 @@
 valgrind_LDADD=
 
 stage2_SOURCES = \
-	stage2.c \
 	ume.c \
 	x86/ume_entry.S \
 	x86/ume_go.c \
diff --git a/coregrind/docs/coregrind_core.html b/coregrind/docs/coregrind_core.html
index a558eea..6baf039 100644
--- a/coregrind/docs/coregrind_core.html
+++ b/coregrind/docs/coregrind_core.html
@@ -475,11 +475,6 @@
   valgrind --tool=<i>tool_name</i> [options-for-Valgrind] your-prog [options for your-prog]
 </pre>
 
-<p>Note that Valgrind also reads options from the environment variable
-<code>$VALGRIND_OPTS</code>, and processes them before the command-line
-options.  Options for the Valgrind core may be freely mixed with those
-for the selected tool.
-
 <p>Valgrind's default settings succeed in giving reasonable behaviour
 in most cases.  We group the available options by rough categories.
 
@@ -711,10 +706,7 @@
       etc, return 4-byte aligned addresses.  These are suitable for
       any accesses on x86 processors. 
       Some programs might however assume that <code>malloc</code> et
-      al return 8- or more aligned memory.
-      These programs are broken and should be fixed, but
-      if this is impossible for whatever reason the alignment can be
-      increased using this parameter.  The supplied value must be
+      al return 8- or more aligned memory.  The supplied value must be
       between 4 and 4096 inclusive, and must be a power of two.</li><br><p>
 
   <li><code>--sloppy-malloc=no</code> [default]<br>
@@ -940,6 +932,33 @@
       <p>
 </ul>
 
+<h4>Setting default options</h4>
+
+<p>Note that Valgrind also reads options from three places:
+<ul>
+<li>The file <code>~/.valgrindrc</code>
+<li>The environment variable <code>$VALGRIND_OPTS</code>
+<li>The file <code>./.valgrindrc</code>
+</ul>
+These are processed in the given order, before the command-line options.
+Options processed later override those processed earlier;  for example,
+options in <code>./.valgrindrc</code> will take precedence over those in
+<code>~/.valgrindrc</code>.  The first two are particularly useful for
+setting the default tool to use.
+<p>
+Any tool-specific options put in <code>$VALGRIND_OPTS</code> or the
+<code>.valgrindrc</code> files should be prefixed with the tool name and
+a colon.  For example, if you want Memcheck to always do leak checking,
+you can put the following entry in <code>~/.valgrindrc</code>:
+
+<pre>
+    --memcheck:leak-check=yes
+</pre>
+
+This will be ignored if any tool other than Memcheck is run.
+Without the <code>memcheck:</code> part, this will cause problems if you
+select other tools that don't understand <code>--leak-check=yes</code>.
+
 
 <a name="clientreq"></a>
 <h3>2.7&nbsp; The Client Request mechanism</h3>
@@ -1151,7 +1170,7 @@
 
 <p>The translator/instrumentor has a lot of assertions in it.  They
 are permanently enabled, and I have no plans to disable them.  If one
-of these breaks, please mail me!
+of these breaks, please mail us!
 
 <p>If you get an assertion failure on the expression
 <code>chunkSane(ch)</code> in <code>vg_free()</code> in
diff --git a/coregrind/stage1.c b/coregrind/stage1.c
index 7056784..1b91f84 100644
--- a/coregrind/stage1.c
+++ b/coregrind/stage1.c
@@ -1,6 +1,6 @@
 
 /*--------------------------------------------------------------------*/
-/*--- Startup: stage 1                                    stage1.c ---*/
+/*--- Startup: preliminaries                              stage1.c ---*/
 /*--------------------------------------------------------------------*/
 
 /*
diff --git a/coregrind/ume.c b/coregrind/ume.c
index 2a12f01..d0a5f5a 100644
--- a/coregrind/ume.c
+++ b/coregrind/ume.c
@@ -590,8 +590,11 @@
    }
    
    info->argv0 = strdup(interp);
-   if (arg != NULL && *arg != '\0')
+   assert(NULL != info->argv0);
+   if (arg != NULL && *arg != '\0') {
       info->argv1 = strdup(arg);
+      assert(NULL != info->argv1);
+   }
 
    if (info->argv && info->argv[0] != NULL)
       info->argv[0] = (char *)name;
diff --git a/coregrind/ume.h b/coregrind/ume.h
index a1647dd..286e356 100644
--- a/coregrind/ume.h
+++ b/coregrind/ume.h
@@ -57,10 +57,10 @@
    addr_t	brkbase;	/* base address of brk segment		*/
 
    /* these are the extra args added by #! scripts */
-   char		*argv0;		/* the interpreter name */
-   char		*argv1;		/* the args for the interpreter */
+   char		*argv0;		/* INPUT: the interpreter name */
+   char		*argv1;		/* INPUT: the args for the interpreter */
 
-   char		**argv;		/* the original argv */
+   char		**argv;		/* INPUT: the original argv */
 };
 
 int do_exec(const char *exe, struct exeinfo *info);
diff --git a/coregrind/vg_dispatch.S b/coregrind/vg_dispatch.S
index 225e1ff..997bf17 100644
--- a/coregrind/vg_dispatch.S
+++ b/coregrind/vg_dispatch.S
@@ -109,8 +109,8 @@
 	
 	   If %ebp has any other value, we panic.
 	*/
-	cmpl	$VG_(baseBlock), %ebp
-	jnz	dispatch_exceptional
+	/*cmpl	$VG_(baseBlock), %ebp*/
+	/*jnz	dispatch_exceptional*/
 	/* fall into main loop */
 
 
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index 120b18d..006de0f 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -1369,44 +1369,6 @@
    Exports of vg_main.c
    ------------------------------------------------------------------ */
 
-/* structure used for transporting values from stage2 into Valgrind
-   proper */
-typedef struct {
-   Addr	client_esp;		/* initial client ESP			*/
-   Addr client_eip;		/* initial client EIP			*/
-   Char **client_envp;		/* client envp				*/
-   UInt	*client_auxv;		/* client auxv				*/
-   Addr client_brkbase;		/* initial value of brk			*/
-
-   Int	argc;			/* Valgrind's argc/argv			*/
-   Char **argv;
-   const Char *libdir;		/* library directory                    */
-
-   Int  vgexecfd;		/* fd of our own (stage1) executable    */
-   Int  clexecfd;		/* fd of the client executable          */
-
-   Addr client_base;		/* start of client address space	*/
-   Addr	client_end;		/* end of client address space		*/
-   Addr client_mapbase;		/* base address of !MAP_FIXED mappings  */
-   Addr	clstk_base;		/* lowest address of client stack	*/
-   Addr	clstk_end;		/* highest address of client stack	*/
-   Addr cl_tramp_code;		/* syscall+signal trampoline code       */
-
-   Addr	shadow_base;		/* start of skin's shadow memory	*/
-   Addr shadow_end;		/* end of skin's shadow memory		*/
-
-   Addr	vg_base;		/* start of Valgrind's memory		*/
-   Addr vg_mmap_end;		/* end of Valgrind's mmap area		*/
-   Addr	vg_end;			/* end of Valgrind's memory		*/
-} KickstartParams;
-
-/* Entrypoint for kickstart */
-typedef void (kickstart_main_t)(const KickstartParams *kp, 
-				void (*tool_init)(void), void *tool_dlhandle);
-extern kickstart_main_t VG_(main);
-
-extern void VG_(usage)(void);
-
 /* Is this a SSE/SSE2-capable CPU?  If so, we had better save/restore
    the SSE state all over the place.  This is set up very early, in
    vg_startup.S.  We have to determine it early since we can't even
@@ -1446,13 +1408,15 @@
 extern const Char *VG_(libdir);
 
 /* A structure used as an intermediary when passing the simulated
-   CPU's state to some assembly fragments, particularly system calls.
-   Stuff is copied from baseBlock to here, the assembly magic runs,
-   and then the inverse copy is done.  Alignment: the SSE state must
-   be 16-byte aligned.  We ask for the whole struct to be 16-byte
-   aligned, and the SSE state starts at the 6+8+1+1th == 16th word,
-   so it too must be 16-byte aligned.  Consequence: change this struct
-   only _very carefully_ !  See also above comment re masking MXCSR. 
+   CPU's state to VG_(switch_to_real_CPU)(), for --stop-after=yes.
+   Stuff is copied from baseBlock to here, because it's much easier
+   to copy the state into the real registers from this structure than
+   the baseBlock, because its layout is simpler.
+   Alignment: the SSE state must be 16-byte aligned.  We ask for the whole
+   struct to be 16-byte aligned, and the SSE state starts at the 6+8+1+1th
+   == 16th word, so it too must be 16-byte aligned.  Consequence: change
+   this struct only _very carefully_ !  See also above comment re masking
+   MXCSR. 
 */
 __attribute__ ((aligned (16)))
 extern UInt VG_(m_state_static) [6 /* segment regs, Intel order */
@@ -1462,10 +1426,6 @@
                                  + VG_SIZE_OF_SSESTATE_W /* SSE state */
                                 ];
 
-/* Handy fns for doing the copy back and forth. */
-extern void VG_(copy_baseBlock_to_m_state_static) ( void );
-extern void VG_(copy_m_state_static_to_baseBlock) ( void );
-
 /* Determine if %esp adjustment must be noted */
 extern Bool VG_(need_to_handle_esp_assignment) ( void );
 
@@ -1484,19 +1444,11 @@
 extern Int    VG_(vg_argc);
 extern Char **VG_(vg_argv);
 
-/* Holds client's %esp at the point we gained control.  From this the
-   client's argc, argv and envp are deduced. */
-extern Addr   VG_(esp_at_startup);
-
 /* Indicates presence, and holds address of client's sysinfo page, a
    feature of some modern kernels used to provide vsyscalls, etc. */
 extern Bool VG_(sysinfo_page_exists);
 extern Addr VG_(sysinfo_page_addr);
 
-/* Walk through a colon separated list variable, removing entries
-   which match pattern. */
-extern void VG_(mash_colon_env)(Char *varp, const Char *pattern);
-
 /* Something of a function looking for a home ... start up GDB. */
 extern void VG_(start_GDB) ( Int tid );
 
@@ -1633,8 +1585,6 @@
 extern Bool     VG_(seg_contains)(const Segment *s, Addr ptr, UInt size);
 extern Bool     VG_(seg_overlaps)(const Segment *s, Addr ptr, UInt size);
 
-extern void VG_(init_memory)        ( void );
-
 extern __attribute__((regparm(1))) 
        void VG_(unknown_esp_update) ( Addr new_ESP );
 
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index 4dda086..11b80d4 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -1,7 +1,6 @@
 
 /*--------------------------------------------------------------------*/
-/*--- C startup stuff, reached from vg_startup.S.                  ---*/
-/*---                                                    vg_main.c ---*/
+/*--- Startup: the real stuff                            vg_main.c ---*/
 /*--------------------------------------------------------------------*/
 
 /*
@@ -29,121 +28,92 @@
    The GNU General Public License is contained in the file COPYING.
 */
 
-#include "vg_include.h"
+#define _FILE_OFFSET_BITS 64
 
+#include "vg_include.h"
+#include "ume.h"
+#include "ume_arch.h"
+#include "ume_archdefs.h"
+
+#include <dirent.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <sys/ptrace.h>
 #include <sys/signal.h>
 #include <sys/user.h>
 #include <sys/wait.h>
 #include <unistd.h>
 
+#ifndef AT_SYSINFO
+#define AT_SYSINFO		32
+#endif /* AT_SYSINFO */
+
+#ifndef AT_SYSINFO_EHDR
+#define AT_SYSINFO_EHDR		33
+#endif /* AT_SYSINFO_EHDR */
+
+#ifndef AT_SECURE
+#define AT_SECURE 23   /* secure mode boolean */
+#endif	/* AT_SECURE */
+
+/* Amount to reserve for Valgrind's internal heap */
+#define VALGRIND_HEAPSIZE	(128*1024*1024)
+
+/* Amount to reserve for Valgrind's internal mappings */
+#define VALGRIND_MAPSIZE	(128*1024*1024)
+
+/* redzone gap between client address space and shadow */
+#define REDZONE_SIZE		(1 * 1024*1024)
+
+/* size multiple for client address space */
+#define CLIENT_SIZE_MULTIPLE	(64 * 1024*1024)
+
+#define ISSPACE(cc)      ((cc) == ' ' || (cc) == '\t' || (cc) == '\n')  /* true for space, tab or newline; NB: cc may be evaluated up to three times */
+
+/*====================================================================*/
+/*=== Global entities not referenced from generated code           ===*/
+/*====================================================================*/
+
 /* ---------------------------------------------------------------------
-   Compute offsets into baseBlock.  See comments in vg_include.h.
+   Startup stuff                            
    ------------------------------------------------------------------ */
+/* linker-defined base address */
+extern char kickstart_base;	
 
-/* The variables storing offsets. */
-
-#define INVALID_OFFSET (-1)
-
-Int VGOFF_(m_eax) = INVALID_OFFSET;
-Int VGOFF_(m_ecx) = INVALID_OFFSET;
-Int VGOFF_(m_edx) = INVALID_OFFSET;
-Int VGOFF_(m_ebx) = INVALID_OFFSET;
-Int VGOFF_(m_esp) = INVALID_OFFSET;
-Int VGOFF_(m_ebp) = INVALID_OFFSET;
-Int VGOFF_(m_esi) = INVALID_OFFSET;
-Int VGOFF_(m_edi) = INVALID_OFFSET;
-Int VGOFF_(m_eflags) = INVALID_OFFSET;
-Int VGOFF_(m_dflag)  = INVALID_OFFSET;
-Int VGOFF_(m_ssestate) = INVALID_OFFSET;
-Int VGOFF_(ldt)   = INVALID_OFFSET;
-Int VGOFF_(tls)   = INVALID_OFFSET;
-Int VGOFF_(m_cs)  = INVALID_OFFSET;
-Int VGOFF_(m_ss)  = INVALID_OFFSET;
-Int VGOFF_(m_ds)  = INVALID_OFFSET;
-Int VGOFF_(m_es)  = INVALID_OFFSET;
-Int VGOFF_(m_fs)  = INVALID_OFFSET;
-Int VGOFF_(m_gs)  = INVALID_OFFSET;
-Int VGOFF_(m_eip) = INVALID_OFFSET;
-Int VGOFF_(spillslots) = INVALID_OFFSET;
-Int VGOFF_(sh_eax) = INVALID_OFFSET;
-Int VGOFF_(sh_ecx) = INVALID_OFFSET;
-Int VGOFF_(sh_edx) = INVALID_OFFSET;
-Int VGOFF_(sh_ebx) = INVALID_OFFSET;
-Int VGOFF_(sh_esp) = INVALID_OFFSET;
-Int VGOFF_(sh_ebp) = INVALID_OFFSET;
-Int VGOFF_(sh_esi) = INVALID_OFFSET;
-Int VGOFF_(sh_edi) = INVALID_OFFSET;
-Int VGOFF_(sh_eflags) = INVALID_OFFSET;
-
-Int VGOFF_(helper_idiv_64_32) = INVALID_OFFSET;
-Int VGOFF_(helper_div_64_32) = INVALID_OFFSET;
-Int VGOFF_(helper_idiv_32_16) = INVALID_OFFSET;
-Int VGOFF_(helper_div_32_16) = INVALID_OFFSET;
-Int VGOFF_(helper_idiv_16_8) = INVALID_OFFSET;
-Int VGOFF_(helper_div_16_8) = INVALID_OFFSET;
-Int VGOFF_(helper_imul_32_64) = INVALID_OFFSET;
-Int VGOFF_(helper_mul_32_64) = INVALID_OFFSET;
-Int VGOFF_(helper_imul_16_32) = INVALID_OFFSET;
-Int VGOFF_(helper_mul_16_32) = INVALID_OFFSET;
-Int VGOFF_(helper_imul_8_16) = INVALID_OFFSET;
-Int VGOFF_(helper_mul_8_16) = INVALID_OFFSET;
-Int VGOFF_(helper_CLD) = INVALID_OFFSET;
-Int VGOFF_(helper_STD) = INVALID_OFFSET;
-Int VGOFF_(helper_get_dirflag) = INVALID_OFFSET;
-Int VGOFF_(helper_CLC) = INVALID_OFFSET;
-Int VGOFF_(helper_STC) = INVALID_OFFSET;
-Int VGOFF_(helper_shldl) = INVALID_OFFSET;
-Int VGOFF_(helper_shldw) = INVALID_OFFSET;
-Int VGOFF_(helper_shrdl) = INVALID_OFFSET;
-Int VGOFF_(helper_shrdw) = INVALID_OFFSET;
-Int VGOFF_(helper_IN) = INVALID_OFFSET;
-Int VGOFF_(helper_OUT) = INVALID_OFFSET;
-Int VGOFF_(helper_RDTSC) = INVALID_OFFSET;
-Int VGOFF_(helper_CPUID) = INVALID_OFFSET;
-Int VGOFF_(helper_BSWAP) = INVALID_OFFSET;
-Int VGOFF_(helper_bsf) = INVALID_OFFSET;
-Int VGOFF_(helper_bsr) = INVALID_OFFSET;
-Int VGOFF_(helper_fstsw_AX) = INVALID_OFFSET;
-Int VGOFF_(helper_SAHF) = INVALID_OFFSET;
-Int VGOFF_(helper_LAHF) = INVALID_OFFSET;
-Int VGOFF_(helper_DAS) = INVALID_OFFSET;
-Int VGOFF_(helper_DAA) = INVALID_OFFSET;
-Int VGOFF_(helper_cmpxchg8b) = INVALID_OFFSET;
-Int VGOFF_(helper_undefined_instruction) = INVALID_OFFSET;
-
-/* MAX_NONCOMPACT_HELPERS can be increased easily.  If MAX_COMPACT_HELPERS is
- * increased too much, they won't really be compact any more... */
-#define  MAX_COMPACT_HELPERS     8
-#define  MAX_NONCOMPACT_HELPERS  50 
-
-UInt VG_(n_compact_helpers)    = 0;
-UInt VG_(n_noncompact_helpers) = 0;
-
-Addr VG_(compact_helper_addrs)  [MAX_COMPACT_HELPERS];
-Int  VG_(compact_helper_offsets)[MAX_COMPACT_HELPERS];
-Addr VG_(noncompact_helper_addrs)  [MAX_NONCOMPACT_HELPERS];
-Int  VG_(noncompact_helper_offsets)[MAX_NONCOMPACT_HELPERS];
-
-/* This is the actual defn of baseblock. */
-UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS];
-
-/* Client address space */
-Addr VG_(client_base);	/* client address space limits */
+/* Client address space, lowest to highest (see top of ume.c) */
+Addr VG_(client_base);           /* client address space limits */
 Addr VG_(client_end);
 Addr VG_(client_mapbase);
 Addr VG_(client_trampoline_code);
 Addr VG_(clstk_base);
 Addr VG_(clstk_end);
-Addr VG_(brk_base);	/* start of brk */
-Addr VG_(brk_limit);	/* current brk */
-Addr VG_(shadow_base);	/* skin's shadow memory */
+
+Addr VG_(brk_base);	         /* start of brk */
+Addr VG_(brk_limit);	         /* current brk */
+
+Addr VG_(shadow_base);	         /* skin's shadow memory */
 Addr VG_(shadow_end);
-Addr VG_(valgrind_base);	/* valgrind's address range */
-Addr VG_(valgrind_mmap_end);	/* valgrind's mmaps are between valgrind_base and here */
+
+Addr VG_(valgrind_base);	 /* valgrind's address range */
+Addr VG_(valgrind_mmap_end);	 /* valgrind's mmaps are between valgrind_base and here */
 Addr VG_(valgrind_end);
 
+/* This is set early to indicate whether this CPU has the
+   SSE/fxsave/fxrstor features.  */
+Bool VG_(have_ssestate);
+
+/* Indicates presence, and holds address of client's sysinfo page, a
+   feature of some modern kernels used to provide vsyscalls, etc. */
+Bool VG_(sysinfo_page_exists) = False;
+Addr VG_(sysinfo_page_addr) = 0;
+
 /* stage1 (main) executable */
 Int  VG_(vgexecfd) = -1;
 
@@ -166,314 +136,16 @@
 /* Maximum allowed application-visible file descriptor */
 Int VG_(max_fd) = -1;
 
-/* Words. */
-static Int baB_off = 0;
-
-/* jmp_buf for fatal signals */
-Int	VG_(fatal_sigNo) = -1;
-Bool	VG_(fatal_signal_set) = False;
-jmp_buf VG_(fatal_signal_jmpbuf);
-
-/* Returns the offset, in words. */
-static Int alloc_BaB ( Int words )
-{
-   Int off = baB_off;
-   baB_off += words;
-   if (baB_off >= VG_BASEBLOCK_WORDS)
-      VG_(core_panic)( "alloc_BaB: baseBlock is too small");
-
-   return off;   
-}
-
-/* Align offset, in *bytes* */
-static void align_BaB ( UInt align )
-{
-   vg_assert(2 == align || 4 == align || 8 == align || 16 == align);
-   baB_off +=  (align-1);
-   baB_off &= ~(align-1);
-}
-
-/* Allocate 1 word in baseBlock and set it to the given value. */
-static Int alloc_BaB_1_set ( Addr a )
-{
-   Int off = alloc_BaB(1);
-   VG_(baseBlock)[off] = (UInt)a;
-   return off;
-}
-
-/* Registers a function in compact_helper_addrs;  compact_helper_offsets is
-   filled in later. */
-void VG_(register_compact_helper)(Addr a)
-{
-   if (MAX_COMPACT_HELPERS <= VG_(n_compact_helpers)) {
-      VG_(printf)("Can only register %d compact helpers\n", 
-                  MAX_COMPACT_HELPERS);
-      VG_(core_panic)("Too many compact helpers registered");
-   }
-   VG_(compact_helper_addrs)[VG_(n_compact_helpers)] = a;
-   VG_(n_compact_helpers)++;
-}
-
-/* Registers a function in noncompact_helper_addrs;  noncompact_helper_offsets
- * is filled in later.
- */
-void VG_(register_noncompact_helper)(Addr a)
-{
-   if (MAX_NONCOMPACT_HELPERS <= VG_(n_noncompact_helpers)) {
-      VG_(printf)("Can only register %d non-compact helpers\n", 
-                  MAX_NONCOMPACT_HELPERS);
-      VG_(printf)("Try increasing MAX_NON_COMPACT_HELPERS\n");
-      VG_(core_panic)("Too many non-compact helpers registered");
-   }
-   VG_(noncompact_helper_addrs)[VG_(n_noncompact_helpers)] = a;
-   VG_(n_noncompact_helpers)++;
-}
-
-/* Allocate offsets in baseBlock for the skin helpers */
-static 
-void assign_helpers_in_baseBlock(UInt n, Int offsets[], Addr addrs[])
-{
-   UInt i;
-   for (i = 0; i < n; i++) 
-      offsets[i] = alloc_BaB_1_set( addrs[i] );
-}
-
-Bool VG_(need_to_handle_esp_assignment)(void)
-{
-   return ( VG_(defined_new_mem_stack_4)()  ||
-            VG_(defined_die_mem_stack_4)()  ||
-            VG_(defined_new_mem_stack_8)()  ||
-            VG_(defined_die_mem_stack_8)()  ||
-            VG_(defined_new_mem_stack_12)() ||
-            VG_(defined_die_mem_stack_12)() ||
-            VG_(defined_new_mem_stack_16)() ||
-            VG_(defined_die_mem_stack_16)() ||
-            VG_(defined_new_mem_stack_32)() ||
-            VG_(defined_die_mem_stack_32)() ||
-            VG_(defined_new_mem_stack)()    ||
-            VG_(defined_die_mem_stack)()
-          );
-}
-
-/* Here we assign actual offsets.  It's important to get the most
-   popular referents within 128 bytes of the start, so we can take
-   advantage of short addressing modes relative to %ebp.  Popularity
-   of offsets was measured on 22 Feb 02 running a KDE application, and
-   the slots rearranged accordingly, with a 1.5% reduction in total
-   size of translations. */
-static void vg_init_baseBlock ( void )
-{
-   /* Those with offsets under 128 are carefully chosen. */
-
-   /* WORD offsets in this column */
-   /* 0   */ VGOFF_(m_eax)     = alloc_BaB(1);
-   /* 1   */ VGOFF_(m_ecx)     = alloc_BaB(1);
-   /* 2   */ VGOFF_(m_edx)     = alloc_BaB(1);
-   /* 3   */ VGOFF_(m_ebx)     = alloc_BaB(1);
-   /* 4   */ VGOFF_(m_esp)     = alloc_BaB(1);
-   /* 5   */ VGOFF_(m_ebp)     = alloc_BaB(1);
-   /* 6   */ VGOFF_(m_esi)     = alloc_BaB(1);
-   /* 7   */ VGOFF_(m_edi)     = alloc_BaB(1);
-   /* 8   */ VGOFF_(m_eflags)  = alloc_BaB(1);
-
-   if (VG_(needs).shadow_regs) {
-      /* 9   */ VGOFF_(sh_eax)    = alloc_BaB(1);
-      /* 10  */ VGOFF_(sh_ecx)    = alloc_BaB(1);
-      /* 11  */ VGOFF_(sh_edx)    = alloc_BaB(1);
-      /* 12  */ VGOFF_(sh_ebx)    = alloc_BaB(1);
-      /* 13  */ VGOFF_(sh_esp)    = alloc_BaB(1);
-      /* 14  */ VGOFF_(sh_ebp)    = alloc_BaB(1);
-      /* 15  */ VGOFF_(sh_esi)    = alloc_BaB(1);
-      /* 16  */ VGOFF_(sh_edi)    = alloc_BaB(1);
-      /* 17  */ VGOFF_(sh_eflags) = alloc_BaB(1);
-   }
-
-   /* 9,10,11 or 18,19,20... depends on number whether shadow regs are used
-    * and on compact helpers registered */ 
-
-   /* Make these most-frequently-called specialised ones compact, if they
-      are used. */
-   if (VG_(defined_new_mem_stack_4)())
-      VG_(register_compact_helper)( (Addr) VG_(tool_interface).track_new_mem_stack_4);
-
-   if (VG_(defined_die_mem_stack_4)())
-      VG_(register_compact_helper)( (Addr) VG_(tool_interface).track_die_mem_stack_4);
-
-   /* (9 or 18) + n_compact_helpers  */
-   /* Allocate slots for compact helpers */
-   assign_helpers_in_baseBlock(VG_(n_compact_helpers), 
-                               VG_(compact_helper_offsets), 
-                               VG_(compact_helper_addrs));
-
-   /* (9/10 or 18/19) + n_compact_helpers */
-   VGOFF_(m_eip) = alloc_BaB(1);
-
-   /* There are currently 24 spill slots */
-   /* (11+/20+ .. 32+/43+) + n_compact_helpers.  This can overlap the magic
-    * boundary at >= 32 words, but most spills are to low numbered spill
-    * slots, so the ones above the boundary don't see much action. */
-   VGOFF_(spillslots) = alloc_BaB(VG_MAX_SPILLSLOTS);
-
-   /* I gave up counting at this point.  Since they're above the
-      short-amode-boundary, there's no point. */
-
-   VGOFF_(m_dflag) = alloc_BaB(1);
-
-   /* The FPU/SSE state.  This _must_ be 16-byte aligned. */
-   align_BaB(16);
-   VGOFF_(m_ssestate) = alloc_BaB(VG_SIZE_OF_SSESTATE_W);
-   vg_assert( 
-      (  ((UInt)(& VG_(baseBlock)[VGOFF_(m_ssestate)]))
-         % 16  )
-      == 0
-   );
-
-   /* This thread's LDT and TLS pointers, and segment registers. */
-   VGOFF_(ldt)   = alloc_BaB(1);
-   VGOFF_(tls)   = alloc_BaB(1);
-   VGOFF_(m_cs)  = alloc_BaB(1);
-   VGOFF_(m_ss)  = alloc_BaB(1);
-   VGOFF_(m_ds)  = alloc_BaB(1);
-   VGOFF_(m_es)  = alloc_BaB(1);
-   VGOFF_(m_fs)  = alloc_BaB(1);
-   VGOFF_(m_gs)  = alloc_BaB(1);
-
-   VG_(register_noncompact_helper)( (Addr) & VG_(do_useseg) );
-
-#define REG(kind, size) \
-   if (VG_(defined_##kind##_mem_stack##size)()) \
-      VG_(register_noncompact_helper)(           \
-          (Addr) VG_(tool_interface).track_##kind##_mem_stack##size );
-
-   REG(new, _8);
-   REG(new, _12);
-   REG(new, _16);
-   REG(new, _32);
-   REG(new, );
-   REG(die, _8);
-   REG(die, _12);
-   REG(die, _16);
-   REG(die, _32);
-   REG(die, );
-#undef REG
-
-   if (VG_(need_to_handle_esp_assignment)())
-      VG_(register_noncompact_helper)((Addr) VG_(unknown_esp_update));
-
-   /* Helper functions. */
-   VGOFF_(helper_idiv_64_32)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_idiv_64_32));
-   VGOFF_(helper_div_64_32)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_div_64_32));
-   VGOFF_(helper_idiv_32_16)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_idiv_32_16));
-   VGOFF_(helper_div_32_16)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_div_32_16));
-   VGOFF_(helper_idiv_16_8)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_idiv_16_8));
-   VGOFF_(helper_div_16_8)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_div_16_8));
-
-   VGOFF_(helper_imul_32_64)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_imul_32_64));
-   VGOFF_(helper_mul_32_64)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_mul_32_64));
-   VGOFF_(helper_imul_16_32)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_imul_16_32));
-   VGOFF_(helper_mul_16_32)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_mul_16_32));
-   VGOFF_(helper_imul_8_16)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_imul_8_16));
-   VGOFF_(helper_mul_8_16)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_mul_8_16));
-
-   VGOFF_(helper_CLD)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_CLD));
-   VGOFF_(helper_STD)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_STD));
-   VGOFF_(helper_get_dirflag)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_get_dirflag));
-
-   VGOFF_(helper_CLC)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_CLC));
-   VGOFF_(helper_STC)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_STC));
-
-   VGOFF_(helper_shldl)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_shldl));
-   VGOFF_(helper_shldw)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_shldw));
-   VGOFF_(helper_shrdl)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_shrdl));
-   VGOFF_(helper_shrdw)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_shrdw));
-
-   VGOFF_(helper_RDTSC)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_RDTSC));
-   VGOFF_(helper_CPUID)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_CPUID));
-
-   VGOFF_(helper_bsf)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_bsf));
-   VGOFF_(helper_bsr)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_bsr));
-
-   VGOFF_(helper_fstsw_AX)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_fstsw_AX));
-   VGOFF_(helper_SAHF)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_SAHF));
-   VGOFF_(helper_LAHF)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_LAHF));
-   VGOFF_(helper_DAS)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_DAS));
-   VGOFF_(helper_DAA)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_DAA));
-   VGOFF_(helper_IN)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_IN));
-   VGOFF_(helper_OUT)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_OUT));
-   VGOFF_(helper_cmpxchg8b)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_cmpxchg8b));
-
-   VGOFF_(helper_undefined_instruction)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_undefined_instruction));
-
-   /* Allocate slots for noncompact helpers */
-   assign_helpers_in_baseBlock(VG_(n_noncompact_helpers), 
-                               VG_(noncompact_helper_offsets), 
-                               VG_(noncompact_helper_addrs));
-
-
-   /* Initialise slots that require it */
-   VG_(copy_m_state_static_to_baseBlock)();
-
-   /* Pretend the root thread has a completely empty LDT to start with. */
-   VG_(baseBlock)[VGOFF_(ldt)] = (UInt)NULL;
-
-   /* Pretend the root thread has no TLS array for now. */
-   VG_(baseBlock)[VGOFF_(tls)] = (UInt)NULL;
-
-   /* Initialise shadow regs */
-   if (VG_(needs).shadow_regs) {
-      VG_(baseBlock)[VGOFF_(sh_esp)]    = 
-      VG_(baseBlock)[VGOFF_(sh_ebp)]    =
-      VG_(baseBlock)[VGOFF_(sh_eax)]    =
-      VG_(baseBlock)[VGOFF_(sh_ecx)]    =
-      VG_(baseBlock)[VGOFF_(sh_edx)]    =
-      VG_(baseBlock)[VGOFF_(sh_ebx)]    =
-      VG_(baseBlock)[VGOFF_(sh_esi)]    =
-      VG_(baseBlock)[VGOFF_(sh_edi)]    = 0;
-      VG_(baseBlock)[VGOFF_(sh_eflags)] = 0;
-      VG_TRACK( post_regs_write_init );
-   }
-}
-
+/* As deduced from esp_at_startup, the client's argc, argv[] and
+   envp[] as extracted from the client's stack at startup-time. */
+Int    VG_(client_argc);
+Char** VG_(client_argv);
+Char** VG_(client_envp);
 
 /* ---------------------------------------------------------------------
-   Global entities which are not referenced from generated code.
+   Running stuff                            
    ------------------------------------------------------------------ */
-
-/* Ditto our signal delivery stack. */
+/* Our signal delivery stack. */
 UInt VG_(sigstack)[VG_SIGSTACK_SIZE_W];
 
 /* Saving stuff across system calls. */
@@ -481,9 +153,13 @@
 UInt VG_(real_sse_state_saved_over_syscall)[VG_SIZE_OF_SSESTATE_W];
 Addr VG_(esp_saved_over_syscall);
 
-/* Counts downwards in vg_run_innerloop. */
-UInt VG_(dispatch_ctr);
+/* jmp_buf for fatal signals */
+Int	VG_(fatal_sigNo) = -1;
+Bool	VG_(fatal_signal_set) = False;
+jmp_buf VG_(fatal_signal_jmpbuf);
 
+/* Counts downwards in VG_(run_innerloop). */
+UInt VG_(dispatch_ctr);
 
 /* 64-bit counter for the number of basic blocks done. */
 ULong VG_(bbs_done);
@@ -493,23 +169,23 @@
 /* This is the ThreadId of the last thread the scheduler ran. */
 ThreadId VG_(last_run_tid) = 0;
 
+/* Tell the logging mechanism whether we are logging to a file
+   descriptor or a socket descriptor. */
+Bool VG_(logging_to_filedes) = True;
+
+/* This Bool is needed by wrappers in vg_clientmalloc.c to decide how
+   to behave.  Initially we say False. */
+Bool VG_(running_on_simd_CPU) = False;
+
 /* This is the argument to __NR_exit() supplied by the first thread to
    call that syscall.  We eventually pass that to __NR_exit() for
    real. */
 Int VG_(exitcode) = 0;
 
-/* Tell the logging mechanism whether we are logging to a file
-   descriptor or a socket descriptor. */
-Bool VG_(logging_to_filedes) = True;
 
-/* This is set early to inidicate whether this CPU has the
-   SSE/fxsave/fxrestor features.  */
-Bool VG_(have_ssestate);
-
-
-/* ---------------------------------------------------------------------
-   Counters, for informational purposes only.
-   ------------------------------------------------------------------ */
+/*====================================================================*/
+/*=== Counters, for profiling purposes only                        ===*/
+/*====================================================================*/
 
 /* Number of lookups which miss the fast tt helper. */
 UInt VG_(tt_fast_misses) = 0;
@@ -558,9 +234,1132 @@
 UInt VG_(num_scheduling_events_MAJOR) = 0;
 
 
-/* ---------------------------------------------------------------------
-   Values derived from command-line options.
-   ------------------------------------------------------------------ */
+static __inline__ Int safe_idiv(Int a, Int b)
+{  /* a / b, except that a zero divisor yields 0 rather than a trap */
+   return (b == 0 ? 0 : a / b);
+}
+/* Emit the profiling counters defined above as Vg_DebugMsg lines. */
+static void show_counts ( void )
+{
+   VG_(message)(Vg_DebugMsg,
+		"    TT/TC: %d tc sectors discarded.",
+                VG_(number_of_tc_discards) );
+   VG_(message)(Vg_DebugMsg,
+                "           %d chainings, %d unchainings.",
+                VG_(bb_enchain_count), VG_(bb_dechain_count) );
+   VG_(message)(Vg_DebugMsg,
+                "translate: new     %d (%d -> %d; ratio %d:10)",
+                VG_(overall_in_count),
+                VG_(overall_in_osize),
+                VG_(overall_in_tsize),
+                safe_idiv(10*VG_(overall_in_tsize), VG_(overall_in_osize)));
+   VG_(message)(Vg_DebugMsg,
+                "           discard %d (%d -> %d; ratio %d:10).",
+                VG_(overall_out_count),
+                VG_(overall_out_osize),
+                VG_(overall_out_tsize),
+                safe_idiv(10*VG_(overall_out_tsize), VG_(overall_out_osize)));
+   VG_(message)(Vg_DebugMsg,
+      " dispatch: %llu jumps (bb entries), of which %u (%lu%%) were unchained.",
+      VG_(bbs_done), 
+      VG_(unchained_jumps_done),
+      ((ULong)(100) * (ULong)(VG_(unchained_jumps_done)))
+         / ( VG_(bbs_done)==0 ? 1 : VG_(bbs_done) )
+   );
+   /* The divisor above is forced to 1 when no basic blocks have run. */
+   VG_(message)(Vg_DebugMsg,
+      "           %d/%d major/minor sched events.  %d tt_fast misses.", 
+                     VG_(num_scheduling_events_MAJOR), 
+                     VG_(num_scheduling_events_MINOR), 
+                     VG_(tt_fast_misses));
+
+   VG_(message)(Vg_DebugMsg, 
+                "reg-alloc: %d t-req-spill, "
+                "%d+%d orig+spill uis, %d total-reg-r.",
+                VG_(translations_needing_spill),
+                VG_(uinstrs_prealloc),
+                VG_(uinstrs_spill),
+                VG_(total_reg_rank) );
+   VG_(message)(Vg_DebugMsg, 
+                "   sanity: %d cheap, %d expensive checks.",
+                VG_(sanity_fast_count), 
+                VG_(sanity_slow_count) );
+   VG_(print_ccall_stats)();
+}
+
+
+/*====================================================================*/
+/*=== Miscellaneous global functions                               ===*/
+/*====================================================================*/
+
+/* Start GDB and get it to attach to a forked copy of this process that
+   has been given thread tid's simulated register state.  Called if the
+   user requests this service after an error has been shown, so she can
+   poke around and look at parameters, memory, etc.  The copy is killed
+   once GDB exits, so you can't meaningfully continue it from GDB.  */
+void VG_(start_GDB) ( Int tid )
+{
+   Int pid;
+
+   if ((pid = fork()) == 0) {
+      ptrace(PTRACE_TRACEME, 0, NULL, NULL);
+      VG_(kkill)(VG_(getpid)(), VKI_SIGSTOP);
+      /* Child: asked to be traced, then stopped itself for the parent. */
+   } else if (pid > 0) {
+      struct user_regs_struct regs;
+      Int status;
+      Int res;
+
+      if (VG_(is_running_thread)( tid )) {
+         regs.xcs = VG_(baseBlock)[VGOFF_(m_cs)];
+         regs.xss = VG_(baseBlock)[VGOFF_(m_ss)];
+         regs.xds = VG_(baseBlock)[VGOFF_(m_ds)];
+         regs.xes = VG_(baseBlock)[VGOFF_(m_es)];
+         regs.xfs = VG_(baseBlock)[VGOFF_(m_fs)];
+         regs.xgs = VG_(baseBlock)[VGOFF_(m_gs)];
+         regs.eax = VG_(baseBlock)[VGOFF_(m_eax)];
+         regs.ebx = VG_(baseBlock)[VGOFF_(m_ebx)];
+         regs.ecx = VG_(baseBlock)[VGOFF_(m_ecx)];
+         regs.edx = VG_(baseBlock)[VGOFF_(m_edx)];
+         regs.esi = VG_(baseBlock)[VGOFF_(m_esi)];
+         regs.edi = VG_(baseBlock)[VGOFF_(m_edi)];
+         regs.ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
+         regs.esp = VG_(baseBlock)[VGOFF_(m_esp)];
+         regs.eflags = VG_(baseBlock)[VGOFF_(m_eflags)];
+         regs.eip = VG_(baseBlock)[VGOFF_(m_eip)];
+      } else {
+         ThreadState* tst = & VG_(threads)[ tid ];
+         
+         regs.xcs = tst->m_cs;
+         regs.xss = tst->m_ss;
+         regs.xds = tst->m_ds;
+         regs.xes = tst->m_es;
+         regs.xfs = tst->m_fs;
+         regs.xgs = tst->m_gs;
+         regs.eax = tst->m_eax;
+         regs.ebx = tst->m_ebx;
+         regs.ecx = tst->m_ecx;
+         regs.edx = tst->m_edx;
+         regs.esi = tst->m_esi;
+         regs.edi = tst->m_edi;
+         regs.ebp = tst->m_ebp;
+         regs.esp = tst->m_esp;
+         regs.eflags = tst->m_eflags;
+         regs.eip = tst->m_eip;
+      }
+      /* Once the child has stopped: load regs into it, detach, run GDB. */
+      if ((res = VG_(waitpid)(pid, &status, 0)) == pid &&
+          WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP &&
+          ptrace(PTRACE_SETREGS, pid, NULL, &regs) == 0 &&
+          ptrace(PTRACE_DETACH, pid, NULL, SIGSTOP) == 0) {
+         UChar buf[VG_(strlen)(VG_(clo_GDB_path)) + 100];
+
+         VG_(sprintf)(buf, "%s -nw /proc/%d/fd/%d %d",
+                      VG_(clo_GDB_path), VG_(main_pid), VG_(clexecfd), pid);
+         VG_(message)(Vg_UserMsg, "starting GDB with cmd: %s", buf);
+         res = VG_(system)(buf);
+         if (res == 0) {      
+            VG_(message)(Vg_UserMsg, "");
+            VG_(message)(Vg_UserMsg, 
+                         "GDB has detached.  Valgrind regains control.  We continue.");
+         } else {
+            VG_(message)(Vg_UserMsg, "Apparently failed!");
+            VG_(message)(Vg_UserMsg, "");
+         }
+      }
+      /* Whatever happened above, dispose of the child copy. */
+      VG_(kkill)(pid, VKI_SIGKILL);
+      VG_(waitpid)(pid, &status, 0);
+   }
+}
+
+
+/* Tell the user which unimplemented feature (msg) the client needs,
+   print the scheduler status, and exit with status 1. */
+void VG_(unimplemented) ( Char* msg )
+{
+   VG_(message)(Vg_UserMsg, "");
+   VG_(message)(Vg_UserMsg, 
+      "Valgrind detected that your program requires");
+   VG_(message)(Vg_UserMsg, 
+      "the following unimplemented functionality:");
+   VG_(message)(Vg_UserMsg, "   %s", msg);
+   VG_(message)(Vg_UserMsg,
+      "This may be because the functionality is hard to implement,");
+   VG_(message)(Vg_UserMsg,
+      "or because no reasonable program would behave this way,");
+   VG_(message)(Vg_UserMsg,
+      "or because nobody has yet needed it.  In any case, let us know at");
+   VG_(message)(Vg_UserMsg,
+      "%s and/or try to work around the problem, if you can.", VG_BUGS_TO);
+   VG_(message)(Vg_UserMsg,
+      "");
+   VG_(message)(Vg_UserMsg,
+      "Valgrind has to exit now.  Sorry.  Bye!");
+   VG_(message)(Vg_UserMsg,
+      "");
+   VG_(pp_sched_status)();
+   VG_(exit)(1);
+}
+/* Return the %esp value currently held in VG_(baseBlock). */
+Addr VG_(get_stack_pointer) ( void )
+{
+   return VG_(baseBlock)[VGOFF_(m_esp)];
+}
+
+/* Debugging aid wrapping OINK(n); callable from assembly via the OYNK macro. */
+void VG_(oynk) ( Int n )
+{
+   OINK(n);
+}
+
+/* Record the PID and process group of the scheduler LWP in VG_(main_pid)
+   and VG_(main_pgrp); this is also called in any new children after fork. */
+static void newpid(ThreadId unused)
+{
+   /* The ThreadId argument is ignored. */
+   VG_(main_pid)  = VG_(getpid)();
+   VG_(main_pgrp) = VG_(getpgrp)();
+}
+
+/*====================================================================*/
+/*=== Check we were launched by stage 1                            ===*/
+/*====================================================================*/
+
+/* Look in our AUXV for AT_UME_PADFD and AT_UME_EXECFD; exit(127) if either is absent */
+static void scan_auxv(void)
+{
+   const struct ume_auxv *auxv = find_auxv((int *)ume_exec_esp);
+   int found = 0;   /* bit 0: AT_UME_PADFD seen, bit 1: AT_UME_EXECFD seen */
+
+   for (; auxv->a_type != AT_NULL; auxv++)
+      switch(auxv->a_type) {
+      case AT_UME_PADFD:
+	 as_setpadfd(auxv->u.a_val);
+	 found |= 1;
+	 break;
+
+      case AT_UME_EXECFD:
+	 VG_(vgexecfd) = auxv->u.a_val;
+	 found |= 2;
+	 break;
+      }
+   /* Both entries must have been seen; test the flags, not a constant. */
+   if ( found != (1|2) ) {
+      fprintf(stderr, "stage2 must be launched by stage1\n");
+      exit(127);
+   }
+}
+
+
+/*====================================================================*/
+/*=== Address space determination                                  ===*/
+/*====================================================================*/
+
+/* Set V's address-space bounds; pad client space so nothing lands there early */
+static void layout_client_space(Addr argc_addr)
+{
+   VG_(client_base)       = CLIENT_BASE;
+   VG_(valgrind_mmap_end) = (addr_t)&kickstart_base; /* end of V's mmaps */
+   VG_(valgrind_base)     = VG_(valgrind_mmap_end) - VALGRIND_MAPSIZE;
+   VG_(valgrind_end)      = ROUNDUP(argc_addr, 0x10000); /* stack */
+   /* Debug aid: change the 0 to 1 to print the layout just computed. */
+   if (0)
+      printf("client base:        %x\n"
+             "valgrind base--end: %x--%x (%x)\n"
+             "valgrind mmap end:  %x\n\n",
+             VG_(client_base),
+             VG_(valgrind_base), VG_(valgrind_end),
+             VG_(valgrind_end) - VG_(valgrind_base),
+             VG_(valgrind_mmap_end));
+   /* Pad from client_base up to valgrind_base (see as_pad for the details). */
+   as_pad((void *)VG_(client_base), (void *)VG_(valgrind_base));
+}
+
+static void layout_remaining_space(float ratio)
+{
+   /* Split the space below valgrind_base (less one redzone) between client
+    * and shadow memory in the proportion 1:ratio, client size rounded down. */
+   addr_t client_size = ROUNDDN((VG_(valgrind_base) - REDZONE_SIZE) / (1. + ratio), 
+                         CLIENT_SIZE_MULTIPLE);
+   addr_t shadow_size = PGROUNDUP(client_size * ratio);
+   /* NOTE(review): client_mapbase below omits client_base -- confirm that is 0. */
+   VG_(client_end)     = VG_(client_base) + client_size;
+   VG_(client_mapbase) = PGROUNDDN((client_size/4)*3); /* where !FIXED mmap goes */
+   VG_(client_trampoline_code) = VG_(client_end) - VKI_BYTES_PER_PAGE;
+
+   VG_(shadow_base) = VG_(client_end) + REDZONE_SIZE;
+   VG_(shadow_end)  = VG_(shadow_base) + shadow_size;
+   /* Debug aid: change the 0 to 1 to print the layout just computed. */
+   if (0)
+      printf("client base--end:   %x--%x (%x)\n"
+             "client mapbase:     %x\n"
+             "shadow base--end:   %x--%x (%x)\n\n",
+             VG_(client_base), VG_(client_end), client_size,
+             VG_(client_mapbase),
+             VG_(shadow_base), VG_(shadow_end), shadow_size);
+   /* NOTE(review): none of the mmap/munmap results below are checked. */
+   // Ban redzone
+   mmap((void *)VG_(client_end), REDZONE_SIZE, PROT_NONE,
+	MAP_FIXED|MAP_ANON|MAP_PRIVATE, -1, 0);
+
+   // Make client hole
+   munmap((void*)VG_(client_base), client_size);
+
+   // Map shadow memory.
+   // Initially all inaccessible, incrementally initialized as it is used
+   if (shadow_size != 0)
+      mmap((char *)VG_(shadow_base), shadow_size, PROT_NONE,
+         MAP_PRIVATE|MAP_ANON|MAP_FIXED, -1, 0);
+}
+
+/*====================================================================*/
+/*=== Command line setup                                           ===*/
+/*====================================================================*/
+
+/* Read <dir>/.valgrindrc into a malloc'd string, or NULL; never freed (like argv) */
+static char* get_file_clo(char* dir)
+{
+#  define FLEN 512
+   Int fd, n;
+   struct stat s1;
+   char* f_clo = NULL;
+   char filename[FLEN];
+   /* A NULL dir selects "/.valgrindrc", ie. the root directory. */
+   snprintf(filename, FLEN, "%s/.valgrindrc", ( NULL == dir ? "" : dir ) );
+   fd = VG_(open)(filename, 0, VKI_S_IRUSR);
+   if ( fd >= 0 ) {   /* 0 is a valid descriptor; only negatives mean failure */
+      if ( 0 == fstat(fd, &s1) ) {
+         f_clo = malloc(s1.st_size+1);
+         vg_assert(f_clo);
+         n = read(fd, f_clo, s1.st_size);
+         if (n == -1) n = 0;
+         f_clo[n] = '\0';
+      }
+      close(fd);
+   }
+   return f_clo;
+#  undef FLEN
+}
+/* Count the ISSPACE-separated words in s; a NULL s counts as zero. */
+static Int count_args(char* s)
+{
+   Int n = 0;
+   if (s) {
+      char* cp = s;
+      while (True) {
+         // We have alternating sequences: blanks, non-blanks, blanks...
+         // count the non-blanks sequences.
+         while ( ISSPACE(*cp) )         cp++;
+         if    ( !*cp )                 break;
+         n++;
+         while ( !ISSPACE(*cp) && *cp ) cp++;
+      }
+   }
+   return n;
+}
+
+/* Chop s into words in place, append all but "--" to 'to'; returns new end */
+static char** copy_args( char* s, char** to )
+{
+   if (s) {
+      char* cp = s;
+      while (True) {
+         // We have alternating sequences: blanks, non-blanks, blanks...
+         // copy the non-blanks sequences, and add terminating '\0'
+         while ( ISSPACE(*cp) )         cp++;
+         if    ( !*cp )                 break;
+         *to++ = cp;
+         while ( !ISSPACE(*cp) && *cp ) cp++;
+         if ( *cp ) *cp++ = '\0';            // terminate if necessary
+         if (VG_STREQ(to[-1], "--")) to--;   // undo any '--' arg
+      }
+   }
+   return to;
+}
+
+// Augment command line with arguments from environment and .valgrindrc
+// files; the new argc counts everything up to and including the added "--".
+static void augment_command_line(Int* vg_argc_inout, char*** vg_argv_inout)
+{
+   int    vg_argc = *vg_argc_inout;
+   char** vg_argv = *vg_argv_inout;
+
+   char*  env_clo = getenv(VALGRINDOPTS);
+   char*  f1_clo  = get_file_clo( getenv("HOME") );
+   char*  f2_clo  = get_file_clo(".");
+
+   /* copy any extra args from file or environment, if present */
+   if ( (env_clo && *env_clo) || (f1_clo && *f1_clo) || (f2_clo && *f2_clo) ) {
+      /* ' ' separated extra options */
+      char **from, **to;
+      int orig_arg_count = 0, env_arg_count, f1_arg_count, f2_arg_count;
+
+      /* All of argv (client args too) is copied below, so count all of it. */
+      while (vg_argv[orig_arg_count] != NULL) orig_arg_count++;
+      env_arg_count = count_args(env_clo);
+      f1_arg_count  = count_args(f1_clo);
+      f2_arg_count  = count_args(f2_clo);
+
+      if (0)
+	 printf("extra-argc=%d %d %d\n",
+		env_arg_count, f1_arg_count, f2_arg_count);
+
+      /* +2: +1 for null-termination, +1 for added '--' */
+      from    = vg_argv;
+      vg_argv = malloc( (orig_arg_count + env_arg_count + f1_arg_count 
+                          + f2_arg_count + 2) * sizeof(char *));
+      vg_assert(vg_argv);
+      to      = vg_argv;
+
+      *to++ = *from++;          /* copy argv[0] */
+
+      /* Copy extra args from env var and file, in the order: ~/.valgrindrc,
+       * $VALGRIND_OPTS, ./.valgrindrc -- more local options are put later
+       * to override less local ones. */
+      to = copy_args(f1_clo,  to);
+      to = copy_args(env_clo, to);
+      to = copy_args(f2_clo,  to);
+
+      /* copy original arguments, stopping at command or -- */
+      while (*from) {
+	 if (**from != '-')
+	    break;
+	 if (VG_STREQ(*from, "--")) {
+	    from++;		/* skip -- */
+	    break;
+	 }
+	 *to++ = *from++;
+      }
+
+      /* add -- */
+      *to++ = "--";
+      vg_argc = to - vg_argv;
+
+      /* copy rest of original command line, then NULL */
+      while (*from) *to++ = *from++;
+      *to = NULL;
+   }
+
+   *vg_argc_inout = vg_argc;
+   *vg_argv_inout = vg_argv;
+}
+/* Work out Valgrind's own args (vg_argc/vg_argv) and the client's (cl_argv). */
+static void get_command_line( int argc, char** argv,
+                              Int* vg_argc_out, Char*** vg_argv_out, 
+                                                char*** cl_argv_out )
+{
+   int    vg_argc;
+   char** vg_argv;
+   char** cl_argv;
+   char*  env_clo = getenv(VALGRINDCLO);
+
+   if (env_clo != NULL && *env_clo != '\0') {
+      char *cp;
+      char **cpp;
+
+      /* OK, we're getting all our arguments from the environment - the
+	 entire command line belongs to the client (including argv[0]) */
+      vg_argc = 1;		/* argv[0] */
+      for (cp = env_clo; *cp; cp++)
+	 if (*cp == '\01')
+	    vg_argc++;
+      /* Slots: "valgrind" + first field + one per separator + final NULL. */
+      vg_argv = malloc(sizeof(char *) * (vg_argc + 2));
+      vg_assert(vg_argv);
+      cpp = vg_argv;
+
+      *cpp++ = "valgrind";	/* nominal argv[0] */
+      *cpp++ = env_clo;
+
+      for (cp = env_clo; *cp; cp++) {
+	 if (*cp == '\01') {
+	    *cp = '\0';		/* chop it up in place */
+	    *cpp++ = cp + 1;	/* is "" after a trailing separator */
+	 }
+      }
+      *cpp = NULL;
+      cl_argv = argv;
+
+   } else {
+      /* Count the arguments on the command line. */
+      vg_argv = argv;
+
+      for (vg_argc = 1; vg_argc < argc; vg_argc++) {
+	 if (argv[vg_argc][0] != '-') /* exe name */
+	    break;
+	 if (VG_STREQ(argv[vg_argc], "--")) { /* dummy arg */
+	    vg_argc++;
+	    break;
+	 }
+      }
+      cl_argv = &argv[vg_argc];
+
+      /* Get extra args from VALGRIND_OPTS and .valgrindrc files.
+       * Note we don't do this if getting args from VALGRINDCLO. */
+      augment_command_line(&vg_argc, &vg_argv);
+   }
+
+   if (0) {
+      Int i;
+      for (i = 0; i < vg_argc; i++)
+         printf("vg_argv[%d]=\"%s\"\n", i, vg_argv[i]);
+   }
+
+   *vg_argc_out =         vg_argc;
+   *vg_argv_out = (Char**)vg_argv;
+   *cl_argv_out =         cl_argv;
+}
+
+
+/*====================================================================*/
+/*=== Environment and stack setup                                  ===*/
+/*====================================================================*/
+
+/* Scan a colon-separated list, and call a function on each element.
+   The string must be mutable, because we insert a temporary '\0', but
+   the string will end up unmodified.  (*func) should return nonzero if
+   it doesn't need to see any more.  A NULL or empty list is a no-op.
+*/
+static void scan_colsep(char *colsep, int (*func)(const char *))
+{
+   char *cp, *entry;
+   int end;
+
+   if (colsep == NULL ||
+       *colsep == '\0')
+      return;
+
+   entry = cp = colsep;
+
+   do {
+      end = (*cp == '\0');
+      /* An element ends at each ':' and at the terminating NUL. */
+      if (*cp == ':' || *cp == '\0') {
+	 char save = *cp;
+
+	 *cp = '\0';
+	 if ((*func)(entry))
+	    end = 1;
+	 *cp = save;
+	 entry = cp+1;
+      }
+      cp++;
+   } while(!end);
+}
+
+/* Prepare the client's environment.  This is basically a copy of our
+   environment, except:
+   1. LD_LIBRARY_PATH=$VALGRINDLIB:$LD_LIBRARY_PATH
+   2. LD_PRELOAD=$VALGRINDLIB/vg_inject.so:($VALGRINDLIB/vgpreload_TOOL.so:)?$LD_PRELOAD
+   3. any VALGRINDCLO=... entry is replaced by an empty string
+   If either of the first two is missing, then it is added.
+   'preload', when non-NULL, is spliced in after vg_inject.so.
+   Yummy.  String hacking in C.
+
+   If this needs to handle any more variables it should be hacked
+   into something table driven.
+ */
+static char **fix_environment(char **origenv, const char *preload)
+{
+   static const char inject_so[]          = "vg_inject.so";
+   static const char ld_library_path[]    = "LD_LIBRARY_PATH=";
+   static const char ld_preload[]         = "LD_PRELOAD=";
+   static const char valgrind_clo[]       = VALGRINDCLO "=";
+   static const int  ld_library_path_len  = sizeof(ld_library_path)-1;
+   static const int  ld_preload_len       = sizeof(ld_preload)-1;
+   static const int  valgrind_clo_len     = sizeof(valgrind_clo)-1;
+   int ld_preload_done       = 0;
+   int ld_library_path_done  = 0;
+   char *inject_path;
+   int   inject_path_len;
+   int vgliblen = strlen(VG_(libdir));
+   char **cpp;
+   char **ret;
+   int envc;
+   const int preloadlen = (preload == NULL) ? 0 : strlen(preload);
+
+   /* Find the vg_inject.so; also make room for the tool preload
+      library */
+   inject_path_len = sizeof(inject_so) + vgliblen + preloadlen + 16;
+   inject_path = malloc(inject_path_len);
+
+   if (preload)
+      snprintf(inject_path, inject_path_len, "%s/%s:%s", 
+	       VG_(libdir), inject_so, preload);
+   else
+      snprintf(inject_path, inject_path_len, "%s/%s", 
+	       VG_(libdir), inject_so);
+   
+   /* Count the original size of the env */
+   envc = 0;			/* entries, not counting the trailing NULL */
+   for (cpp = origenv; cpp && *cpp; cpp++)
+      envc++;
+
+   /* Allocate a new space */
+   ret = malloc(sizeof(char *) * (envc+3+1)); /* room for 3 new + NULL; at most 2 are added */
+
+   /* copy it over.  NOTE(review): unlike the count above, assumes origenv != NULL */
+   for (cpp = ret; *origenv; )
+      *cpp++ = *origenv++;
+   *cpp = NULL;
+   
+   vg_assert(envc == (cpp - ret));
+
+   /* Walk over the new environment, mashing as we go */
+   for (cpp = ret; cpp && *cpp; cpp++) {
+      if (memcmp(*cpp, ld_library_path, ld_library_path_len) == 0) {
+	 int done = 0;
+	 int contains(const char *p) {
+	    if (VG_STREQ(p, VG_(libdir))) {
+	       done = 1;
+	       return 1;
+	    }
+	    return 0;
+	 }
+
+	 /* If the LD_LIBRARY_PATH already contains libdir, then don't
+	    bother adding it again, even if it isn't the first (it
+	    seems that the Java runtime will keep reexecing itself
+	    unless its paths are at the front of LD_LIBRARY_PATH) */
+	 scan_colsep(*cpp + ld_library_path_len, contains);
+
+	 if (!done) {
+	    int len = strlen(*cpp) + vgliblen*2 + 16;
+	    char *cp = malloc(len);
+
+	    snprintf(cp, len, "%s%s:%s",
+		     ld_library_path, VG_(libdir),
+		     (*cpp)+ld_library_path_len);
+
+	    *cpp = cp;
+	 }
+
+	 ld_library_path_done = 1;
+      } else if (memcmp(*cpp, ld_preload, ld_preload_len) == 0) {
+	 int len = strlen(*cpp) + inject_path_len;
+	 char *cp = malloc(len);
+
+	 snprintf(cp, len, "%s%s:%s",
+		  ld_preload, inject_path, (*cpp)+ld_preload_len);
+
+	 *cpp = cp;
+	 
+	 ld_preload_done = 1;
+      } else if (memcmp(*cpp, valgrind_clo, valgrind_clo_len) == 0) {
+	 *cpp = "";
+      }
+   }
+
+   /* Add the missing bits */
+
+   if (!ld_library_path_done) {
+      int len = ld_library_path_len + vgliblen*2 + 16;
+      char *cp = malloc(len);
+
+      snprintf(cp, len, "%s%s", ld_library_path, VG_(libdir));
+
+      ret[envc++] = cp;
+   }
+
+   if (!ld_preload_done) {
+      int len = ld_preload_len + inject_path_len;
+      char *cp = malloc(len);
+      
+      snprintf(cp, len, "%s%s",
+	       ld_preload, inject_path);
+      
+      ret[envc++] = cp;
+   }
+
+   ret[envc] = NULL;
+
+   return ret;
+}
+
+extern char **environ;		/* our environment */
+//#include <error.h>
+
+/* Copy str (with its NUL) to *tab and advance *tab past it; returns the copy */
+static char *copy_str(char **tab, const char *str)
+{
+   char *cp = *tab;
+   char *orig = cp;
+
+   while(*str)
+      *cp++ = *str++;
+   *cp++ = '\0';
+
+   if (0)
+      printf("copied %p \"%s\" len %d\n",
+	     orig, orig, cp-orig);
+
+   *tab = cp;
+
+   return orig;
+}
+
+/* 
+   This sets up the client's initial stack, containing the args,
+   environment and aux vector.
+
+   The format of the stack is:
+
+   higher address +-----------------+
+		  | Trampoline code |
+		  +-----------------+
+                  |                 |
+		  : string table    :
+		  |                 |
+		  +-----------------+
+		  | AT_NULL         |
+		  -                 -
+		  | auxv            |
+		  +-----------------+
+		  |  NULL           |
+		  -                 -
+		  | envp            |
+		  +-----------------+
+		  |  NULL           |
+		  -                 -
+		  | argv            |
+		  +-----------------+
+		  | argc            |
+   lower address  +-----------------+ <- esp
+                  | undefined       |
+		  :                 :
+ */
+static Addr setup_client_stack(char **orig_argv, char **orig_envp, 
+			       const struct exeinfo *info,
+                               UInt** client_auxv)
+{
+   char **cpp;
+   char *strtab;		/* string table */
+   char *stringbase;
+   addr_t *ptr;
+   struct ume_auxv *auxv;
+   const struct ume_auxv *orig_auxv;
+   const struct ume_auxv *cauxv;
+   unsigned stringsize;		/* total size of strings in bytes */
+   unsigned auxsize;		/* total size of auxv in bytes */
+   int argc;			/* total argc */
+   int envc;			/* total number of env vars */
+   unsigned stacksize;		/* total client stack size */
+   addr_t cl_esp;		/* client stack base (initial esp); also the return value */
+
+   /* use our own auxv as a prototype */
+   orig_auxv = find_auxv(ume_exec_esp);
+
+   /* ==================== compute sizes ==================== */
+
+   /* first of all, work out how big the client stack will be */
+   stringsize = 0;
+
+   /* paste on the extra args if the loader needs them (ie, the #! 
+      interpreter and its argument) */
+   argc = 0;
+   if (info->argv0 != NULL) {
+      argc++;
+      stringsize += strlen(info->argv0) + 1;
+   }
+   if (info->argv1 != NULL) {
+      argc++;
+      stringsize += strlen(info->argv1) + 1;
+   }
+
+   /* now scan the args we're given... */
+   for (cpp = orig_argv; *cpp; cpp++) {
+      argc++;
+      stringsize += strlen(*cpp) + 1;
+   }
+   
+   /* ...and the environment */
+   envc = 0;
+   for (cpp = orig_envp; cpp && *cpp; cpp++) {
+      envc++;
+      stringsize += strlen(*cpp) + 1;
+   }
+
+   /* now, how big is the auxv? */
+   auxsize = sizeof(*auxv);	/* there's always at least one entry: AT_NULL */
+   for (cauxv = orig_auxv; cauxv->a_type != AT_NULL; cauxv++) {
+      if (cauxv->a_type == AT_PLATFORM)
+	 stringsize += strlen(cauxv->u.a_ptr) + 1;
+      auxsize += sizeof(*cauxv);
+   }
+
+   /* OK, now we know how big the client stack is */
+   stacksize =
+      sizeof(int) +			/* argc */
+      sizeof(char **)*argc +		/* argv */
+      sizeof(char **) +			/* terminal NULL */
+      sizeof(char **)*envc +		/* envp */
+      sizeof(char **) +			/* terminal NULL */
+      auxsize +				/* auxv */
+      ROUNDUP(stringsize, sizeof(int)) +/* strings (aligned) */
+      VKI_BYTES_PER_PAGE;		/* page for trampoline code */
+
+   /* cl_esp is the client's stack pointer */
+   cl_esp = VG_(client_end) - stacksize;
+   cl_esp = ROUNDDN(cl_esp, 16); /* make stack 16 byte aligned */
+
+   if (0)
+      printf("stringsize=%d auxsize=%d stacksize=%d\n",
+	     stringsize, auxsize, stacksize);
+
+
+   /* base of the string table (aligned) */
+   stringbase = strtab = (char *)(VG_(client_trampoline_code) - ROUNDUP(stringsize, sizeof(int)));
+
+   VG_(clstk_base) = PGROUNDDN(cl_esp);
+   VG_(clstk_end)  = VG_(client_end);
+
+   /* ==================== allocate space ==================== */
+
+   /* allocate a stack - mmap enough space for the stack */
+   mmap((void *)PGROUNDDN(cl_esp),
+	VG_(client_end) - PGROUNDDN(cl_esp),
+	PROT_READ | PROT_WRITE | PROT_EXEC, 
+	MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+   
+   /* NOTE(review): the result of the mmap above is not checked. */
+   /* ==================== copy client stack ==================== */
+
+   ptr = (addr_t *)cl_esp;
+
+   /* --- argc --- */
+   *ptr++ = argc;		/* client argc */
+
+   /* --- argv --- (NOTE(review): info->argv0/argv1 are left dangling once freed) */
+   if (info->argv0) {
+      *ptr++ = (addr_t)copy_str(&strtab, info->argv0);
+      free(info->argv0);
+   }
+   if (info->argv1) {
+      *ptr++ = (addr_t)copy_str(&strtab, info->argv1);
+      free(info->argv1);
+   }
+   for (cpp = orig_argv; *cpp; ptr++, cpp++) {
+      *ptr = (addr_t)copy_str(&strtab, *cpp);
+   }
+   *ptr++ = 0;
+
+   /* --- envp --- */
+   VG_(client_envp) = (Char **)ptr;
+   for (cpp = orig_envp; cpp && *cpp; ptr++, cpp++)
+      *ptr = (addr_t)copy_str(&strtab, *cpp);
+   *ptr++ = 0;
+
+   /* --- auxv --- */
+   auxv = (struct ume_auxv *)ptr;
+   *client_auxv = (UInt *)auxv;
+
+   for (; orig_auxv->a_type != AT_NULL; auxv++, orig_auxv++) {
+      /* copy the entry... */
+      *auxv = *orig_auxv;
+
+      /* ...and fix up the copy */
+      switch(auxv->a_type) {
+      case AT_PHDR:
+	 if (info->phdr == 0)
+	    auxv->a_type = AT_IGNORE;
+	 else
+	    auxv->u.a_val = info->phdr;
+	 break;
+
+      case AT_PHNUM:
+	 if (info->phdr == 0)
+	    auxv->a_type = AT_IGNORE;
+	 else
+	    auxv->u.a_val = info->phnum;
+	 break;
+
+      case AT_BASE:
+	 if (info->interp_base == 0)
+	    auxv->a_type = AT_IGNORE;
+	 else
+	    auxv->u.a_val = info->interp_base;
+	 break;
+
+      case AT_PLATFORM:		/* points to a platform description string */
+	 auxv->u.a_ptr = copy_str(&strtab, orig_auxv->u.a_ptr);
+	 break;
+
+      case AT_ENTRY:
+	 auxv->u.a_val = info->entry;
+	 break;
+
+      case AT_IGNORE:
+      case AT_EXECFD:
+      case AT_PHENT:
+      case AT_PAGESZ:
+      case AT_FLAGS:
+      case AT_NOTELF:
+      case AT_UID:
+      case AT_EUID:
+      case AT_GID:
+      case AT_EGID:
+      case AT_CLKTCK:
+      case AT_HWCAP:
+      case AT_FPUCW:
+      case AT_DCACHEBSIZE:
+      case AT_ICACHEBSIZE:
+      case AT_UCACHEBSIZE:
+	 /* All these are pointerless, so we don't need to do anything
+	    about them. */
+	 break;
+
+      case AT_SECURE:
+	 /* If this is 1, then it means that this program is running
+	    suid, and therefore the dynamic linker should be careful
+	    about LD_PRELOAD, etc.  However, since stage1 (the thing
+	    the kernel actually execve's) should never be SUID, and we
+	    need LD_PRELOAD/LD_LIBRARY_PATH to work for the client, we
+	    set AT_SECURE to 0. */
+	 auxv->u.a_val = 0;
+	 break;
+
+      case AT_SYSINFO:
+	 /* Leave this unmolested for now, but we'll update it later
+	    when we set up the client trampoline code page */
+	 break;
+
+      case AT_SYSINFO_EHDR:
+	 /* Trash this, because we don't reproduce it */
+	 auxv->a_type = AT_IGNORE;
+	 break;
+
+      default:
+	 /* stomp out anything we don't know about */
+	 if (0)
+	    printf("stomping auxv entry %d\n", auxv->a_type);
+	 auxv->a_type = AT_IGNORE;
+	 break;
+	 
+      }
+   }
+   *auxv = *orig_auxv;
+   vg_assert(auxv->a_type == AT_NULL);
+   /* Every byte budgeted for strings above must have been used exactly. */
+   vg_assert((strtab-stringbase) == stringsize);
+
+   return cl_esp;
+}
+
+/*====================================================================*/
+/*=== Find executable                                              ===*/
+/*====================================================================*/
+
+/* Resolve exec via $PATH when it has no '/'; returns exec itself if no hit. */
+static const char* find_executable(const char* exec)
+{
+   vg_assert(NULL != exec);
+   if (strchr(exec, '/') == NULL) {
+      /* no '/' - we need to search the path */
+      char *path = getenv("PATH");
+
+      int match_exe(const char *entry) {
+         /* room for "entry/exec" and its NUL; an empty entry becomes "." */
+         char buf[strlen(entry) + strlen(exec) + 3];
+
+         /* empty PATH element means . */
+         if (*entry == '\0')
+            entry = ".";
+
+         snprintf(buf, sizeof(buf), "%s/%s", entry, exec);
+         if (access(buf, R_OK|X_OK) == 0) {
+            exec = strdup(buf);
+            vg_assert(NULL != exec);
+            return 1;
+         }
+         return 0;
+      }
+      scan_colsep(path, match_exe);
+   }
+   return exec;
+}
+
+
+/*====================================================================*/
+/*=== Loading tools                                                ===*/
+/*====================================================================*/
+/* Print the names of the vgskin_*.so tools found in VG_(libdir). */
+static void list_tools(void)
+{
+   DIR *dir = opendir(VG_(libdir));
+   struct dirent *de;
+   int first = 1;
+
+   if (dir == NULL) {
+      fprintf(stderr, "Can't open %s: %s (installation problem?)\n",
+	      VG_(libdir), strerror(errno));
+      return;
+   }
+
+   while((de = readdir(dir)) != NULL) {
+      int len = strlen(de->d_name);
+
+      /* look for vgskin_TOOL.so names */
+      if (len > (7+1+3) &&   /* "vgskin_" + at least 1-char toolname + ".so" */
+	  strncmp(de->d_name, "vgskin_", 7) == 0 &&
+	  VG_STREQ(de->d_name + len - 3, ".so")) {
+	 if (first) {
+	    printf("Available tools:\n");
+	    first = 0;
+	 }
+	 de->d_name[len-3] = '\0';
+	 printf("\t%s\n", de->d_name+7);
+      }
+   }
+
+   closedir(dir);
+   /* 'first' is still set only if no tool name was printed above. */
+   if (first)
+      printf("No tools available in \"%s\" (installation problem?)\n",
+	     VG_(libdir));
+}
+
+
+/* Find and load a tool, and check it looks ok; if there's a matching vgpreload_*.so
+ * return its name in *preloadpath_out.  On failure: list the tools and exit(127). */
+static void load_tool( const char *toolname, void** handle_out,
+                       ToolInfo** toolinfo_out, char **preloadpath_out )
+{
+   Bool      ok;
+   int       len = strlen(VG_(libdir)) + strlen(toolname)*2 + 16;
+   char      buf[len];
+   void*     handle;
+   ToolInfo* toolinfo;
+   char*     preloadpath = NULL;
+   Int*      vg_malloc_redzonep;
+
+   // XXX: allowing full paths for --tool option -- does it make sense?
+   // Doesn't allow for vgpreload_<tool>.so.
+
+   if (strchr(toolname, '/') != 0) {
+      /* toolname contains '/', and so must be a pathname */
+      handle = dlopen(toolname, RTLD_NOW);
+   } else {
+      /* just try in the libdir */
+      snprintf(buf, len, "%s/vgskin_%s.so", VG_(libdir), toolname);
+      handle = dlopen(buf, RTLD_NOW);
+
+      if (handle != NULL) {
+	 snprintf(buf, len, "%s/vgpreload_%s.so", VG_(libdir), toolname);
+	 if (access(buf, R_OK) == 0) {
+	    preloadpath = strdup(buf);
+            vg_assert(NULL != preloadpath);
+         }
+      }
+   }
+
+   ok = (NULL != handle);
+   if (!ok) {
+      fprintf(stderr, "Can't open tool \"%s\": %s\n", toolname, dlerror());
+      goto bad_load;
+   }
+
+   toolinfo = dlsym(handle, "vgSkin_tool_info");
+   ok = (NULL != toolinfo);
+   if (!ok) {
+      fprintf(stderr, "Tool \"%s\" doesn't define SK_(tool_info) - "
+                      "add VG_DETERMINE_INTERFACE_VERSION?\n", toolname);
+      goto bad_load;
+   }
+   /* Interface check: struct size, major version, and a pre_clo_init hook. */
+   ok = (toolinfo->sizeof_ToolInfo == sizeof(*toolinfo) &&
+     toolinfo->interface_major_version == VG_CORE_INTERFACE_MAJOR_VERSION &&
+     toolinfo->sk_pre_clo_init != NULL);
+   if (!ok) { 
+      fprintf(stderr, "Error:\n"
+              "  Tool and core interface versions do not match.\n"
+              "  Interface version used by core is: %d.%d (size %d)\n"
+              "  Interface version used by tool is: %d.%d (size %d)\n"
+              "  The major version numbers must match.\n",
+              VG_CORE_INTERFACE_MAJOR_VERSION, 
+              VG_CORE_INTERFACE_MINOR_VERSION,
+              sizeof(*toolinfo),
+              toolinfo->interface_major_version,
+              toolinfo->interface_minor_version, 
+              toolinfo->sizeof_ToolInfo);
+      fprintf(stderr, "  You need to at least recompile, and possibly update,\n");
+      if (VG_CORE_INTERFACE_MAJOR_VERSION > toolinfo->interface_major_version)
+         fprintf(stderr, "  your skin to work with this version of Valgrind.\n");
+      else
+         fprintf(stderr, "  your version of Valgrind to work with this skin.\n");
+      goto bad_load;
+   }
+
+   // Set redzone size for V's allocator
+   vg_malloc_redzonep = dlsym(handle, STR(VG_(vg_malloc_redzone_szB)));
+   if ( NULL != vg_malloc_redzonep ) {
+      VG_(vg_malloc_redzone_szB) = *vg_malloc_redzonep;
+   }
+
+   vg_assert(NULL != handle && NULL != toolinfo);
+   *handle_out      = handle;
+   *toolinfo_out    = toolinfo;
+   *preloadpath_out = preloadpath;
+   return;
+
+   /* Common failure exit: none of the out-parameters is written. */
+ bad_load:
+   if (handle != NULL)
+      dlclose(handle);
+
+   fprintf(stderr, "Aborting: couldn't load tool\n");
+   list_tools();
+   exit(127);
+}
+
+/*====================================================================*/
+/*=== Loading the client                                           ===*/
+/*====================================================================*/
+/* Choose the client executable and, unless help is needed, load it via do_exec. */
+static void load_client(char* cl_argv[], const char* exec,    
+                 /*inout*/Bool* need_help,
+                 /*out*/struct exeinfo* info, /*out*/Addr* client_eip)
+{
+   // If they didn't specify an executable with --exec, and didn't specify 
+   // --help, then use client argv[0] (searching $PATH if necessary).
+   if (NULL == exec && !*need_help) {
+      if (cl_argv[0] == NULL || 
+          ( NULL == (exec = find_executable(cl_argv[0])) ) )
+      {
+         *need_help = True;
+      }
+   }
+
+   info->map_base = VG_(client_mapbase);
+   info->setbrk   = False;
+
+   info->exe_base = VG_(client_base);
+   info->exe_end  = VG_(client_end);
+   info->argv     = cl_argv;
+
+   if (*need_help) {
+      VG_(clexecfd) = -1;
+      info->argv0 = NULL;
+      info->argv1 = NULL;
+   } else {
+      Int ret;
+      VG_(clexecfd) = VG_(open)(exec, O_RDONLY, VKI_S_IRUSR);
+      ret = do_exec(exec, info);
+      if (ret != 0) {
+         fprintf(stderr, "do_exec(%s) failed: %s\n", exec, strerror(ret));
+         exit(127);
+      }
+   }
+   /* NOTE(review): with *need_help set, do_exec never ran; these fields are the caller's. */
+   /* Copy necessary bits of 'info' that were filled in */
+   *client_eip = info->init_eip;
+   VG_(brk_base) = VG_(brk_limit) = info->brkbase;
+}
+
+
+/*====================================================================*/
+/*=== Command-line: variables, processing                          ===*/
+/*====================================================================*/
 
 /* Define, and set defaults. */
 Bool   VG_(clo_error_limit)    = True;
@@ -613,27 +1412,6 @@
 Bool   VG_(clo_lowlat_syscalls) = False; /* low-latency syscalls */
 Bool   VG_(clo_lowlat_signals)  = False; /* low-latency signals */
 
-/* This Bool is needed by wrappers in vg_clientmalloc.c to decide how
-   to behave.  Initially we say False. */
-Bool VG_(running_on_simd_CPU) = False;
-
-/* Holds client's %esp at the point we gained control. */
-Addr VG_(esp_at_startup);
-
-/* Indicates presence, and holds address of client's sysinfo page, a
-   feature of some modern kernels used to provide vsyscalls, etc. */
-Bool VG_(sysinfo_page_exists) = False;
-Addr VG_(sysinfo_page_addr) = 0;
-
-/* As deduced from VG_(esp_at_startup), the client's argc, argv[] and
-   envp[] as extracted from the client's stack at startup-time. */
-Int    VG_(client_argc);
-Char** VG_(client_argv);
-Char** VG_(client_envp);
-
-/* ---------------------------------------------------------------------
-   Processing of command-line options.
-   ------------------------------------------------------------------ */
 
 void VG_(bad_option) ( Char* opt )
 {
@@ -656,13 +1434,13 @@
    VG_(exit)(1);
 }
 
-void VG_(usage) ( void )
+void usage ( void )
 {
    Char* usage1 = 
-"usage: valgrind [options] prog-and-args\n"
+"usage: valgrind --tool=<toolname> [options] prog-and-args\n"
 "\n"
 "  common user options for all Valgrind tools, with defaults in [ ]:\n"
-"    --tool=<name>             Use the Valgrind tool named <name> [memcheck]\n"
+"    --tool=<name>             Use the Valgrind tool named <name>\n"
 "    --help                    show this message\n"
 "    --version                 show version\n"
 "    -q --quiet                run silently; only print error msgs\n"
@@ -725,7 +1503,7 @@
 
    Char* usage3 =
 "\n"
-"  Extra options are read from env variable $VALGRIND_OPTS\n"
+"  Extra options read from ~/.valgrindrc, $VALGRIND_OPTS, ./.valgrindrc\n"
 "\n"
 "  Valgrind is Copyright (C) 2000-2004 Julian Seward\n"
 "  and licensed under the GNU General Public License, version 2.\n"
@@ -762,15 +1540,45 @@
    VG_(exit)(1);
 }
 
-
-static void process_cmd_line_options ( const KickstartParams *kp )
+static void pre_process_cmd_line_options
+      ( Bool* need_help, const char** tool, const char** exec )
 {
-   Int argc;
-   Char **argv;
-   Int   i, eventually_logfile_fd;
-   Int	*auxp;
+   UInt i;
 
-#  define ISSPACE(cc)      ((cc) == ' ' || (cc) == '\t' || (cc) == '\n')
+   /* First pass over VG_(vg_argv): pick out only --version, --help, --tool=/--skin= and --exec= */
+   for (i = 1; i < VG_(vg_argc); i++) {
+
+      if (strcmp(VG_(vg_argv)[i], "--version") == 0) {
+         printf("valgrind-" VERSION "\n");
+         exit(1);
+
+      } else if (strcmp(VG_(vg_argv)[i], "--help") == 0) {
+         *need_help = True;
+
+      } else if (strncmp(VG_(vg_argv)[i], "--tool=", 7) == 0 ||
+	         strncmp(VG_(vg_argv)[i], "--skin=", 7) == 0) {
+	 *tool = &VG_(vg_argv)[i][7];
+	 
+      } else if (strncmp(VG_(vg_argv)[i], "--exec=", 7) == 0) {
+	 *exec = &VG_(vg_argv)[i][7];
+      }
+   }
+
+   /* If no tool specified, can give usage message without loading tool; list the tools unless --help was given */
+   if (*tool == NULL) {
+      if (!*need_help)   /* need_help is a pointer: test the flag it points to, not the address */
+	 list_tools();
+      usage();
+   }
+}
+
+static void process_cmd_line_options 
+      ( UInt* client_auxv, Addr esp_at_startup, 
+        const char* toolname, Bool need_help )
+{
+   Int  i, eventually_logfile_fd;
+   Int *auxp;
+   Int  toolname_len = VG_(strlen)(toolname);
 
    /* log to stderr by default, but usage message goes to stdout */
    eventually_logfile_fd = 2; 
@@ -784,7 +1592,7 @@
      config_error("Please use absolute paths in "
                   "./configure --prefix=... or --libdir=...");
 
-   for(auxp = kp->client_auxv; auxp[0] != VKI_AT_NULL; auxp += 2) {
+   for (auxp = client_auxv; auxp[0] != VKI_AT_NULL; auxp += 2) {
       switch(auxp[0]) {
       case VKI_AT_SYSINFO:
 	 VG_(sysinfo_page_exists) = True;
@@ -794,149 +1602,168 @@
       }
    } 
 
-   VG_(client_envp) = kp->client_envp;
-
-   argc = kp->argc;
-   argv = kp->argv;
-
-   VG_(vg_argc) = argc;
-   VG_(vg_argv) = argv;
+   if (need_help)
+      usage();
 
    /* We know the initial ESP is pointing at argc/argv */
-   VG_(client_argc) = *(Int *)kp->client_esp;
-   VG_(client_argv) = (Char **)(kp->client_esp + sizeof(Int));
+   VG_(client_argc) = *(Int *)esp_at_startup;
+   VG_(client_argv) = (Char **)(esp_at_startup + sizeof(Int));
 
-   for (i = 1; i < argc; i++) {
+   for (i = 1; i < VG_(vg_argc); i++) {
+
+      Char* arg = VG_(vg_argv)[i];
+
+      // XXX: allow colons in options, for Josef
+
+      /* Look for matching "--toolname:foo" */
+      if (VG_(strstr)(arg, ":")) {
+         if (VG_CLO_STREQN(2,            arg,                "--") && 
+             VG_CLO_STREQN(toolname_len, arg+2,              toolname) &&
+             VG_CLO_STREQN(1,            arg+2+toolname_len, ":"))
+         {
+            // prefix matches, convert "--toolname:foo" to "--foo"
+            if (0)
+               VG_(printf)("tool-specific arg: %s\n", arg);
+            arg += toolname_len + 1;
+            arg[0] = '-';
+            arg[1] = '-';
+
+         } else {
+            // prefix doesn't match, skip to next arg
+            continue;
+         }
+      }
+      
       /* Ignore these options - they've already been handled */
-      if (VG_CLO_STREQN(7, argv[i], "--tool=") ||
-	  VG_CLO_STREQN(7, argv[i], "--skin="))
+      if (VG_CLO_STREQN(7, arg, "--tool=") ||
+	  VG_CLO_STREQN(7, arg, "--skin="))
 	 continue;
-      if (VG_CLO_STREQN(7, argv[i], "--exec="))
+      if (VG_CLO_STREQN(7, arg, "--exec="))
 	 continue;
 
-      if (     VG_CLO_STREQ(argv[i], "--"))
+      if (     VG_CLO_STREQ(arg, "--"))
 	 continue;
-      else if (VG_CLO_STREQ(argv[i], "-v") ||
-               VG_CLO_STREQ(argv[i], "--verbose"))
+      else if (VG_CLO_STREQ(arg, "-v") ||
+               VG_CLO_STREQ(arg, "--verbose"))
          VG_(clo_verbosity)++;
-      else if (VG_CLO_STREQ(argv[i], "-q") ||
-               VG_CLO_STREQ(argv[i], "--quiet"))
+      else if (VG_CLO_STREQ(arg, "-q") ||
+               VG_CLO_STREQ(arg, "--quiet"))
          VG_(clo_verbosity)--;
 
-      else if (VG_CLO_STREQ(argv[i], "--error-limit=yes"))
+      else if (VG_CLO_STREQ(arg, "--error-limit=yes"))
          VG_(clo_error_limit) = True;
-      else if (VG_CLO_STREQ(argv[i], "--error-limit=no"))
+      else if (VG_CLO_STREQ(arg, "--error-limit=no"))
          VG_(clo_error_limit) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--gdb-attach=yes"))
+      else if (VG_CLO_STREQ(arg, "--gdb-attach=yes"))
          VG_(clo_GDB_attach) = True;
-      else if (VG_CLO_STREQ(argv[i], "--gdb-attach=no"))
+      else if (VG_CLO_STREQ(arg, "--gdb-attach=no"))
          VG_(clo_GDB_attach) = False;
 
-      else if (VG_CLO_STREQN(11,argv[i], "--gdb-path="))
-         VG_(clo_GDB_path) = &argv[i][11];
+      else if (VG_CLO_STREQN(11,arg, "--gdb-path="))
+         VG_(clo_GDB_path) = &arg[11];
 
-      else if (VG_CLO_STREQ(argv[i], "--gen-suppressions=yes"))
+      else if (VG_CLO_STREQ(arg, "--gen-suppressions=yes"))
          VG_(clo_gen_suppressions) = True;
-      else if (VG_CLO_STREQ(argv[i], "--gen-suppressions=no"))
+      else if (VG_CLO_STREQ(arg, "--gen-suppressions=no"))
          VG_(clo_gen_suppressions) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--show-below-main=yes"))
+      else if (VG_CLO_STREQ(arg, "--show-below-main=yes"))
          VG_(clo_show_below_main) = True;
-      else if (VG_CLO_STREQ(argv[i], "--show-below-main=no"))
+      else if (VG_CLO_STREQ(arg, "--show-below-main=no"))
          VG_(clo_show_below_main) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--pointercheck=yes"))
+      else if (VG_CLO_STREQ(arg, "--pointercheck=yes"))
          VG_(clo_pointercheck) = True;
-      else if (VG_CLO_STREQ(argv[i], "--pointercheck=no"))
+      else if (VG_CLO_STREQ(arg, "--pointercheck=no"))
          VG_(clo_pointercheck) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--demangle=yes"))
+      else if (VG_CLO_STREQ(arg, "--demangle=yes"))
          VG_(clo_demangle) = True;
-      else if (VG_CLO_STREQ(argv[i], "--demangle=no"))
+      else if (VG_CLO_STREQ(arg, "--demangle=no"))
          VG_(clo_demangle) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--trace-children=yes"))
+      else if (VG_CLO_STREQ(arg, "--trace-children=yes"))
          VG_(clo_trace_children) = True;
-      else if (VG_CLO_STREQ(argv[i], "--trace-children=no"))
+      else if (VG_CLO_STREQ(arg, "--trace-children=no"))
          VG_(clo_trace_children) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--run-libc-freeres=yes"))
+      else if (VG_CLO_STREQ(arg, "--run-libc-freeres=yes"))
          VG_(clo_run_libc_freeres) = True;
-      else if (VG_CLO_STREQ(argv[i], "--run-libc-freeres=no"))
+      else if (VG_CLO_STREQ(arg, "--run-libc-freeres=no"))
          VG_(clo_run_libc_freeres) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--track-fds=yes"))
+      else if (VG_CLO_STREQ(arg, "--track-fds=yes"))
          VG_(clo_track_fds) = True;
-      else if (VG_CLO_STREQ(argv[i], "--track-fds=no"))
+      else if (VG_CLO_STREQ(arg, "--track-fds=no"))
          VG_(clo_track_fds) = False;
 
-      else if (VG_CLO_STREQN(15, argv[i], "--sanity-level="))
-         VG_(sanity_level) = (Int)VG_(atoll)(&argv[i][15]);
+      else if (VG_CLO_STREQN(15, arg, "--sanity-level="))
+         VG_(sanity_level) = (Int)VG_(atoll)(&arg[15]);
 
-      else if (VG_CLO_STREQN(13, argv[i], "--logfile-fd=")) {
+      else if (VG_CLO_STREQN(13, arg, "--logfile-fd=")) {
          VG_(clo_log_to)       = VgLogTo_Fd;
          VG_(clo_logfile_name) = NULL;
-         eventually_logfile_fd = (Int)VG_(atoll)(&argv[i][13]);
+         eventually_logfile_fd = (Int)VG_(atoll)(&arg[13]);
       }
 
-      else if (VG_CLO_STREQN(10, argv[i], "--logfile=")) {
+      else if (VG_CLO_STREQN(10, arg, "--logfile=")) {
          VG_(clo_log_to)       = VgLogTo_File;
-         VG_(clo_logfile_name) = &argv[i][10];
+         VG_(clo_logfile_name) = &arg[10];
       }
 
-      else if (VG_CLO_STREQN(12, argv[i], "--logsocket=")) {
+      else if (VG_CLO_STREQN(12, arg, "--logsocket=")) {
          VG_(clo_log_to)       = VgLogTo_Socket;
-         VG_(clo_logfile_name) = &argv[i][12];
+         VG_(clo_logfile_name) = &arg[12];
       }
 
-      else if (VG_CLO_STREQN(11, argv[i], "--input-fd="))
-         VG_(clo_input_fd)     = (Int)VG_(atoll)(&argv[i][11]);
+      else if (VG_CLO_STREQN(11, arg, "--input-fd="))
+         VG_(clo_input_fd)     = (Int)VG_(atoll)(&arg[11]);
 
-      else if (VG_CLO_STREQN(15, argv[i], "--suppressions=")) {
+      else if (VG_CLO_STREQN(15, arg, "--suppressions=")) {
          if (VG_(clo_n_suppressions) >= VG_CLO_MAX_SFILES) {
             VG_(message)(Vg_UserMsg, "Too many suppression files specified.");
             VG_(message)(Vg_UserMsg, 
                          "Increase VG_CLO_MAX_SFILES and recompile.");
-            VG_(bad_option)(argv[i]);
+            VG_(bad_option)(arg);
          }
-         VG_(clo_suppressions)[VG_(clo_n_suppressions)] = &argv[i][15];
+         VG_(clo_suppressions)[VG_(clo_n_suppressions)] = &arg[15];
          VG_(clo_n_suppressions)++;
       }
-      else if (VG_CLO_STREQ(argv[i], "--profile=yes"))
+      else if (VG_CLO_STREQ(arg, "--profile=yes"))
          VG_(clo_profile) = True;
-      else if (VG_CLO_STREQ(argv[i], "--profile=no"))
+      else if (VG_CLO_STREQ(arg, "--profile=no"))
          VG_(clo_profile) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--chain-bb=yes"))
+      else if (VG_CLO_STREQ(arg, "--chain-bb=yes"))
 	 VG_(clo_chain_bb) = True;
-      else if (VG_CLO_STREQ(argv[i], "--chain-bb=no"))
+      else if (VG_CLO_STREQ(arg, "--chain-bb=no"))
 	 VG_(clo_chain_bb) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--branchpred=yes"))
+      else if (VG_CLO_STREQ(arg, "--branchpred=yes"))
 	 VG_(clo_branchpred) = True;
-      else if (VG_CLO_STREQ(argv[i], "--branchpred=no"))
+      else if (VG_CLO_STREQ(arg, "--branchpred=no"))
 	 VG_(clo_branchpred) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--single-step=yes"))
+      else if (VG_CLO_STREQ(arg, "--single-step=yes"))
          VG_(clo_single_step) = True;
-      else if (VG_CLO_STREQ(argv[i], "--single-step=no"))
+      else if (VG_CLO_STREQ(arg, "--single-step=no"))
          VG_(clo_single_step) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--optimise=yes"))
+      else if (VG_CLO_STREQ(arg, "--optimise=yes"))
          VG_(clo_optimise) = True;
-      else if (VG_CLO_STREQ(argv[i], "--optimise=no"))
+      else if (VG_CLO_STREQ(arg, "--optimise=no"))
          VG_(clo_optimise) = False;
 
       /* "vwxyz" --> 000zyxwv (binary) */
-      else if (VG_CLO_STREQN(16, argv[i], "--trace-codegen=")) {
+      else if (VG_CLO_STREQN(16, arg, "--trace-codegen=")) {
          Int j;
-         char* opt = & argv[i][16];
+         char* opt = & arg[16];
    
          if (5 != VG_(strlen)(opt)) {
             VG_(message)(Vg_UserMsg, 
                          "--trace-codegen argument must have 5 digits");
-            VG_(bad_option)(argv[i]);
+            VG_(bad_option)(arg);
          }
          for (j = 0; j < 5; j++) {
             if      ('0' == opt[j]) { /* do nothing */ }
@@ -944,85 +1771,80 @@
             else {
                VG_(message)(Vg_UserMsg, "--trace-codegen argument can only "
                                         "contain 0s and 1s");
-               VG_(bad_option)(argv[i]);
+               VG_(bad_option)(arg);
             }
          }
       }
 
-      else if (VG_CLO_STREQ(argv[i], "--trace-syscalls=yes"))
+      else if (VG_CLO_STREQ(arg, "--trace-syscalls=yes"))
          VG_(clo_trace_syscalls) = True;
-      else if (VG_CLO_STREQ(argv[i], "--trace-syscalls=no"))
+      else if (VG_CLO_STREQ(arg, "--trace-syscalls=no"))
          VG_(clo_trace_syscalls) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--trace-signals=yes"))
+      else if (VG_CLO_STREQ(arg, "--trace-signals=yes"))
          VG_(clo_trace_signals) = True;
-      else if (VG_CLO_STREQ(argv[i], "--trace-signals=no"))
+      else if (VG_CLO_STREQ(arg, "--trace-signals=no"))
          VG_(clo_trace_signals) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--trace-symtab=yes"))
+      else if (VG_CLO_STREQ(arg, "--trace-symtab=yes"))
          VG_(clo_trace_symtab) = True;
-      else if (VG_CLO_STREQ(argv[i], "--trace-symtab=no"))
+      else if (VG_CLO_STREQ(arg, "--trace-symtab=no"))
          VG_(clo_trace_symtab) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--trace-sched=yes"))
+      else if (VG_CLO_STREQ(arg, "--trace-sched=yes"))
          VG_(clo_trace_sched) = True;
-      else if (VG_CLO_STREQ(argv[i], "--trace-sched=no"))
+      else if (VG_CLO_STREQ(arg, "--trace-sched=no"))
          VG_(clo_trace_sched) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--trace-pthread=none"))
+      else if (VG_CLO_STREQ(arg, "--trace-pthread=none"))
          VG_(clo_trace_pthread_level) = 0;
-      else if (VG_CLO_STREQ(argv[i], "--trace-pthread=some"))
+      else if (VG_CLO_STREQ(arg, "--trace-pthread=some"))
          VG_(clo_trace_pthread_level) = 1;
-      else if (VG_CLO_STREQ(argv[i], "--trace-pthread=all"))
+      else if (VG_CLO_STREQ(arg, "--trace-pthread=all"))
          VG_(clo_trace_pthread_level) = 2;
 
-      else if (VG_CLO_STREQN(14, argv[i], "--weird-hacks="))
-         VG_(clo_weird_hacks) = &argv[i][14];
+      else if (VG_CLO_STREQN(14, arg, "--weird-hacks="))
+         VG_(clo_weird_hacks) = &arg[14];
 
-      else if (VG_CLO_STREQN(17, argv[i], "--signal-polltime="))
-	 VG_(clo_signal_polltime) = VG_(atoll)(&argv[i][17]);
+      else if (VG_CLO_STREQN(17, arg, "--signal-polltime="))
+	 VG_(clo_signal_polltime) = VG_(atoll)(&arg[17]);
 
-      else if (VG_CLO_STREQ(argv[i], "--lowlat-signals=yes"))
+      else if (VG_CLO_STREQ(arg, "--lowlat-signals=yes"))
 	 VG_(clo_lowlat_signals) = True;
-      else if (VG_CLO_STREQ(argv[i], "--lowlat-signals=no"))
+      else if (VG_CLO_STREQ(arg, "--lowlat-signals=no"))
 	 VG_(clo_lowlat_signals) = False;
 
-      else if (VG_CLO_STREQ(argv[i], "--lowlat-syscalls=yes"))
+      else if (VG_CLO_STREQ(arg, "--lowlat-syscalls=yes"))
 	 VG_(clo_lowlat_syscalls) = True;
-      else if (VG_CLO_STREQ(argv[i], "--lowlat-syscalls=no"))
+      else if (VG_CLO_STREQ(arg, "--lowlat-syscalls=no"))
 	 VG_(clo_lowlat_syscalls) = False;
 
-      else if (VG_CLO_STREQN(13, argv[i], "--stop-after="))
-         VG_(clo_stop_after) = VG_(atoll)(&argv[i][13]);
+      else if (VG_CLO_STREQN(13, arg, "--stop-after="))
+         VG_(clo_stop_after) = VG_(atoll)(&arg[13]);
 
-      else if (VG_CLO_STREQN(13, argv[i], "--dump-error="))
-         VG_(clo_dump_error) = (Int)VG_(atoll)(&argv[i][13]);
+      else if (VG_CLO_STREQN(13, arg, "--dump-error="))
+         VG_(clo_dump_error) = (Int)VG_(atoll)(&arg[13]);
 
-      else if (VG_CLO_STREQ(argv[i], "--wait-for-gdb=yes"))
+      else if (VG_CLO_STREQ(arg, "--wait-for-gdb=yes"))
 	 VG_(clo_wait_for_gdb) = True;
-      else if (VG_CLO_STREQ(argv[i], "--wait-for-gdb=no"))
+      else if (VG_CLO_STREQ(arg, "--wait-for-gdb=no"))
 	 VG_(clo_wait_for_gdb) = False;
 
-      else if (VG_CLO_STREQN(14, argv[i], "--num-callers=")) {
+      else if (VG_CLO_STREQN(14, arg, "--num-callers=")) {
          /* Make sure it's sane. */
-	 VG_(clo_backtrace_size) = (Int)VG_(atoll)(&argv[i][14]);
+	 VG_(clo_backtrace_size) = (Int)VG_(atoll)(&arg[14]);
          if (VG_(clo_backtrace_size) < 1)
             VG_(clo_backtrace_size) = 1;
          if (VG_(clo_backtrace_size) >= VG_DEEPEST_BACKTRACE)
             VG_(clo_backtrace_size) = VG_DEEPEST_BACKTRACE;
       }
 
-      else if (VG_(needs).command_line_options) {
-         Bool ok = SK_(process_cmd_line_option)(argv[i]);
-         if (!ok)
-            VG_(usage)();
+      else if ( ! VG_(needs).command_line_options
+             || ! SK_(process_cmd_line_option)(arg) ) {
+         usage();
       }
-      else
-         VG_(usage)();
    }
 
-#  undef ISSPACE
-
    if (VG_(clo_verbosity) < 0)
       VG_(clo_verbosity) = 0;
 
@@ -1061,7 +1883,7 @@
          vg_assert(VG_(clo_logfile_name) != NULL);
          vg_assert(VG_(strlen)(VG_(clo_logfile_name)) <= 900); /* paranoia */
 
-	 for(;;) {
+	 for (;;) {
 	    if (seq == 0)
 	       VG_(sprintf)(logfilename, "%s.pid%d",
 			    VG_(clo_logfile_name), pid );
@@ -1173,8 +1995,8 @@
          VG_(message)(Vg_UserMsg, "   %s", VG_(client_argv)[i]);
 
       VG_(message)(Vg_UserMsg, "Startup, with flags:");
-      for (i = 1; i < argc; i++) {
-         VG_(message)(Vg_UserMsg, "   %s", argv[i]);
+      for (i = 1; i < VG_(vg_argc); i++) {
+         VG_(message)(Vg_UserMsg, "   %s", VG_(vg_argv)[i]);
       }
    }
 
@@ -1196,15 +2018,136 @@
                    "   as it doesn't generate errors.");
    }
 
+   VG_(bbs_to_go) = VG_(clo_stop_after);
 }
 
-/* ---------------------------------------------------------------------
-   Copying to/from m_state_static.
-   ------------------------------------------------------------------ */
 
-/* See comment about this in vg_include.h.  Change only with
-   great care.
-*/
+/*====================================================================*/
+/*=== File descriptor setup                                        ===*/
+/*====================================================================*/
+
+static void setup_file_descriptors(void)
+{
+   struct vki_rlimit rl;
+
+   /* Get the current file descriptor limits. */
+   if (VG_(getrlimit)(VKI_RLIMIT_NOFILE, &rl) < 0) {
+      rl.rlim_cur = 1024;
+      rl.rlim_max = 1024;
+   }
+
+   /* Work out where to move the soft limit to. */
+   if (rl.rlim_cur + VG_N_RESERVED_FDS <= rl.rlim_max) {
+      rl.rlim_cur = rl.rlim_cur + VG_N_RESERVED_FDS;
+   } else {
+      rl.rlim_cur = rl.rlim_max;
+   }
+
+   /* Reserve some file descriptors for our use. */
+   VG_(max_fd) = rl.rlim_cur - VG_N_RESERVED_FDS;
+
+   /* Update the soft limit. */
+   VG_(setrlimit)(VKI_RLIMIT_NOFILE, &rl);
+
+   if (VG_(vgexecfd) != -1)
+      VG_(vgexecfd) = VG_(safe_fd)( VG_(vgexecfd) );
+   if (VG_(clexecfd) != -1)
+      VG_(clexecfd) = VG_(safe_fd)( VG_(clexecfd) );
+}
+
+
+/*====================================================================*/
+/*=== m_state_static + baseBlock: definition, setup, copying       ===*/
+/*====================================================================*/
+
+/* The variables storing offsets. */
+
+#define INVALID_OFFSET (-1)
+
+Int VGOFF_(m_eax) = INVALID_OFFSET;
+Int VGOFF_(m_ecx) = INVALID_OFFSET;
+Int VGOFF_(m_edx) = INVALID_OFFSET;
+Int VGOFF_(m_ebx) = INVALID_OFFSET;
+Int VGOFF_(m_esp) = INVALID_OFFSET;
+Int VGOFF_(m_ebp) = INVALID_OFFSET;
+Int VGOFF_(m_esi) = INVALID_OFFSET;
+Int VGOFF_(m_edi) = INVALID_OFFSET;
+Int VGOFF_(m_eflags) = INVALID_OFFSET;
+Int VGOFF_(m_dflag)  = INVALID_OFFSET;
+Int VGOFF_(m_ssestate) = INVALID_OFFSET;
+Int VGOFF_(ldt)   = INVALID_OFFSET;
+Int VGOFF_(tls)   = INVALID_OFFSET;
+Int VGOFF_(m_cs)  = INVALID_OFFSET;
+Int VGOFF_(m_ss)  = INVALID_OFFSET;
+Int VGOFF_(m_ds)  = INVALID_OFFSET;
+Int VGOFF_(m_es)  = INVALID_OFFSET;
+Int VGOFF_(m_fs)  = INVALID_OFFSET;
+Int VGOFF_(m_gs)  = INVALID_OFFSET;
+Int VGOFF_(m_eip) = INVALID_OFFSET;
+Int VGOFF_(spillslots) = INVALID_OFFSET;
+Int VGOFF_(sh_eax) = INVALID_OFFSET;
+Int VGOFF_(sh_ecx) = INVALID_OFFSET;
+Int VGOFF_(sh_edx) = INVALID_OFFSET;
+Int VGOFF_(sh_ebx) = INVALID_OFFSET;
+Int VGOFF_(sh_esp) = INVALID_OFFSET;
+Int VGOFF_(sh_ebp) = INVALID_OFFSET;
+Int VGOFF_(sh_esi) = INVALID_OFFSET;
+Int VGOFF_(sh_edi) = INVALID_OFFSET;
+Int VGOFF_(sh_eflags) = INVALID_OFFSET;
+
+Int VGOFF_(helper_idiv_64_32) = INVALID_OFFSET;
+Int VGOFF_(helper_div_64_32) = INVALID_OFFSET;
+Int VGOFF_(helper_idiv_32_16) = INVALID_OFFSET;
+Int VGOFF_(helper_div_32_16) = INVALID_OFFSET;
+Int VGOFF_(helper_idiv_16_8) = INVALID_OFFSET;
+Int VGOFF_(helper_div_16_8) = INVALID_OFFSET;
+Int VGOFF_(helper_imul_32_64) = INVALID_OFFSET;
+Int VGOFF_(helper_mul_32_64) = INVALID_OFFSET;
+Int VGOFF_(helper_imul_16_32) = INVALID_OFFSET;
+Int VGOFF_(helper_mul_16_32) = INVALID_OFFSET;
+Int VGOFF_(helper_imul_8_16) = INVALID_OFFSET;
+Int VGOFF_(helper_mul_8_16) = INVALID_OFFSET;
+Int VGOFF_(helper_CLD) = INVALID_OFFSET;
+Int VGOFF_(helper_STD) = INVALID_OFFSET;
+Int VGOFF_(helper_get_dirflag) = INVALID_OFFSET;
+Int VGOFF_(helper_CLC) = INVALID_OFFSET;
+Int VGOFF_(helper_STC) = INVALID_OFFSET;
+Int VGOFF_(helper_shldl) = INVALID_OFFSET;
+Int VGOFF_(helper_shldw) = INVALID_OFFSET;
+Int VGOFF_(helper_shrdl) = INVALID_OFFSET;
+Int VGOFF_(helper_shrdw) = INVALID_OFFSET;
+Int VGOFF_(helper_IN) = INVALID_OFFSET;
+Int VGOFF_(helper_OUT) = INVALID_OFFSET;
+Int VGOFF_(helper_RDTSC) = INVALID_OFFSET;
+Int VGOFF_(helper_CPUID) = INVALID_OFFSET;
+Int VGOFF_(helper_BSWAP) = INVALID_OFFSET;
+Int VGOFF_(helper_bsf) = INVALID_OFFSET;
+Int VGOFF_(helper_bsr) = INVALID_OFFSET;
+Int VGOFF_(helper_fstsw_AX) = INVALID_OFFSET;
+Int VGOFF_(helper_SAHF) = INVALID_OFFSET;
+Int VGOFF_(helper_LAHF) = INVALID_OFFSET;
+Int VGOFF_(helper_DAS) = INVALID_OFFSET;
+Int VGOFF_(helper_DAA) = INVALID_OFFSET;
+Int VGOFF_(helper_cmpxchg8b) = INVALID_OFFSET;
+Int VGOFF_(helper_undefined_instruction) = INVALID_OFFSET;
+
+/* MAX_NONCOMPACT_HELPERS can be increased easily.  If MAX_COMPACT_HELPERS is
+ * increased too much, they won't really be compact any more... */
+#define  MAX_COMPACT_HELPERS     8
+#define  MAX_NONCOMPACT_HELPERS  50 
+
+UInt VG_(n_compact_helpers)    = 0;
+UInt VG_(n_noncompact_helpers) = 0;
+
+Addr VG_(compact_helper_addrs)  [MAX_COMPACT_HELPERS];
+Int  VG_(compact_helper_offsets)[MAX_COMPACT_HELPERS];
+Addr VG_(noncompact_helper_addrs)  [MAX_NONCOMPACT_HELPERS];
+Int  VG_(noncompact_helper_offsets)[MAX_NONCOMPACT_HELPERS];
+
+/* This is the actual defn of baseblock. */
+UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS];
+
+/* See comment about this in vg_include.h.  Change only with great care. */
 __attribute__ ((aligned (16)))
 UInt VG_(m_state_static) [6 /* segment regs, Intel order */
                           + 8 /* int regs, in Intel order */ 
@@ -1213,6 +2156,10 @@
                           + VG_SIZE_OF_SSESTATE_W /* FPU state */
                          ];
 
+/* Words. */
+static Int baB_off = 0;
+
+
 UInt VG_(insertDflag)(UInt eflags, Int d)
 {
    vg_assert(d == 1 || d == -1);
@@ -1236,7 +2183,7 @@
    return ret;
 }
 
-void VG_(copy_baseBlock_to_m_state_static) ( void )
+static void copy_baseBlock_to_m_state_static( void )
 {
    Int i;
    VG_(m_state_static)[ 0/4] = VG_(baseBlock)[VGOFF_(m_cs)];
@@ -1266,129 +2213,160 @@
 }
 
 
-void VG_(copy_m_state_static_to_baseBlock) ( void )
+/* Returns the offset, in words. */
+static Int alloc_BaB ( Int words )
 {
-   Int i;
-   VG_(baseBlock)[VGOFF_(m_cs)] = VG_(m_state_static)[ 0/4];
-   VG_(baseBlock)[VGOFF_(m_ss)] = VG_(m_state_static)[ 4/4];
-   VG_(baseBlock)[VGOFF_(m_ds)] = VG_(m_state_static)[ 8/4];
-   VG_(baseBlock)[VGOFF_(m_es)] = VG_(m_state_static)[12/4];
-   VG_(baseBlock)[VGOFF_(m_fs)] = VG_(m_state_static)[16/4];
-   VG_(baseBlock)[VGOFF_(m_gs)] = VG_(m_state_static)[20/4];
+   Int off = baB_off;
+   baB_off += words;
+   if (baB_off >= VG_BASEBLOCK_WORDS)
+      VG_(core_panic)( "alloc_BaB: baseBlock is too small");
 
-   VG_(baseBlock)[VGOFF_(m_eax)] = VG_(m_state_static)[24/4];
-   VG_(baseBlock)[VGOFF_(m_ecx)] = VG_(m_state_static)[28/4];
-   VG_(baseBlock)[VGOFF_(m_edx)] = VG_(m_state_static)[32/4];
-   VG_(baseBlock)[VGOFF_(m_ebx)] = VG_(m_state_static)[36/4];
-   VG_(baseBlock)[VGOFF_(m_esp)] = VG_(m_state_static)[40/4];
-   VG_(baseBlock)[VGOFF_(m_ebp)] = VG_(m_state_static)[44/4];
-   VG_(baseBlock)[VGOFF_(m_esi)] = VG_(m_state_static)[48/4];
-   VG_(baseBlock)[VGOFF_(m_edi)] = VG_(m_state_static)[52/4];
-
-   VG_(baseBlock)[VGOFF_(m_eflags)] 
-      = VG_(m_state_static)[56/4] & ~EFlagD;
-   VG_(baseBlock)[VGOFF_(m_dflag)] 
-      = VG_(extractDflag)(VG_(m_state_static)[56/4]);
-
-   VG_(baseBlock)[VGOFF_(m_eip)] = VG_(m_state_static)[60/4];
-
-   for (i = 0; i < VG_SIZE_OF_SSESTATE_W; i++)
-      VG_(baseBlock)[VGOFF_(m_ssestate) + i]
-         = VG_(m_state_static)[64/4 + i];
+   return off;   
 }
 
-Addr VG_(get_stack_pointer) ( void )
+/* Round baB_off up to a multiple of 'align'.  NOTE(review): baB_off counts *words*, not bytes -- confirm intent. */
+static void align_BaB ( UInt align )
 {
-   return VG_(baseBlock)[VGOFF_(m_esp)];
+   vg_assert(2 == align || 4 == align || 8 == align || 16 == align);
+   baB_off +=  (align-1);
+   baB_off &= ~(align-1);
 }
 
-/* ---------------------------------------------------------------------
-   Show accumulated counts.
-   ------------------------------------------------------------------ */
-
-static __inline__ Int safe_idiv(Int a, Int b)
+/* Allocate 1 word in baseBlock and set it to the given value. */
+static Int alloc_BaB_1_set ( Addr a )
 {
-   return (b == 0 ? 0 : a / b);
+   Int off = alloc_BaB(1);
+   VG_(baseBlock)[off] = (UInt)a;
+   return off;
 }
 
-static void vg_show_counts ( void )
+/* Registers a function in compact_helper_addrs;  compact_helper_offsets is
+   filled in later. */
+void VG_(register_compact_helper)(Addr a)
 {
-   VG_(message)(Vg_DebugMsg,
-		"    TT/TC: %d tc sectors discarded.",
-                VG_(number_of_tc_discards) );
-   VG_(message)(Vg_DebugMsg,
-                "           %d chainings, %d unchainings.",
-                VG_(bb_enchain_count), VG_(bb_dechain_count) );
-   VG_(message)(Vg_DebugMsg,
-                "translate: new     %d (%d -> %d; ratio %d:10)",
-                VG_(overall_in_count),
-                VG_(overall_in_osize),
-                VG_(overall_in_tsize),
-                safe_idiv(10*VG_(overall_in_tsize), VG_(overall_in_osize)));
-   VG_(message)(Vg_DebugMsg,
-                "           discard %d (%d -> %d; ratio %d:10).",
-                VG_(overall_out_count),
-                VG_(overall_out_osize),
-                VG_(overall_out_tsize),
-                safe_idiv(10*VG_(overall_out_tsize), VG_(overall_out_osize)));
-   VG_(message)(Vg_DebugMsg,
-      " dispatch: %llu jumps (bb entries), of which %u (%lu%%) were unchained.",
-      VG_(bbs_done), 
-      VG_(unchained_jumps_done),
-      ((ULong)(100) * (ULong)(VG_(unchained_jumps_done)))
-         / ( VG_(bbs_done)==0 ? 1 : VG_(bbs_done) )
+   if (MAX_COMPACT_HELPERS <= VG_(n_compact_helpers)) {
+      VG_(printf)("Can only register %d compact helpers\n", 
+                  MAX_COMPACT_HELPERS);
+      VG_(core_panic)("Too many compact helpers registered");
+   }
+   VG_(compact_helper_addrs)[VG_(n_compact_helpers)] = a;
+   VG_(n_compact_helpers)++;
+}
+
+/* Appends helper address 'a' to noncompact_helper_addrs and bumps the count;
+ * noncompact_helper_offsets is filled in later.  Panics once
+ * MAX_NONCOMPACT_HELPERS helpers are already registered. */
+void VG_(register_noncompact_helper)(Addr a)
+{
+   if (MAX_NONCOMPACT_HELPERS <= VG_(n_noncompact_helpers)) {
+      VG_(printf)("Can only register %d non-compact helpers\n", 
+                  MAX_NONCOMPACT_HELPERS);
+      VG_(printf)("Try increasing MAX_NONCOMPACT_HELPERS\n");
+      VG_(core_panic)("Too many non-compact helpers registered");
+   }
+   VG_(noncompact_helper_addrs)[VG_(n_noncompact_helpers)] = a;
+   VG_(n_noncompact_helpers)++;
+}
+
+/* Allocate offsets in baseBlock for the skin helpers */
+static 
+void assign_helpers_in_baseBlock(UInt n, Int offsets[], Addr addrs[])
+{
+   UInt i;
+   for (i = 0; i < n; i++) 
+      offsets[i] = alloc_BaB_1_set( addrs[i] );
+}
+
+Bool VG_(need_to_handle_esp_assignment)(void)
+{
+   return ( VG_(defined_new_mem_stack_4)()  ||
+            VG_(defined_die_mem_stack_4)()  ||
+            VG_(defined_new_mem_stack_8)()  ||
+            VG_(defined_die_mem_stack_8)()  ||
+            VG_(defined_new_mem_stack_12)() ||
+            VG_(defined_die_mem_stack_12)() ||
+            VG_(defined_new_mem_stack_16)() ||
+            VG_(defined_die_mem_stack_16)() ||
+            VG_(defined_new_mem_stack_32)() ||
+            VG_(defined_die_mem_stack_32)() ||
+            VG_(defined_new_mem_stack)()    ||
+            VG_(defined_die_mem_stack)()
+          );
+}
+
+/* Here we assign actual offsets.  It's important to get the most
+   popular referents within 128 bytes of the start, so we can take
+   advantage of short addressing modes relative to %ebp.  Popularity
+   of offsets was measured on 22 Feb 02 running a KDE application, and
+   the slots rearranged accordingly, with a 1.5% reduction in total
+   size of translations. */
+static void init_baseBlock ( Addr client_eip, Addr esp_at_startup )
+{
+   /* Those with offsets under 128 are carefully chosen. */
+
+   /* WORD offsets in this column */
+   /* 0   */ VGOFF_(m_eax)     = alloc_BaB_1_set(0);
+   /* 1   */ VGOFF_(m_ecx)     = alloc_BaB_1_set(0);
+   /* 2   */ VGOFF_(m_edx)     = alloc_BaB_1_set(0);
+   /* 3   */ VGOFF_(m_ebx)     = alloc_BaB_1_set(0);
+   /* 4   */ VGOFF_(m_esp)     = alloc_BaB_1_set(esp_at_startup);
+   /* 5   */ VGOFF_(m_ebp)     = alloc_BaB_1_set(0);
+   /* 6   */ VGOFF_(m_esi)     = alloc_BaB_1_set(0);
+   /* 7   */ VGOFF_(m_edi)     = alloc_BaB_1_set(0);
+   /* 8   */ VGOFF_(m_eflags)  = alloc_BaB_1_set(0);
+
+   if (VG_(needs).shadow_regs) {
+      /* 9   */ VGOFF_(sh_eax)    = alloc_BaB_1_set(0);
+      /* 10  */ VGOFF_(sh_ecx)    = alloc_BaB_1_set(0);
+      /* 11  */ VGOFF_(sh_edx)    = alloc_BaB_1_set(0);
+      /* 12  */ VGOFF_(sh_ebx)    = alloc_BaB_1_set(0);
+      /* 13  */ VGOFF_(sh_esp)    = alloc_BaB_1_set(0);
+      /* 14  */ VGOFF_(sh_ebp)    = alloc_BaB_1_set(0);
+      /* 15  */ VGOFF_(sh_esi)    = alloc_BaB_1_set(0);
+      /* 16  */ VGOFF_(sh_edi)    = alloc_BaB_1_set(0);
+      /* 17  */ VGOFF_(sh_eflags) = alloc_BaB_1_set(0);
+      VG_TRACK( post_regs_write_init );
+   }
+
+   /* 9,10,11 or 18,19,20... depends on number whether shadow regs are used
+    * and on compact helpers registered */ 
+
+   /* Make these most-frequently-called specialised ones compact, if they
+      are used. */
+   if (VG_(defined_new_mem_stack_4)())
+      VG_(register_compact_helper)( (Addr) VG_(tool_interface).track_new_mem_stack_4);
+
+   if (VG_(defined_die_mem_stack_4)())
+      VG_(register_compact_helper)( (Addr) VG_(tool_interface).track_die_mem_stack_4);
+
+   /* (9 or 18) + n_compact_helpers  */
+   /* Allocate slots for compact helpers */
+   assign_helpers_in_baseBlock(VG_(n_compact_helpers), 
+                               VG_(compact_helper_offsets), 
+                               VG_(compact_helper_addrs));
+
+   /* (9/10 or 18/19) + n_compact_helpers */
+   VGOFF_(m_eip) = alloc_BaB_1_set(client_eip);
+
+   /* There are currently 24 spill slots */
+   /* (11+/20+ .. 32+/43+) + n_compact_helpers.  This can overlap the magic
+    * boundary at >= 32 words, but most spills are to low numbered spill
+    * slots, so the ones above the boundary don't see much action. */
+   VGOFF_(spillslots) = alloc_BaB(VG_MAX_SPILLSLOTS);
+
+   /* I gave up counting at this point.  Since they're above the
+      short-amode-boundary, there's no point. */
+
+   VGOFF_(m_dflag) = alloc_BaB_1_set(1);  // 1 == forward D-flag
+
+   /* The FPU/SSE state.  This _must_ be 16-byte aligned.  Initial
+      state doesn't matter much, as long as it's not totally borked. */
+   align_BaB(16);
+   VGOFF_(m_ssestate) = alloc_BaB(VG_SIZE_OF_SSESTATE_W);
+   vg_assert( 
+      0 == ( ((UInt)(& VG_(baseBlock)[VGOFF_(m_ssestate)])) % 16 )
    );
 
-   VG_(message)(Vg_DebugMsg,
-      "           %d/%d major/minor sched events.  %d tt_fast misses.", 
-                     VG_(num_scheduling_events_MAJOR), 
-                     VG_(num_scheduling_events_MINOR), 
-                     VG_(tt_fast_misses));
-
-   VG_(message)(Vg_DebugMsg, 
-                "reg-alloc: %d t-req-spill, "
-                "%d+%d orig+spill uis, %d total-reg-r.",
-                VG_(translations_needing_spill),
-                VG_(uinstrs_prealloc),
-                VG_(uinstrs_spill),
-                VG_(total_reg_rank) );
-   VG_(message)(Vg_DebugMsg, 
-                "   sanity: %d cheap, %d expensive checks.",
-                VG_(sanity_fast_count), 
-                VG_(sanity_slow_count) );
-   VG_(print_ccall_stats)();
-}
-
-
-/* ---------------------------------------------------------------------
-   Main!
-   ------------------------------------------------------------------ */
-
-/* Initialize the PID and PGRP of scheduler LWP; this is also called
-   in any new children after fork. */
-static void newpid(ThreadId unused)
-{
-   /* PID of scheduler LWP */
-   VG_(main_pid) = VG_(getpid)();
-   VG_(main_pgrp) = VG_(getpgrp)();
-}
-
-/* Where we jump to once Valgrind has got control, and the real
-   machine's state has been copied to the m_state_static. */
-
-void VG_(main) ( const KickstartParams *kp, void (*tool_init)(void), void *tool_dlhandle )
-{
-   VgSchedReturnCode src;
-   struct vki_rlimit rl;
-
-   /* initial state */
-   if (0)
-      VG_(printf)("starting esp=%p eip=%p, esp=%p\n", kp->client_esp, kp->client_eip, &src);
-   VG_(esp_at_startup) = kp->client_esp;
-   VG_(memset)(&VG_(m_state_static), 0, sizeof(VG_(m_state_static)));
-   VG_(m_state_static)[40/4] = kp->client_esp;
-   VG_(m_state_static)[60/4] = kp->client_eip;
-
    /* I assume that if we have SSE2 we also have SSE */
    VG_(have_ssestate) = 
 	   VG_(cpu_has_feature)(VG_X86_FEAT_FXSR) &&
@@ -1397,29 +2375,17 @@
    /* set up an initial FPU state (doesn't really matter what it is,
       so long as it's somewhat valid) */
    if (!VG_(have_ssestate))
-	   asm volatile("fwait; fnsave %0; fwait; frstor %0; fwait" 
-			: : "m" (VG_(m_state_static)[64/4]) : "cc", "memory");
+      asm volatile("fwait; fnsave %0; fwait; frstor %0; fwait" 
+                   : 
+                   : "m" (VG_(baseBlock)[VGOFF_(m_ssestate)]) 
+                   : "cc", "memory");
    else
-	   asm volatile("fwait; fxsave %0; fwait; andl $0xffbf, %1; fxrstor %0; fwait"
-			: : "m" (VG_(m_state_static)[64/4]), "m" (VG_(m_state_static)[(64+24)/4]) : "cc", "memory");
-
-   VG_(brk_base)          = VG_(brk_limit) = kp->client_brkbase;
-   VG_(client_base)       = kp->client_base;
-   VG_(client_end)        = kp->client_end;
-   VG_(client_mapbase)    = kp->client_mapbase;
-   VG_(clstk_base)        = kp->clstk_base;
-   VG_(clstk_end)         = kp->clstk_end;
-   vg_assert(VG_(clstk_end) == VG_(client_end));
-
-   VG_(shadow_base)	  = kp->shadow_base;
-   VG_(shadow_end)	  = kp->shadow_end;
-   VG_(valgrind_base)	  = kp->vg_base;
-   VG_(valgrind_mmap_end) = kp->vg_mmap_end;
-   VG_(valgrind_end)	  = kp->vg_end;
-
-   VG_(libdir)            = kp->libdir;
-
-   VG_(client_trampoline_code) = kp->cl_tramp_code;
+      asm volatile("fwait; fxsave %0; fwait; andl $0xffbf, %1;"
+                   "fxrstor %0; fwait"
+                   : 
+                   : "m" (VG_(baseBlock)[VGOFF_(m_ssestate)]), 
+                     "m" (VG_(baseBlock)[VGOFF_(m_ssestate)+(24/4)]) 
+                   : "cc", "memory");
 
    if (0) {
       if (VG_(have_ssestate))
@@ -1428,144 +2394,103 @@
          VG_(printf)("Looks like a MMX-only CPU\n");
    }
 
-   VG_(atfork)(NULL, NULL, newpid);
-   newpid(VG_INVALID_THREADID);
+   /* LDT pointer: pretend the root thread has an empty LDT to start with. */
+   VGOFF_(ldt)   = alloc_BaB_1_set((UInt)NULL);
 
-   /* Get the current file descriptor limits. */
-   if (VG_(getrlimit)(VKI_RLIMIT_NOFILE, &rl) < 0) {
-      rl.rlim_cur = 1024;
-      rl.rlim_max = 1024;
-   }
+   /* TLS pointer: pretend the root thread has no TLS array for now. */
+   VGOFF_(tls)   = alloc_BaB_1_set((UInt)NULL);
 
-   /* Work out where to move the soft limit to. */
-   if (rl.rlim_cur + VG_N_RESERVED_FDS <= rl.rlim_max) {
-      rl.rlim_cur = rl.rlim_cur + VG_N_RESERVED_FDS;
-   } else {
-      rl.rlim_cur = rl.rlim_max;
-   }
+   /* segment registers */
+   VGOFF_(m_cs)  = alloc_BaB_1_set(0);
+   VGOFF_(m_ss)  = alloc_BaB_1_set(0);
+   VGOFF_(m_ds)  = alloc_BaB_1_set(0);
+   VGOFF_(m_es)  = alloc_BaB_1_set(0);
+   VGOFF_(m_fs)  = alloc_BaB_1_set(0);
+   VGOFF_(m_gs)  = alloc_BaB_1_set(0);
 
-   /* Reserve some file descriptors for our use. */
-   VG_(max_fd) = rl.rlim_cur - VG_N_RESERVED_FDS;
+   VG_(register_noncompact_helper)( (Addr) & VG_(do_useseg) );
 
-   /* Update the soft limit. */
-   VG_(setrlimit)(VKI_RLIMIT_NOFILE, &rl);
+#define REG(kind, size) \
+   if (VG_(defined_##kind##_mem_stack##size)()) \
+      VG_(register_noncompact_helper)(           \
+          (Addr) VG_(tool_interface).track_##kind##_mem_stack##size );
+   REG(new, _8);
+   REG(new, _12);
+   REG(new, _16);
+   REG(new, _32);
+   REG(new, );
+   REG(die, _8);
+   REG(die, _12);
+   REG(die, _16);
+   REG(die, _32);
+   REG(die, );
+#undef REG
 
-   if (kp->vgexecfd != -1)
-      VG_(vgexecfd) = VG_(safe_fd)(kp->vgexecfd);
-   if (kp->clexecfd != -1)
-      VG_(clexecfd) = VG_(safe_fd)(kp->clexecfd);
+   if (VG_(need_to_handle_esp_assignment)())
+      VG_(register_noncompact_helper)((Addr) VG_(unknown_esp_update));
 
-   /* Read /proc/self/maps into a buffer.  Must be before:
-      - SK_(pre_clo_init)(): so that if it calls VG_(malloc)(), any mmap'd
-        superblocks are not erroneously identified as being owned by the
-        client, which would be bad.
-      - init_memory(): that's where the buffer is parsed
-      - init_tt_tc(): so the anonymous mmaps for the translation table and
-        translation cache aren't identified as part of the client, which would
-        waste > 20M of virtual address space, and be bad.
-   */
-   VG_(read_procselfmaps)();
+#  define HELPER(name) \
+   VGOFF_(helper_##name) = alloc_BaB_1_set( (Addr) & VG_(helper_##name))
 
-   /* Setup stuff that depends on the skin.  Must be before:
-      - vg_init_baseBlock(): to register helpers
-      - process_cmd_line_options(): to register skin name and description,
-        and turn on/off 'command_line_options' need
-      - init_memory() (to setup memory event trackers).
-   */
-   (*tool_init)();
-   VG_(tool_init_dlsym)(tool_dlhandle);
+   /* Helper functions. */
+   HELPER(idiv_64_32);     HELPER(div_64_32);
+   HELPER(idiv_32_16);     HELPER(div_32_16);
+   HELPER(idiv_16_8);      HELPER(div_16_8);
 
-   VG_(sanity_check_needs)();
+   HELPER(imul_32_64);     HELPER(mul_32_64);
+   HELPER(imul_16_32);     HELPER(mul_16_32);
+   HELPER(imul_8_16);      HELPER(mul_8_16);
 
-   /* Process Valgrind's command-line opts */
-   process_cmd_line_options(kp);
+   HELPER(CLD);            HELPER(STD);
+   HELPER(get_dirflag);
 
-   /* Hook to delay things long enough so we can get the pid and
-      attach GDB in another shell. */
-   if (VG_(clo_wait_for_gdb)) {
-      VG_(printf)("pid=%d\n", VG_(getpid)());
-      /* do "jump *$eip" to skip this in gdb */
-      VG_(do_syscall)(__NR_pause);
-   }
+   HELPER(CLC);            HELPER(STC);
 
-   /* Do post command-line processing initialisation.  Must be before:
-      - vg_init_baseBlock(): to register any more helpers
-   */
-   SK_(post_clo_init)();
+   HELPER(shldl);          HELPER(shldw);
+   HELPER(shrdl);          HELPER(shrdw);
 
-   /* Set up baseBlock offsets and copy the saved machine's state into it. */
-   vg_init_baseBlock();
+   HELPER(RDTSC);          HELPER(CPUID);
 
-   /* Search for file descriptors that are inherited from our parent. */
-   if (VG_(clo_track_fds))
-      VG_(init_preopened_fds)();
+   HELPER(bsf);            HELPER(bsr);
 
-   /* Initialise the scheduler, and copy the client's state from
-      baseBlock into VG_(threads)[1].  Must be before:
-      - VG_(sigstartup_actions)()
-   */
-   VG_(scheduler_init)();
+   HELPER(fstsw_AX);
+   HELPER(SAHF);           HELPER(LAHF);
+   HELPER(DAS);            HELPER(DAA);
+   HELPER(IN);             HELPER(OUT);
+   HELPER(cmpxchg8b);
 
-   /* Set up the ProxyLWP machinery */
-   VG_(proxy_init)();
+   HELPER(undefined_instruction);
 
-   /* Initialise the signal handling subsystem, temporarily parking
-      the saved blocking-mask in saved_sigmask. */
-   VG_(sigstartup_actions)();
+#  undef HELPER
 
-   /* Perhaps we're profiling Valgrind? */
-   if (VG_(clo_profile))
-      VGP_(init_profiling)();
+   /* Allocate slots for noncompact helpers */
+   assign_helpers_in_baseBlock(VG_(n_noncompact_helpers), 
+                               VG_(noncompact_helper_offsets), 
+                               VG_(noncompact_helper_addrs));
+}
 
-   /* Start calibration of our RDTSC-based clock. */
-   VG_(start_rdtsc_calibration)();
 
-   /* Parse /proc/self/maps to learn about startup segments. */
-   VGP_PUSHCC(VgpInitMem);
-   VG_(init_memory)();
-   VGP_POPCC(VgpInitMem);
+/*====================================================================*/
+/*=== Setup pointercheck                                           ===*/
+/*====================================================================*/
 
-   /* Read the list of errors to suppress.  This should be found in
-      the file specified by vg_clo_suppressions. */
-   if (VG_(needs).core_errors || VG_(needs).skin_errors)
-      VG_(load_suppressions)();
-
-   /* End calibration of our RDTSC-based clock, leaving it as long as
-      we can. */
-   VG_(end_rdtsc_calibration)();
-
-   /* Initialise translation table and translation cache. */
-   VG_(init_tt_tc)();
-
-   if (VG_(clo_verbosity) == 1) {
-      VG_(message)(Vg_UserMsg, 
-                   "For more details, rerun with: -v");
-   }
-
-   /* Force a read of the debug info so that we can look for 
-      glibc entry points to intercept. */
-   VG_(setup_code_redirect_table)();
-
-   /* Now it is safe for malloc et al in vg_clientmalloc.c to act
-      instrumented-ly. */
-   if (VG_(clo_verbosity) > 0)
-      VG_(message)(Vg_UserMsg, "");
-
-   VG_(bbs_to_go) = VG_(clo_stop_after);
+static void setup_pointercheck(void)
+{
+   int ret;
 
    if (VG_(clo_pointercheck)) {
-      vki_modify_ldt_t ldt = { VG_POINTERCHECK_SEGIDX,
-			       VG_(client_base),
-			       (VG_(client_end)-VG_(client_base)) / VKI_BYTES_PER_PAGE,
-			       1,		/* 32 bit */
-			       0,		/* contents: data, RW, non-expanding */
-			       0,		/* not read-exec only */
-			       1,		/* limit in pages */
-			       0,		/* !seg not present */
-			       1,		/* usable */
+      vki_modify_ldt_t ldt = { 
+         VG_POINTERCHECK_SEGIDX,    // entry_number
+         VG_(client_base),          // base_addr
+         (VG_(client_end)-VG_(client_base)) / VKI_BYTES_PER_PAGE, // limit
+         1,                         // seg_32bit
+         0,                         // contents: data, RW, non-expanding
+         0,                         // ! read_exec_only
+         1,                         // limit_in_pages
+         0,                         // ! seg not present
+         1,                         // useable
       };
-      Int ret = VG_(do_syscall)(__NR_modify_ldt, 1, &ldt, sizeof(ldt));
-
+      ret = VG_(do_syscall)(__NR_modify_ldt, 1, &ldt, sizeof(ldt));
       if (ret < 0) {
 	 VG_(message)(Vg_UserMsg,
 		      "Warning: ignoring --pointercheck=yes, "
@@ -1573,306 +2498,83 @@
 	 VG_(clo_pointercheck) = False;
       }
    }
-
-   /* Run! */
-   VG_(running_on_simd_CPU) = True;
-   VGP_PUSHCC(VgpSched);
-
-   if (__builtin_setjmp(&VG_(fatal_signal_jmpbuf)) == 0) {
-      VG_(fatal_signal_set) = True;
-      src = VG_(scheduler)();
-   } else
-      src = VgSrc_FatalSig;
-
-   VGP_POPCC(VgpSched);
-   VG_(running_on_simd_CPU) = False;
-
-   if (VG_(clo_verbosity) > 0)
-      VG_(message)(Vg_UserMsg, "");
-
-   if (src == VgSrc_Deadlock) {
-     VG_(message)(Vg_UserMsg, 
-        "Warning: pthread scheduler exited due to deadlock");
-   }
-
-   /* Print out file descriptor summary and stats. */
-   if(VG_(clo_track_fds))
-      VG_(fd_stats)();
-
-   if (VG_(needs).core_errors || VG_(needs).skin_errors)
-      VG_(show_all_errors)();
-
-   SK_(fini)( VG_(exitcode) );
-
-   VG_(do_sanity_checks)( True /*include expensive checks*/ );
-
-   if (VG_(clo_verbosity) > 1)
-      vg_show_counts();
-
-   if (VG_(clo_verbosity) > 3)
-      VG_(print_UInstr_histogram)();
-
-   if (0) {
-      VG_(message)(Vg_DebugMsg, "");
-      VG_(message)(Vg_DebugMsg, 
-         "------ Valgrind's internal memory use stats follow ------" );
-      VG_(mallocSanityCheckAll)();
-      VG_(show_all_arena_stats)();
-      VG_(message)(Vg_DebugMsg, 
-         "------ Valgrind's ExeContext management stats follow ------" );
-      VG_(show_ExeContext_stats)();
-   }
- 
-   if (VG_(clo_profile))
-      VGP_(done_profiling)();
-
-   VG_(shutdown_logging)();
-
-   /* We're exiting, so nuke all the threads and clean up the proxy LWPs */
-   vg_assert(src == VgSrc_FatalSig ||
-	     VG_(threads)[VG_(last_run_tid)].status == VgTs_Runnable ||
-	     VG_(threads)[VG_(last_run_tid)].status == VgTs_WaitJoiner);
-   VG_(nuke_all_threads_except)(VG_INVALID_THREADID);
-
-   /* Decide how to exit.  This depends on what the scheduler
-      returned. */
-  
-   switch (src) {
-      case VgSrc_ExitSyscall: /* the normal way out */
-         vg_assert(VG_(last_run_tid) > 0 
-                   && VG_(last_run_tid) < VG_N_THREADS);
-	 VG_(proxy_shutdown)();
-
-         /* The thread's %EBX at the time it did __NR_exit() will hold
-            the arg to __NR_exit(), so we just do __NR_exit() with
-            that arg. */
-         VG_(exit)( VG_(exitcode) );
-         /* NOT ALIVE HERE! */
-         VG_(core_panic)("entered the afterlife in vg_main() -- ExitSyscall");
-         break; /* what the hell :) */
-
-      case VgSrc_Deadlock:
-         /* Just exit now.  No point in continuing. */
-	 VG_(proxy_shutdown)();
-         VG_(exit)(0);
-         VG_(core_panic)("entered the afterlife in vg_main() -- Deadlock");
-         break;
-
-      case VgSrc_BbsDone: 
-         /* Tricky; we have to try and switch back to the real CPU.
-            This is all very dodgy and won't work at all in the
-            presence of threads, or if the client happened to be
-            running a signal handler. */
-         /* Prepare to restore state to the real CPU. */
-         VG_(sigshutdown_actions)();
-         VG_(load_thread_state)(1 /* root thread */ );
-         VG_(copy_baseBlock_to_m_state_static)();
-
-	 VG_(proxy_shutdown)();
-
-         /* This pushes a return address on the simulator's stack,
-            which is abandoned.  We call vg_sigshutdown_actions() at
-            the end of vg_switch_to_real_CPU(), so as to ensure that
-            the original stack and machine state is restored before
-            the real signal mechanism is restored.  */
-         VG_(switch_to_real_CPU)();
-
-      case VgSrc_FatalSig:
-	 /* We were killed by a fatal signal, so replicate the effect */
-	 vg_assert(VG_(fatal_sigNo) != -1);
-	 VG_(kill_self)(VG_(fatal_sigNo));
-	 VG_(core_panic)("vg_main(): signal was supposed to be fatal");
-	 break;
-
-      default:
-         VG_(core_panic)("vg_main(): unexpected scheduler return code");
-   }
 }
 
+/*====================================================================*/
+/*===  Initialise program data/text, etc.                          ===*/
+/*====================================================================*/
 
-/* Debugging thing .. can be called from assembly with OYNK macro. */
-void VG_(oynk) ( Int n )
+static void build_valgrind_map_callback 
+      ( Addr start, UInt size, Char rr, Char ww, Char xx, 
+        UInt dev, UInt ino, ULong foffset, const UChar* filename )
 {
-   OINK(n);
+   UInt prot  = 0;
+   UInt flags = SF_MMAP|SF_NOSYMS;
+   Bool is_stack_segment;
+
+   is_stack_segment = 
+      (start == VG_(clstk_base) && (start+size) == VG_(clstk_end));
+
+   /* Only record valgrind mappings for now, without loading any
+      symbols.  This is so we know where the free space is before we
+      start allocating more memory (note: heap is OK, it's just mmap
+      which is the problem here). */
+   if (start >= VG_(valgrind_base) && (start+size) <= VG_(valgrind_end)) {
+      flags |= SF_VALGRIND;
+      VG_(map_file_segment)(start, size, prot, flags, dev, ino, foffset, filename);
+   }
 }
 
+// Global var used to pass local data to callback
+Addr esp_at_startup___global_arg = 0;
 
-/* Walk through a colon-separated environment variable, and remove the
-   entries which matches file_pattern.  It slides everything down over
-   the removed entries, and pads the remaining space with '\0'.  It
-   modifies the entries in place (in the client address space), but it
-   shouldn't matter too much, since we only do this just before an
-   execve().
-
-   This is also careful to mop up any excess ':'s, since empty strings
-   delimited by ':' are considered to be '.' in a path.
-*/
-void VG_(mash_colon_env)(Char *varp, const Char *remove_pattern)
+static void build_segment_map_callback 
+      ( Addr start, UInt size, Char rr, Char ww, Char xx,
+        UInt dev, UInt ino, ULong foffset, const UChar* filename )
 {
-   Char *const start = varp;
-   Char *entry_start = varp;
-   Char *output = varp;
+   UInt prot = 0;
+   UInt flags;
+   Bool is_stack_segment;
+   Addr r_esp;
 
-   if (varp == NULL)
-      return;
+   is_stack_segment 
+      = (start == VG_(clstk_base) && (start+size) == VG_(clstk_end));
 
-   while(*varp) {
-      if (*varp == ':') {
-	 Char prev;
-	 Bool match;
+   if (rr == 'r') prot |= VKI_PROT_READ;
+   if (ww == 'w') prot |= VKI_PROT_WRITE;
+   if (xx == 'x') prot |= VKI_PROT_EXEC;
 
-	 /* This is a bit subtle: we want to match against the entry
-	    we just copied, because it may have overlapped with
-	    itself, junking the original. */
+   if (is_stack_segment)
+      flags = SF_STACK | SF_GROWDOWN;
+   else
+      flags = SF_EXEC|SF_MMAP;
 
-	 prev = *output;
-	 *output = '\0';
+   if (filename != NULL)
+      flags |= SF_FILE;
 
-	 match = VG_(string_match)(remove_pattern, entry_start);
+   if (start >= VG_(valgrind_base) && (start+size) <= VG_(valgrind_end))
+      flags |= SF_VALGRIND;
 
-	 *output = prev;
-	 
-	 if (match) {
-	    output = entry_start;
-	    varp++;			/* skip ':' after removed entry */
-	 } else
-	    entry_start = output+1;	/* entry starts after ':' */
-      }
+   VG_(map_file_segment)(start, size, prot, flags, dev, ino, foffset, filename);
 
-      *output++ = *varp++;
-   }
+   if (VG_(is_client_addr)(start) && VG_(is_client_addr)(start+size-1))
+      VG_TRACK( new_mem_startup, start, size, rr=='r', ww=='w', xx=='x' );
 
-   /* match against the last entry */
-   if (VG_(string_match)(remove_pattern, entry_start)) {
-      output = entry_start;
-      if (output > start) {
-	 /* remove trailing ':' */
-	 output--;
-	 vg_assert(*output == ':');
-      }
-   }	 
-
-   /* pad out the left-overs with '\0' */
-   while(output < varp)
-      *output++ = '\0';
-}
-
-/* Start GDB and get it to attach to this process.  Called if the user
-   requests this service after an error has been shown, so she can
-   poke around and look at parameters, memory, etc.  You can't
-   meaningfully get GDB to continue the program, though; to continue,
-   quit GDB.  */
-void VG_(start_GDB) ( Int tid )
-{
-   Int pid;
-
-   if ((pid = fork()) == 0)
-   {
-      ptrace(PTRACE_TRACEME, 0, NULL, NULL);
-      VG_(kkill)(VG_(getpid)(), VKI_SIGSTOP);
-   }
-   else if (pid > 0) 
-   {
-      struct user_regs_struct regs;
-      Int status;
-      Int res;
-
-      if (VG_(is_running_thread)( tid )) {
-         regs.xcs = VG_(baseBlock)[VGOFF_(m_cs)];
-         regs.xss = VG_(baseBlock)[VGOFF_(m_ss)];
-         regs.xds = VG_(baseBlock)[VGOFF_(m_ds)];
-         regs.xes = VG_(baseBlock)[VGOFF_(m_es)];
-         regs.xfs = VG_(baseBlock)[VGOFF_(m_fs)];
-         regs.xgs = VG_(baseBlock)[VGOFF_(m_gs)];
-         regs.eax = VG_(baseBlock)[VGOFF_(m_eax)];
-         regs.ebx = VG_(baseBlock)[VGOFF_(m_ebx)];
-         regs.ecx = VG_(baseBlock)[VGOFF_(m_ecx)];
-         regs.edx = VG_(baseBlock)[VGOFF_(m_edx)];
-         regs.esi = VG_(baseBlock)[VGOFF_(m_esi)];
-         regs.edi = VG_(baseBlock)[VGOFF_(m_edi)];
-         regs.ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
-         regs.esp = VG_(baseBlock)[VGOFF_(m_esp)];
-         regs.eflags = VG_(baseBlock)[VGOFF_(m_eflags)];
-         regs.eip = VG_(baseBlock)[VGOFF_(m_eip)];
-      } else {
-         ThreadState* tst = & VG_(threads)[ tid ];
-         
-         regs.xcs = tst->m_cs;
-         regs.xss = tst->m_ss;
-         regs.xds = tst->m_ds;
-         regs.xes = tst->m_es;
-         regs.xfs = tst->m_fs;
-         regs.xgs = tst->m_gs;
-         regs.eax = tst->m_eax;
-         regs.ebx = tst->m_ebx;
-         regs.ecx = tst->m_ecx;
-         regs.edx = tst->m_edx;
-         regs.esi = tst->m_esi;
-         regs.edi = tst->m_edi;
-         regs.ebp = tst->m_ebp;
-         regs.esp = tst->m_esp;
-         regs.eflags = tst->m_eflags;
-         regs.eip = tst->m_eip;
-      }
-
-      if ((res = VG_(waitpid)(pid, &status, 0)) == pid &&
-          WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP &&
-          ptrace(PTRACE_SETREGS, pid, NULL, &regs) == 0 &&
-          ptrace(PTRACE_DETACH, pid, NULL, SIGSTOP) == 0) {
-         UChar buf[VG_(strlen)(VG_(clo_GDB_path)) + 100];
-
-         VG_(sprintf)(buf, "%s -nw /proc/%d/fd/%d %d",
-                      VG_(clo_GDB_path), VG_(main_pid), VG_(clexecfd), pid);
-         VG_(message)(Vg_UserMsg, "starting GDB with cmd: %s", buf);
-         res = VG_(system)(buf);
-         if (res == 0) {      
-            VG_(message)(Vg_UserMsg, "");
-            VG_(message)(Vg_UserMsg, 
-                         "GDB has detached.  Valgrind regains control.  We continue.");
-         } else {
-            VG_(message)(Vg_UserMsg, "Apparently failed!");
-            VG_(message)(Vg_UserMsg, "");
-         }
-      }
-
-      VG_(kkill)(pid, VKI_SIGKILL);
-      VG_(waitpid)(pid, &status, 0);
+   /* If this is the stack segment, mark everything below %esp as noaccess. */
+   r_esp = esp_at_startup___global_arg;
+   vg_assert(0 != r_esp);
+   if (is_stack_segment) {
+      if (0)
+         VG_(message)(Vg_DebugMsg, "invalidating stack area: %x .. %x",
+                      start,r_esp);
+      VG_TRACK( die_mem_stack, start, r_esp-start );
    }
 }
 
 
-/* Print some helpful-ish text about unimplemented things, and give
-   up. */
-void VG_(unimplemented) ( Char* msg )
-{
-   VG_(message)(Vg_UserMsg, "");
-   VG_(message)(Vg_UserMsg, 
-      "Valgrind detected that your program requires");
-   VG_(message)(Vg_UserMsg, 
-      "the following unimplemented functionality:");
-   VG_(message)(Vg_UserMsg, "   %s", msg);
-   VG_(message)(Vg_UserMsg,
-      "This may be because the functionality is hard to implement,");
-   VG_(message)(Vg_UserMsg,
-      "or because no reasonable program would behave this way,");
-   VG_(message)(Vg_UserMsg,
-      "or because nobody has yet needed it.  In any case, let us know at");
-   VG_(message)(Vg_UserMsg,
-      "%s and/or try to work around the problem, if you can.", VG_BUGS_TO);
-   VG_(message)(Vg_UserMsg,
-      "");
-   VG_(message)(Vg_UserMsg,
-      "Valgrind has to exit now.  Sorry.  Bye!");
-   VG_(message)(Vg_UserMsg,
-      "");
-   VG_(pp_sched_status)();
-   VG_(exit)(1);
-}
-
-
-/* ---------------------------------------------------------------------
-   Sanity check machinery (permanently engaged).
-   ------------------------------------------------------------------ */
+/*====================================================================*/
+/*=== Sanity check machinery (permanently engaged)                 ===*/
+/*====================================================================*/
 
 /* A fast sanity check -- suitable for calling circa once per
    millisecond. */
@@ -1938,6 +2640,477 @@
    }
    VGP_POPCC(VgpCoreCheapSanity);
 }
+
+
+/*====================================================================*/
+/*=== main()                                                       ===*/
+/*====================================================================*/
+
+int main(int argc, char **argv)
+{
+   char **cl_argv;
+   const char *tool = NULL;
+   const char *exec = NULL;
+   char *preload;          /* tool-specific LD_PRELOAD .so */
+   char **env;
+   Bool need_help = False;
+   struct exeinfo info;
+   ToolInfo *toolinfo = NULL;
+   void *tool_dlhandle;
+   Addr client_eip;
+   Addr esp_at_startup;    /* client's %esp at the point we gained control. */
+   UInt * client_auxv;
+   VgSchedReturnCode src;
+
+   //============================================================
+   // Nb: startup is complex.  Prerequisites are shown at every step.
+   //
+   // *** Be very careful when messing with the order ***
+   //============================================================
+
+   //--------------------------------------------------------------
+   // Check we were launched by stage1
+   //   p: n/a  [must be first step]
+   //--------------------------------------------------------------
+   scan_auxv();
+
+   if (0) {
+      int prmap(void *start, void *end, const char *perm, off_t off, 
+                int maj, int min, int ino) {
+         printf("mapping %10p-%10p %s %02x:%02x %d\n",
+                start, end, perm, maj, min, ino);
+         return True;
+      }
+      printf("========== main() ==========\n");
+      foreach_map(prmap);
+   }
+
+   //--------------------------------------------------------------
+   // Look for alternative libdir                                  
+   //   p: n/a
+   //--------------------------------------------------------------
+   {  char *cp = getenv(VALGRINDLIB);
+      if (cp != NULL)
+	 VG_(libdir) = cp;
+   }
+
+   //--------------------------------------------------------------
+   // Begin working out address space layout
+   //   p: n/a
+   //--------------------------------------------------------------
+   layout_client_space( (Addr) & argc );
+
+   //--------------------------------------------------------------
+   // Get valgrind args + client args (inc. from VALGRIND_OPTS/.valgrindrc).
+   // Pre-process the command line.
+   //   p: n/a
+   //--------------------------------------------------------------
+   get_command_line(argc, argv, &VG_(vg_argc), &VG_(vg_argv), &cl_argv);
+   pre_process_cmd_line_options(&need_help, &tool, &exec);
+
+   //==============================================================
+   // Nb: once a tool is specified, the tool.so must be loaded even if
+   // the user specified --help or didn't specify a client program.
+   //==============================================================
+
+   //--------------------------------------------------------------
+   // With client padded out, map in tool
+   //   p: layout_client_space()          [for padding]
+   //   p: set-libdir                     [for VG_(libdir)]
+   //   p: pre_process_cmd_line_options() [for 'tool']
+   //--------------------------------------------------------------
+   load_tool(tool, &tool_dlhandle, &toolinfo, &preload);
+
+   //==============================================================
+   // Can use VG_(malloc)() and VG_(arena_malloc)() only after load_tool()
+   // -- redzone size is now set.
+   //==============================================================
+   
+   //--------------------------------------------------------------
+   // Finalise address space layout
+   //   p: layout_client_space(), load_tool()           [for 'toolinfo']
+   //--------------------------------------------------------------
+   layout_remaining_space( toolinfo->shadow_ratio );
+
+   //--------------------------------------------------------------
+   // Load client executable, finding in $PATH if necessary
+   //   p: layout_client_space()           [so there's space]
+   //   p: pre_process_cmd_line_options()  [for 'exec', 'need_help']
+   //   p: layout_remaining_space          [so there's space]
+   //--------------------------------------------------------------
+   load_client(cl_argv, exec, /*inout*/&need_help, &info, &client_eip);
+
+   //--------------------------------------------------------------
+   // Everything in place, unpad us
+   //   p: layout_remaining_space()  [everything must be mapped in before now]  
+   //   p: load_client()             [ditto] 
+   //--------------------------------------------------------------
+   as_unpad((void *)VG_(shadow_end), (void *)~0);
+   as_closepadfile();		/* no more padding */
+
+   //--------------------------------------------------------------
+   // Set up client's environment
+   //   p: set-libdir  [for VG_(libdir)]
+   //   p: load_tool() [for 'preload']
+   //--------------------------------------------------------------
+   env = fix_environment(environ, preload);
+
+   //--------------------------------------------------------------
+   // Setup client stack and eip 
+   //   p: load_client()     [for 'info']
+   //   p: fix_environment() [for 'env']
+   //--------------------------------------------------------------
+   esp_at_startup = setup_client_stack(cl_argv, env, &info, &client_auxv);
+
+   if (0)
+      printf("entry=%x client esp=%x vg_argc=%d brkbase=%x\n",
+	     client_eip, esp_at_startup, VG_(vg_argc), VG_(brk_base));
+
+   //==============================================================
+   // Finished setting up operating environment.  Now initialise
+   // Valgrind.  (This is where the old VG_(main)() started.)
+   //==============================================================
+
+   //--------------------------------------------------------------
+   // Read /proc/self/maps into a buffer
+   //   p: all memory layout, environment setup   [so memory maps are right]
+   //--------------------------------------------------------------
+   VG_(read_procselfmaps)();
+
+   //--------------------------------------------------------------
+   // atfork
+   //   p: n/a
+   //--------------------------------------------------------------
+   VG_(atfork)(NULL, NULL, newpid);
+   newpid(VG_INVALID_THREADID);
+
+   //--------------------------------------------------------------
+   // setup file descriptors
+   //   p: n/a
+   //--------------------------------------------------------------
+   setup_file_descriptors();
+
+   //--------------------------------------------------------------
+   // Setup tool
+   //   p: VG_(read_procselfmaps)()  [so if sk_pre_clo_init calls
+   //        VG_(malloc), any mmap'd superblocks aren't erroneously
+   //        identified later as being owned by the client]
+   // XXX: is that necessary, now that we look for V's segments separately?
+   // XXX: alternatively, if sk_pre_clo_init does use VG_(malloc)(), is it
+   //      wrong to ignore any segments that it might add in parse_procselfmaps?
+   //--------------------------------------------------------------
+   (*toolinfo->sk_pre_clo_init)();
+   VG_(tool_init_dlsym)(tool_dlhandle);
+   VG_(sanity_check_needs)();
+
+   //--------------------------------------------------------------
+   // Process Valgrind's + tool's command-line options
+   //   p: load_tool()               [for 'tool']
+   //   p: load_client()             [for 'need_help']
+   //   p: setup_file_descriptors()  [for 'VG_(max_fd)']
+   //   p: sk_pre_clo_init           [to set 'command_line_options' need]
+   //--------------------------------------------------------------
+   process_cmd_line_options(client_auxv, esp_at_startup, tool, need_help);
+
+   //--------------------------------------------------------------
+   // Allow GDB attach
+   //   p: process_cmd_line_options()  [for VG_(clo_wait_for_gdb)]
+   //--------------------------------------------------------------
+   /* Hook to delay things long enough so we can get the pid and
+      attach GDB in another shell. */
+   if (VG_(clo_wait_for_gdb)) {
+      VG_(printf)("pid=%d\n", VG_(getpid)());
+      /* do "jump *$eip" to skip this in gdb */
+      VG_(do_syscall)(__NR_pause);
+   }
+
+   //--------------------------------------------------------------
+   // Setup tool, post command-line processing
+   //   p: process_cmd_line_options  [tool assumes it]
+   //--------------------------------------------------------------
+   SK_(post_clo_init)();
+
+   //--------------------------------------------------------------
+   // Set up baseBlock, copy machine state (m_state_static)
+   //   p: {pre,post}_clo_init()  [for tool helper registration]
+   //   p: load_client()          [for 'client_eip']
+   //   p: setup_client_stack()   [for 'esp_at_startup']
+   //--------------------------------------------------------------
+   init_baseBlock(client_eip, esp_at_startup);
+
+   //--------------------------------------------------------------
+   // Search for file descriptors that are inherited from our parent
+   //   p: process_cmd_line_options  [for VG_(clo_track_fds)]
+   //--------------------------------------------------------------
+   if (VG_(clo_track_fds))
+      VG_(init_preopened_fds)();
+
+   //--------------------------------------------------------------
+   // Initialise the scheduler
+   //   p: init_baseBlock()  [baseBlock regs copied into VG_(threads)[1]]
+   //   p: setup_file_descriptors() [else VG_(safe_fd)() breaks]
+   //--------------------------------------------------------------
+   VG_(scheduler_init)();
+
+   //--------------------------------------------------------------
+   // Set up the ProxyLWP machinery
+   //   p: VG_(scheduler_init)()?  [XXX: subtle dependency?]
+   // - subs: VG_(sigstartup_actions)()?
+   //--------------------------------------------------------------
+   VG_(proxy_init)();
+
+   //--------------------------------------------------------------
+   // Initialise the signal handling subsystem
+   //   p: VG_(atfork)(NULL, NULL, newpid) [else problems with sigmasks]
+   //   p: VG_(proxy_init)()               [else breaks...]
+   //--------------------------------------------------------------
+   // Nb: temporarily parks the saved blocking-mask in saved_sigmask.
+   VG_(sigstartup_actions)();
+
+   //--------------------------------------------------------------
+   // Perhaps we're profiling Valgrind?
+   //   p: process_cmd_line_options()  [for VG_(clo_profile)]
+   //   p: others?
+   //
+   // XXX: this seems to be broken?   It always says the tool wasn't built
+   // for profiling;  vg_profile.c's functions don't seem to be overriding
+   // vg_dummy_profile.c's?
+   //
+   // XXX: want this as early as possible.  Looking for --profile
+   // in pre_process_cmd_line_options() could get it earlier.
+   //--------------------------------------------------------------
+   if (VG_(clo_profile))
+      VGP_(init_profiling)();
+
+   VGP_PUSHCC(VgpStartup);
+
+   //--------------------------------------------------------------
+   // Start calibration of our RDTSC-based clock
+   //   p: n/a
+   //--------------------------------------------------------------
+   VG_(start_rdtsc_calibration)();
+
+   //--------------------------------------------------------------
+   // Reserve Valgrind's kickstart, heap and stack
+   //   p: XXX ???
+   //--------------------------------------------------------------
+   VG_(map_segment)(VG_(valgrind_mmap_end),
+                    VG_(valgrind_end)-VG_(valgrind_mmap_end),
+                    VKI_PROT_NONE, SF_VALGRIND|SF_FIXED);
+
+   //--------------------------------------------------------------
+   // Identify Valgrind's segments
+   //   p: read proc/self/maps
+   //   p: VG_(map_segment)   [XXX ???]
+   //   p: sk_pre_clo_init()  [to setup new_mem_startup tracker]
+   //--------------------------------------------------------------
+   VG_(parse_procselfmaps) ( build_valgrind_map_callback );
+
+   // XXX: I can't see why these two need to be separate;  could they be
+   // folded together?  If not, need a comment explaining why.
+   //
+   // XXX: can we merge reading and parsing of /proc/self/maps?
+   //
+   // XXX: can we dynamically allocate the /proc/self/maps buffer? (or mmap
+   //      it?)  Or does that disturb its contents...
+
+   //--------------------------------------------------------------
+   // Build segment map (all segments)
+   //   p: setup_client_stack()  [for 'esp_at_startup']
+   //--------------------------------------------------------------
+   esp_at_startup___global_arg = esp_at_startup;
+   VG_(parse_procselfmaps) ( build_segment_map_callback );  /* everything */
+   esp_at_startup___global_arg = 0;
+   
+   //==============================================================
+   // Can only use VG_(map)() after VG_(map_segment)()  [XXX ???]
+   //==============================================================
+
+   //--------------------------------------------------------------
+   // Initialise the client trampoline page
+   //   p: XXX ???
+   //--------------------------------------------------------------
+   /* Initialize our trampoline page (which is also sysinfo stuff) */
+   VG_(memcpy)( (void *)VG_(client_trampoline_code),
+                &VG_(trampoline_code_start), VG_(trampoline_code_length) );
+   VG_(mprotect)( (void *)VG_(client_trampoline_code),
+                 VG_(trampoline_code_length), VKI_PROT_READ|VKI_PROT_EXEC );
+
+   //--------------------------------------------------------------
+   // Read suppression file
+   //   p: process_cmd_line_options()  [for VG_(clo_suppressions)]
+   //--------------------------------------------------------------
+   if (VG_(needs).core_errors || VG_(needs).skin_errors)
+      VG_(load_suppressions)();
+
+   //--------------------------------------------------------------
+   // End calibrating our RDTSC-based clock, having waited a while.
+   //   p: VG_(start_rdtsc_calibration)()  [obviously]
+   //--------------------------------------------------------------
+   // Nb: Don't have to wait very long;  it does pretty well even if
+   // start_rdtsc_calibration() is immediately before this.
+   VG_(end_rdtsc_calibration)();
+
+   //--------------------------------------------------------------
+   // Initialise translation table and translation cache
+   //   p: read_procselfmaps  [so the anonymous mmaps for the TT/TC
+   //         aren't identified as part of the client, which would waste
+   //         > 20M of virtual address space.]
+   //--------------------------------------------------------------
+   VG_(init_tt_tc)();
+
+   //--------------------------------------------------------------
+   // Read debug info to find glibc entry points to intercept
+   //   p: parse_procselfmaps? [XXX for debug info?]
+   //   p: init_tt_tc?  [XXX ???]
+   //--------------------------------------------------------------
+   VG_(setup_code_redirect_table)();
+
+   //--------------------------------------------------------------
+   // Verbosity message
+   //   p: end_rdtsc_calibration [so startup message is printed first]
+   //--------------------------------------------------------------
+   if (VG_(clo_verbosity) == 1)
+      VG_(message)(Vg_UserMsg, "For more details, rerun with: -v");
+   if (VG_(clo_verbosity) > 0)
+      VG_(message)(Vg_UserMsg, "");
+
+   //--------------------------------------------------------------
+   // Setup pointercheck
+   //   p: process_cmd_line_options() [for VG_(clo_pointercheck)]
+   //--------------------------------------------------------------
+   setup_pointercheck();
+
+
+
+   //--------------------------------------------------------------
+   // Run!
+   //--------------------------------------------------------------
+   VG_(running_on_simd_CPU) = True;
+   VGP_POPCC(VgpStartup);
+   VGP_PUSHCC(VgpSched);
+
+   if (__builtin_setjmp(&VG_(fatal_signal_jmpbuf)) == 0) {
+      VG_(fatal_signal_set) = True;
+      src = VG_(scheduler)();
+   } else
+      src = VgSrc_FatalSig;
+
+   VGP_POPCC(VgpSched);
+   VG_(running_on_simd_CPU) = False;
+
+
+
+   //--------------------------------------------------------------
+   // Finalisation: cleanup, messages, etc.  Order not so important, only
+   // affects what order the messages come.
+   //--------------------------------------------------------------
+   if (VG_(clo_verbosity) > 0)
+      VG_(message)(Vg_UserMsg, "");
+
+   if (src == VgSrc_Deadlock) {
+     VG_(message)(Vg_UserMsg, 
+        "Warning: pthread scheduler exited due to deadlock");
+   }
+
+   /* Print out file descriptor summary and stats. */
+   if (VG_(clo_track_fds))
+      VG_(fd_stats)();
+
+   if (VG_(needs).core_errors || VG_(needs).skin_errors)
+      VG_(show_all_errors)();
+
+   SK_(fini)( VG_(exitcode) );
+
+   VG_(do_sanity_checks)( True /*include expensive checks*/ );
+
+   if (VG_(clo_verbosity) > 1)
+      show_counts();
+
+   if (VG_(clo_verbosity) > 3)
+      VG_(print_UInstr_histogram)();
+
+   if (0) {
+      VG_(message)(Vg_DebugMsg, "");
+      VG_(message)(Vg_DebugMsg, 
+         "------ Valgrind's internal memory use stats follow ------" );
+      VG_(mallocSanityCheckAll)();
+      VG_(show_all_arena_stats)();
+      VG_(message)(Vg_DebugMsg, 
+         "------ Valgrind's ExeContext management stats follow ------" );
+      VG_(show_ExeContext_stats)();
+   }
+ 
+   if (VG_(clo_profile))
+      VGP_(done_profiling)();
+
+   /* Must be after all messages are done */
+   VG_(shutdown_logging)();
+
+   /* We're exiting, so nuke all the threads and clean up the proxy LWPs */
+   vg_assert(src == VgSrc_FatalSig ||
+	     VG_(threads)[VG_(last_run_tid)].status == VgTs_Runnable ||
+	     VG_(threads)[VG_(last_run_tid)].status == VgTs_WaitJoiner);
+   VG_(nuke_all_threads_except)(VG_INVALID_THREADID);
+
+   //--------------------------------------------------------------
+   // Exit, according to the scheduler's return code
+   //--------------------------------------------------------------
+   switch (src) {
+      case VgSrc_ExitSyscall: /* the normal way out */
+         vg_assert(VG_(last_run_tid) > 0 
+                   && VG_(last_run_tid) < VG_N_THREADS);
+	 VG_(proxy_shutdown)();
+
+         /* The thread's %EBX at the time it did __NR_exit() will hold
+            the arg to __NR_exit(), so we just do __NR_exit() with
+            that arg. */
+         VG_(exit)( VG_(exitcode) );
+         /* NOT ALIVE HERE! */
+         VG_(core_panic)("entered the afterlife in main() -- ExitSyscall");
+         break; /* what the hell :) */
+
+      case VgSrc_Deadlock:
+         /* Just exit now.  No point in continuing. */
+	 VG_(proxy_shutdown)();
+         VG_(exit)(0);
+         VG_(core_panic)("entered the afterlife in main() -- Deadlock");
+         break;
+
+      case VgSrc_BbsDone: 
+         /* Tricky; we have to try and switch back to the real CPU.
+            This is all very dodgy and won't work at all in the
+            presence of threads, or if the client happened to be
+            running a signal handler. */
+         /* Prepare to restore state to the real CPU. */
+         VG_(sigshutdown_actions)();
+         VG_(load_thread_state)(1 /* root thread */ );
+         copy_baseBlock_to_m_state_static();
+
+	 VG_(proxy_shutdown)();
+
+         /* This pushes a return address on the simulator's stack,
+            which is abandoned.  We call vg_sigshutdown_actions() at
+            the end of vg_switch_to_real_CPU(), so as to ensure that
+            the original stack and machine state is restored before
+            the real signal mechanism is restored.  */
+         VG_(switch_to_real_CPU)();
+
+      case VgSrc_FatalSig:
+	 /* We were killed by a fatal signal, so replicate the effect */
+	 vg_assert(VG_(fatal_sigNo) != -1);
+	 VG_(kill_self)(VG_(fatal_sigNo));
+	 VG_(core_panic)("main(): signal was supposed to be fatal");
+	 break;
+
+      default:
+         VG_(core_panic)("main(): unexpected scheduler return code");
+   }
+
+   abort();
+}
+
+
 /*--------------------------------------------------------------------*/
 /*--- end                                                vg_main.c ---*/
 /*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_memory.c b/coregrind/vg_memory.c
index 34299d8..7f5121b 100644
--- a/coregrind/vg_memory.c
+++ b/coregrind/vg_memory.c
@@ -397,12 +397,13 @@
 	  (prot & (VKI_PROT_READ|VKI_PROT_EXEC)) == (VKI_PROT_READ|VKI_PROT_EXEC)	&&
 	  len >= VKI_BYTES_PER_PAGE							&&
 	  s->symtab == NULL								&&
-	  VG_(is_object_file)((void *)addr)) {
+	  VG_(is_object_file)((void *)addr)) 
+      {
+         s->symtab = VG_(read_seg_symbols)(s);
 
-      s->symtab = VG_(read_seg_symbols)(s);
-
-      if (s->symtab != NULL)
-	 s->flags |= SF_DYNLIB;
+         if (s->symtab != NULL) {
+            s->flags |= SF_DYNLIB;
+         }
       } else if (flags & SF_MMAP) {
 	 const SegInfo *info;
 
@@ -410,7 +411,8 @@
 	 for(info = VG_(next_seginfo)(NULL);
 	     info != NULL;
 	     info = VG_(next_seginfo)(info)) {
-	    if (VG_(seg_overlaps)(s, VG_(seg_start)(info), VG_(seg_size)(info))) {
+	    if (VG_(seg_overlaps)(s, VG_(seg_start)(info), VG_(seg_size)(info)))
+            {
 	       s->symtab = (SegInfo *)info;
 	       VG_(symtab_incref)((SegInfo *)info);
 	    }
@@ -554,121 +556,6 @@
    return VG_(SkipNode_Next)(&sk_segments, s);
 }
 
-/*--------------------------------------------------------------*/
-/*--- Initialise program data/text etc on program startup.   ---*/
-/*--------------------------------------------------------------*/
-
-static
-void build_valgrind_map_callback ( Addr start, UInt size, 
-				   Char rr, Char ww, Char xx, UInt dev, UInt ino,
-				   ULong foffset, const UChar* filename )
-{
-   UInt prot = 0;
-   UInt flags;
-   Bool is_stack_segment;
-   Bool verbose = False || mem_debug; /* set to True for debugging */
-
-   is_stack_segment = (start == VG_(clstk_base) && (start+size) == VG_(clstk_end));
-
-   prot = 0;
-   flags = SF_MMAP|SF_NOSYMS;
-
-   if (start >= VG_(valgrind_base) && (start+size) <= VG_(valgrind_end))
-      flags |= SF_VALGRIND;
-
-   /* Only record valgrind mappings for now, without loading any
-      symbols.  This is so we know where the free space is before we
-      start allocating more memory (note: heap is OK, it's just mmap
-      which is the problem here). */
-   if (flags & SF_VALGRIND) {
-      if (verbose)
-	 VG_(printf)("adding segment %08p-%08p prot=%x flags=%4x filename=%s\n",
-		     start, start+size, prot, flags, filename);
-
-      VG_(map_file_segment)(start, size, prot, flags, dev, ino, foffset, filename);
-   }
-}
-
-static
-void build_segment_map_callback ( Addr start, UInt size, 
-				  Char rr, Char ww, Char xx, UInt dev, UInt ino,
-				  ULong foffset, const UChar* filename )
-{
-   UInt prot = 0;
-   UInt flags;
-   Bool is_stack_segment;
-   Bool verbose = False || mem_debug; /* set to True for debugging */
-   Addr r_esp;
-
-   is_stack_segment = (start == VG_(clstk_base) && (start+size) == VG_(clstk_end));
-
-   if (rr == 'r')
-      prot |= VKI_PROT_READ;
-   if (ww == 'w')
-      prot |= VKI_PROT_WRITE;
-   if (xx == 'x')
-      prot |= VKI_PROT_EXEC;
-
-      
-   if (is_stack_segment)
-      flags = SF_STACK | SF_GROWDOWN;
-   else
-      flags = SF_EXEC|SF_MMAP;
-
-   if (filename != NULL)
-      flags |= SF_FILE;
-
-   if (start >= VG_(valgrind_base) && (start+size) <= VG_(valgrind_end))
-      flags |= SF_VALGRIND;
-
-   if (verbose)
-      VG_(printf)("adding segment %08p-%08p prot=%x flags=%4x filename=%s\n",
-		  start, start+size, prot, flags, filename);
-
-   VG_(map_file_segment)(start, size, prot, flags, dev, ino, foffset, filename);
-
-   if (VG_(is_client_addr)(start) && VG_(is_client_addr)(start+size-1))
-      VG_TRACK( new_mem_startup, start, size, rr=='r', ww=='w', xx=='x' );
-
-   /* If this is the stack segment mark all below %esp as noaccess. */
-   r_esp = VG_(m_state_static)[40/4];
-   if (is_stack_segment) {
-      if (0)
-         VG_(message)(Vg_DebugMsg, "invalidating stack area: %x .. %x",
-                      start,r_esp);
-      VG_TRACK( die_mem_stack, start, r_esp-start );
-   }
-}
-
-
-/* 1. Records startup segments from /proc/pid/maps.  Takes special note
-      of the executable ones, because if they're munmap()ed we need to
-      discard translations.  Also checks there's no exe segment overlaps.
-
-      Note that `read_from_file' is false;  we read /proc/self/maps into a
-      buffer at the start of VG_(main) so that any superblocks mmap'd by
-      calls to VG_(malloc)() by SK_({pre,post}_clo_init) aren't erroneously
-      thought of as being owned by the client.
- */
-void VG_(init_memory) ( void )
-{
-   /* 1 */
-   /* reserve Valgrind's kickstart, heap and stack */
-   VG_(map_segment)(VG_(valgrind_mmap_end), VG_(valgrind_end)-VG_(valgrind_mmap_end),
-		    VKI_PROT_NONE, SF_VALGRIND|SF_FIXED);
-
-   /* work out what's mapped where, and read interesting symtabs */
-   VG_(parse_procselfmaps) ( build_valgrind_map_callback );	/* just Valgrind mappings */
-   VG_(parse_procselfmaps) ( build_segment_map_callback );	/* everything */
-
-   /* initialize our trampoline page (which is also sysinfo stuff) */
-   VG_(memcpy)((void *)VG_(client_trampoline_code), 
-	       &VG_(trampoline_code_start),
-	       VG_(trampoline_code_length));
-   VG_(mprotect)((void *)VG_(client_trampoline_code), VG_(trampoline_code_length), 
-		 VKI_PROT_READ|VKI_PROT_EXEC);
-}
-
 /*------------------------------------------------------------*/
 /*--- Tracking permissions around %esp changes.            ---*/
 /*------------------------------------------------------------*/
diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c
index ce04d4d..418919d 100644
--- a/coregrind/vg_scheduler.c
+++ b/coregrind/vg_scheduler.c
@@ -581,8 +581,7 @@
 
 /* Initialise the scheduler.  Create a single "main" thread ready to
    run, with special ThreadId of one.  This is called at startup; the
-   caller takes care to park the client's state is parked in
-   VG_(baseBlock).  
+   caller takes care to park the client's state in VG_(baseBlock).  
 */
 void VG_(scheduler_init) ( void )
 {
diff --git a/coregrind/vg_syscalls.c b/coregrind/vg_syscalls.c
index b36245a..dd99a8b 100644
--- a/coregrind/vg_syscalls.c
+++ b/coregrind/vg_syscalls.c
@@ -153,6 +153,67 @@
    return ret;
 }
 
+/* Walk through a colon-separated environment variable, and remove the
+   entries which match remove_pattern.  It slides everything down over
+   the removed entries, and pads the remaining space with '\0'.  It
+   modifies the entries in place (in the client address space), but it
+   shouldn't matter too much, since we only do this just before an
+   execve().  A NULL varp is ignored.
+
+   The ':' after a removed entry is dropped with it, since empty strings
+   delimited by ':' are considered to be '.' in a path.
+*/
+static void mash_colon_env(Char *varp, const Char *remove_pattern)
+{
+   Char *const start = varp;
+   Char *entry_start = varp;
+   Char *output = varp;
+
+   if (varp == NULL)
+      return;
+
+   while(*varp) {
+      if (*varp == ':') {
+         Char prev;
+         Bool match;
+         /* This is a bit subtle: we want to match against the entry
+            we just copied, because it may have overlapped with
+            itself, junking the original. */
+         prev = *output;
+         *output = '\0';
+         match = VG_(string_match)(remove_pattern, entry_start);
+         *output = prev;
+         if (match) {
+            /* Drop the entry and its ':', then re-examine what follows:
+               it may be the final '\0', which must not be stepped over. */
+            output = entry_start;
+            varp++;
+            continue;
+         }
+         entry_start = output+1;   /* entry starts after ':' */
+      }
+      *output++ = *varp++;
+   }
+
+   /* Terminate the compacted string, so the last entry is matched on
+      its own and not together with stale, not-yet-padded bytes. */
+   *output = '\0';
+   /* match against the last entry */
+   if (VG_(string_match)(remove_pattern, entry_start)) {
+      output = entry_start;
+      if (output > start) {
+         /* remove trailing ':' */
+         output--;
+         vg_assert(*output == ':');
+      }
+   }
+
+   /* pad out the left-overs with '\0' */
+   while(output < varp)
+      *output++ = '\0';
+}
+
+
 /* ---------------------------------------------------------------------
    Doing mmap, munmap, mremap, mprotect
    ------------------------------------------------------------------ */
@@ -1808,13 +1869,13 @@
 	 buf = VG_(arena_malloc)(VG_AR_CORE, VG_(strlen)(VG_(libdir)) + 20);
 
 	 VG_(sprintf)(buf, "%s*/vg_inject.so", VG_(libdir));
-	 VG_(mash_colon_env)(ld_preload_str, buf);
+	 mash_colon_env(ld_preload_str, buf);
 
 	 VG_(sprintf)(buf, "%s*/vgpreload_*.so", VG_(libdir));
-	 VG_(mash_colon_env)(ld_preload_str, buf);
+	 mash_colon_env(ld_preload_str, buf);
 
 	 VG_(sprintf)(buf, "%s*", VG_(libdir));
-	 VG_(mash_colon_env)(ld_library_path_str, buf);
+	 mash_colon_env(ld_library_path_str, buf);
 
 	 VG_(env_unsetenv)(envp, VALGRINDCLO);
 
diff --git a/include/vg_skin.h.base b/include/vg_skin.h.base
index cf03145..dd61569 100644
--- a/include/vg_skin.h.base
+++ b/include/vg_skin.h.base
@@ -224,6 +224,7 @@
 #define VGP_CORE_LIST \
    /* These ones depend on the core */                \
    VGP_PAIR(VgpUnc,         "unclassified"),          \
+   VGP_PAIR(VgpStartup,     "startup"),               \
    VGP_PAIR(VgpRun,         "running"),               \
    VGP_PAIR(VgpSched,       "scheduler"),             \
    VGP_PAIR(VgpMalloc,      "low-lev malloc/free"),   \
@@ -237,7 +238,6 @@
    VGP_PAIR(VgpLiveness,    "liveness-analysis"),     \
    VGP_PAIR(VgpDoLRU,       "do-lru"),                \
    VGP_PAIR(VgpSlowFindT,   "slow-search-transtab"),  \
-   VGP_PAIR(VgpInitMem,     "init-memory"),           \
    VGP_PAIR(VgpExeContext,  "exe-context"),           \
    VGP_PAIR(VgpReadSyms,    "read-syms"),             \
    VGP_PAIR(VgpSearchSyms,  "search-syms"),           \
@@ -288,12 +288,6 @@
 /* Get the simulated %esp */
 extern Addr VG_(get_stack_pointer) ( void );
 
-/* Detect if an address is within Valgrind's stack, Valgrind's
-   m_state_static, or the VG_(threads) array.  This is useful for
-   memory leak detectors to rule out spurious pointers to a block. */
-extern Bool VG_(within_stack)(Addr a);
-extern Bool VG_(within_m_state_static_OR_threads)(Addr a);
-
 /* Check if an address is 4-byte aligned */
 #define IS_ALIGNED4_ADDR(aaa_p) (0 == (((UInt)(aaa_p)) & 3))
 #define IS_ALIGNED8_ADDR(aaa_p) (0 == (((UInt)(aaa_p)) & 7))
