Get rid of baseBlock.  Now, when generated code is running, the guest
state pointer points directly at the ThreadState.arch.vex field, so the
state is updated in place, avoiding a lot of code (and wasted time)
that used to copy stuff back and forth to baseBlock.

Fix zillions of other places in the system that need the current thread
id; it is now passed explicitly to all the places that need it.
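
As an illustration only (not code from this commit; UInt/ThreadId are the
usual Valgrind typedefs and the helper name is made up), the scheduler's
inner call now looks roughly like this -- the dispatcher is handed a
pointer straight at the thread's VEX state, so generated code updates it
in place instead of bouncing through baseBlock:

   /* sketch: running a thread under the new scheme */
   static UInt run_thread_sketch ( ThreadId tid )
   {
      /* old scheme (removed):
            load_thread_state(tid);       // copy regs into VG_(baseBlock)
            trc = VG_(run_innerloop)();   // %ebp = &VG_(baseBlock)
            save_thread_state(tid);       // copy regs back out
         new scheme: no copying at all */
      return VG_(run_innerloop)( &VG_(threads)[tid].arch.vex );
   }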



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@3090 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am
index 462ce63..fe518b4 100644
--- a/coregrind/Makefile.am
+++ b/coregrind/Makefile.am
@@ -7,6 +7,7 @@
 		-DKICKSTART_BASE=@KICKSTART_BASE@ \
 		-DVG_PLATFORM="\"$(VG_PLATFORM)"\"
 AM_CFLAGS = $(WERROR) -Winline -Wall -Wshadow -O -g @ARCH_CORE_AM_CFLAGS@
+AM_CFLAGS += -fno-omit-frame-pointer
 
 default.supp: $(SUPP_FILES)
 
diff --git a/coregrind/core.h b/coregrind/core.h
index 7fddeae..9f4295d 100644
--- a/coregrind/core.h
+++ b/coregrind/core.h
@@ -100,6 +100,9 @@
 #define TL_(x)	vgToolInternal_##x
 
 
+/* ToDo: nuke */
+#define INVALID_OFFSET (-1)
+
 /* ---------------------------------------------------------------------
    Build options and table sizes.  You should be able to change these
    options or sizes, recompile, and still have a working system.
@@ -852,7 +855,7 @@
    vki_stack_t altstack;
 
    /* Architecture-specific thread state */
-   arch_thread_t arch;
+   ThreadArchState arch;
 };
 //ThreadState;
 
@@ -863,16 +866,9 @@
 /* Check that tid is in range and denotes a non-Empty thread. */
 extern Bool VG_(is_valid_tid) ( ThreadId tid );
 
-/* Determine if 'tid' is that of the current running thread (Nb: returns
-   False if no thread is currently running. */
-extern Bool VG_(is_running_thread)(ThreadId tid);
-
 /* Get the ThreadState for a particular thread */
 extern ThreadState *VG_(get_ThreadState)(ThreadId tid);
 
-/* Similarly ... */
-extern ThreadId VG_(get_current_tid) ( void );
-
 /* Nuke all threads except tid. */
 extern void VG_(nuke_all_threads_except) ( ThreadId me );
 
@@ -1251,9 +1247,6 @@
 
 extern void VG_(print_scheduler_stats) ( void );
 
-extern Int  VG_(alloc_BaB)( Int );      // Allocate slots in baseBlock
-extern void VG_(align_BaB)( UInt );     // Align baseBlock offset
-extern Int  VG_(alloc_BaB_1_set)( Addr ); // Allocate & init baseBlock slot
 
 /* ---------------------------------------------------------------------
    Exports of vg_memory.c
@@ -1321,8 +1314,8 @@
 extern void VG_(pad_address_space)  (void);
 extern void VG_(unpad_address_space)(void);
 
-extern REGPARM(1)
-       void VG_(unknown_SP_update) ( Addr new_SP );
+extern REGPARM(2)
+       void VG_(unknown_SP_update) ( Addr old_SP, Addr new_SP );
 
 /* ---------------------------------------------------------------------
    Exports of vg_proxylwp.c
@@ -1726,8 +1719,9 @@
    ------------------------------------------------------------------ */
 
 /* Run a thread for a (very short) while, until some event happens
-   which means we need to defer to the scheduler. */
-extern UInt VG_(run_innerloop) ( void );
+   which means we need to defer to the scheduler.  This is passed
+   a pointer to the VEX guest state (arch.vex). */
+extern UInt VG_(run_innerloop) ( void* guest_state );
 
 /* The patching routing called when a BB wants to chain itself to
    another. */
@@ -1756,64 +1750,42 @@
 __attribute__ ((noreturn))
 extern void VG_(missing_tool_func) ( const Char* fn );
 
-/* ---------------------------------------------------------------------
-   The baseBlock -- arch-neutral bits
-   ------------------------------------------------------------------ */
-
-#define INVALID_OFFSET (-1)
-
-/* An array of words.  In generated code, %ebp always points to the
-   start of this array.  Useful stuff, like the simulated CPU state,
-   can then be found by indexing off %ebp.  The following declares
-   variables which, at startup time, are given values denoting offsets
-   into baseBlock.  These offsets are in *words* from the start of
-   baseBlock. */
-
-#define VG_BASEBLOCK_WORDS 400
-
-extern UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS];
-
 // ---------------------------------------------------------------------
 // Architecture-specific things defined in eg. x86/*.c
 // ---------------------------------------------------------------------
 
-// For setting up the baseBlock
-extern void VGA_(init_baseBlock)  ( Addr client_ip, Addr sp_at_startup );
-
-// Register state moving
-extern void VGA_(load_state) ( arch_thread_t*, ThreadId tid );
-extern void VGA_(save_state) ( arch_thread_t*, ThreadId tid );
+// Setting up the initial thread (1) state
+extern void 
+       VGA_(init_thread1state) ( Addr client_eip, 
+                                 Addr esp_at_startup,
+                                 /*MOD*/ ThreadArchState* arch );
 
 // Thread stuff
-extern void VGA_(clear_thread)   ( arch_thread_t* );
-extern void VGA_(init_thread)    ( arch_thread_t* );
-extern void VGA_(cleanup_thread) ( arch_thread_t* );
-extern void VGA_(setup_child)    ( arch_thread_t*, arch_thread_t* );
+extern void VGA_(clear_thread)   ( ThreadArchState* );
+extern void VGA_(cleanup_thread) ( ThreadArchState* );
+extern void VGA_(setup_child)    ( ThreadArchState*, ThreadArchState* );
 
 extern void VGA_(set_arg_and_bogus_ret) ( ThreadId tid, UWord arg, Addr ret );
 extern void VGA_(thread_initial_stack)  ( ThreadId tid, UWord arg, Addr ret );
 
 // Symtab stuff
 extern UInt* VGA_(reg_addr_from_BB)  ( Int reg );
-extern UInt* VGA_(reg_addr_from_tst) ( Int reg, arch_thread_t* );
+extern UInt* VGA_(reg_addr_from_tst) ( Int reg, ThreadArchState* );
 
 // Pointercheck
 extern Bool VGA_(setup_pointercheck) ( void );
 
 // For attaching the debugger
 extern Int  VGA_(ptrace_setregs_from_BB)  ( Int pid );
-extern Int  VGA_(ptrace_setregs_from_tst) ( Int pid, arch_thread_t* arch );
+extern Int  VGA_(ptrace_setregs_from_tst) ( Int pid, ThreadArchState* arch );
 
 // Making coredumps
-extern void VGA_(fill_elfregs_from_BB)     ( struct vki_user_regs_struct* regs );
 extern void VGA_(fill_elfregs_from_tst)    ( struct vki_user_regs_struct* regs,
-                                             arch_thread_t* arch );
-extern void VGA_(fill_elffpregs_from_BB)   ( vki_elf_fpregset_t* fpu );
+                                             ThreadArchState* arch );
 extern void VGA_(fill_elffpregs_from_tst)  ( vki_elf_fpregset_t* fpu,
-                                             const arch_thread_t* arch );
-extern void VGA_(fill_elffpxregs_from_BB)  ( vki_elf_fpxregset_t* xfpu );
+                                             const ThreadArchState* arch );
 extern void VGA_(fill_elffpxregs_from_tst) ( vki_elf_fpxregset_t* xfpu,
-                                             const arch_thread_t* arch );
+                                             const ThreadArchState* arch );
 
 // Signal stuff
 extern void VGA_(push_signal_frame) ( ThreadId tid, Addr sp_top_of_frame,
@@ -1872,9 +1844,9 @@
 extern const Addr vga_sys_before, vga_sys_restarted,
                   vga_sys_after,  vga_sys_done;
 
-extern void VGA_(restart_syscall)(arch_thread_t* arch);
+extern void VGA_(restart_syscall)(ThreadArchState* arch);
 
-extern void VGA_(thread_syscall)(Int syscallno, arch_thread_t* arch, 
+extern void VGA_(thread_syscall)(Int syscallno, ThreadArchState* arch, 
                                  enum PXState* state, enum PXState poststate);
 
 /* ---------------------------------------------------------------------
diff --git a/coregrind/core_asm.h b/coregrind/core_asm.h
index c222152..ce5b8c8 100644
--- a/coregrind/core_asm.h
+++ b/coregrind/core_asm.h
@@ -39,13 +39,14 @@
 
 /* Magic values that %ebp might be set to when returning to the
    dispatcher.  The only other legitimate value is to point to the
-   start of VG_(baseBlock).  These also are return values from
+   start of the thread's VEX state.  These also are return values from
    VG_(run_innerloop) to the scheduler.
 
    EBP means %ebp can legitimately have this value when a basic block
    returns to the dispatch loop.  TRC means that this value is a valid
    thread return code, which the dispatch loop may return to the
-   scheduler.  */
+   scheduler.  
+*/
 #define VG_TRC_EBP_JMP_SYSCALL    19 /* EBP and TRC */
 #define VG_TRC_EBP_JMP_CLIENTREQ  23 /* EBP and TRC */
 #define VG_TRC_EBP_JMP_YIELD      27 /* EBP and TRC */
diff --git a/coregrind/toolfuncs.def b/coregrind/toolfuncs.def
index 2308261..d3d3226 100644
--- a/coregrind/toolfuncs.def
+++ b/coregrind/toolfuncs.def
@@ -280,12 +280,12 @@
 ## ================================================================================
 ## malloc and friends
 :malloc
-void*,	malloc,			SizeT n
-void*,	__builtin_new,		SizeT n
-void*,	__builtin_vec_new,	SizeT n
-void*,	memalign,		SizeT align, SizeT n
-void*,	calloc,			SizeT nmemb, SizeT n
-void,	free,			void* p
-void,	__builtin_delete,	void* p
-void,	__builtin_vec_delete,	void* p
-void*,	realloc,		void* p, SizeT size
+void*,	malloc,			ThreadId tid, SizeT n
+void*,	__builtin_new,		ThreadId tid, SizeT n
+void*,	__builtin_vec_new,	ThreadId tid, SizeT n
+void*,	memalign,		ThreadId tid, SizeT align, SizeT n
+void*,	calloc,			ThreadId tid, SizeT nmemb, SizeT n
+void,	free,			ThreadId tid, void* p
+void,	__builtin_delete,	ThreadId tid, void* p
+void,	__builtin_vec_delete,	ThreadId tid, void* p
+void*,	realloc,		ThreadId tid, void* p, SizeT size
diff --git a/coregrind/vg_default.c b/coregrind/vg_default.c
index 967aa9d..2283b53 100644
--- a/coregrind/vg_default.c
+++ b/coregrind/vg_default.c
@@ -79,7 +79,7 @@
    malloc()-replacing tool cannot forget to implement TL_(malloc)() or
    TL_(free)().  */
 __attribute__ ((weak))
-void* TL_(malloc)( SizeT size )
+void* TL_(malloc)( ThreadId tid, SizeT size )
 {
    if (VG_(sk_malloc_called_by_scheduler))
       return VG_(cli_malloc)(VG_MIN_MALLOC_SZB, size);
@@ -88,7 +88,7 @@
 }
 
 __attribute__ ((weak))
-void  TL_(free)( void* p )
+void  TL_(free)( ThreadId tid, void* p )
 {
    /* see comment for TL_(malloc)() above */
    if (VG_(sk_malloc_called_by_scheduler))
diff --git a/coregrind/vg_execontext.c b/coregrind/vg_execontext.c
index 4539fb1..371d26e 100644
--- a/coregrind/vg_execontext.c
+++ b/coregrind/vg_execontext.c
@@ -305,20 +305,11 @@
 void get_needed_regs(ThreadId tid, Addr* ip, Addr* fp, Addr* sp,
                      Addr* stack_highest_word)
 {
-   if (VG_(is_running_thread)(tid)) {
-      /* thread currently in baseblock */
-      *ip                 = BASEBLOCK_INSTR_PTR;
-      *fp                 = BASEBLOCK_FRAME_PTR;
-      *sp                 = BASEBLOCK_STACK_PTR;
-      *stack_highest_word = VG_(threads)[tid].stack_highest_word;
-   } else {
-      /* thread in thread table */
-      ThreadState* tst = & VG_(threads)[ tid ];
-      *ip                 = ARCH_INSTR_PTR(tst->arch);
-      *fp                 = ARCH_FRAME_PTR(tst->arch);
-      *sp                 = ARCH_STACK_PTR(tst->arch);
-      *stack_highest_word = tst->stack_highest_word;
-   }
+   ThreadState* tst = & VG_(threads)[ tid ];
+   *ip                 = ARCH_INSTR_PTR(tst->arch);
+   *fp                 = ARCH_FRAME_PTR(tst->arch);
+   *sp                 = ARCH_STACK_PTR(tst->arch);
+   *stack_highest_word = tst->stack_highest_word;
 
    /* Nasty little hack to deal with sysinfo syscalls - if libc is
       using the sysinfo page for syscalls (the TLS version does), then
@@ -364,14 +355,7 @@
 
 Addr VG_(get_EIP) ( ThreadId tid )
 {
-   Addr ret;
-
-   if (VG_(is_running_thread)(tid))
-      ret = BASEBLOCK_INSTR_PTR;
-   else
-      ret = ARCH_INSTR_PTR(VG_(threads)[ tid ].arch);
-
-   return ret;
+   return ARCH_INSTR_PTR(VG_(threads)[ tid ].arch);
 }
 
 /*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index cea7e69..091391b 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -213,10 +213,7 @@
 
 static Int ptrace_setregs(Int pid, ThreadId tid)
 {
-   if (VG_(is_running_thread)( tid ))
-      return VGA_(ptrace_setregs_from_BB)(pid);
-   else
-      return VGA_(ptrace_setregs_from_tst)(pid, &VG_(threads)[tid].arch);
+   return VGA_(ptrace_setregs_from_tst)(pid, &VG_(threads)[tid].arch);
 }
 
 /* Start debugger and get it to attach to this process.  Called if the
@@ -326,15 +323,10 @@
    VG_(exit)(1);
 }
 
-Addr VG_(get_stack_pointer) ( void )
+/* Get the simulated stack pointer */
+Addr VG_(get_stack_pointer) ( ThreadId tid )
 {
-   return BASEBLOCK_STACK_PTR;
-}
-
-/* Debugging thing .. can be called from assembly with OYNK macro. */
-void VG_(oynk) ( Int n )
-{
-   OINK(n);
+   return ARCH_STACK_PTR( VG_(threads)[tid].arch );
 }
 
 /* Initialize the PID and PGRP of scheduler LWP; this is also called
@@ -1584,6 +1576,16 @@
 "    --vex-guest-max-insns             1 .. 100 [50]\n"
 "    --vex-guest-chase-thresh          0 .. 99  [10]\n"
 "\n"
+"    --trace-codegen values (omit the middle space):\n"
+"       1000 0000   show conversion into IR\n"
+"       0100 0000   show after initial opt\n"
+"       0010 0000   show after instrumentation\n"
+"       0001 0000   show after second opt\n"
+"       0000 1000   show after tree building\n"
+"       0000 0100   show selecting insns\n"
+"       0000 0010   show after reg-alloc\n"
+"       0000 0001   show final assembly\n"
+"\n"
 "  debugging options for Valgrind tools that report errors\n"
 "    --dump-error=<number>     show translation for basic block associated\n"
 "                              with <number>'th error context [0=show none]\n"
@@ -1827,7 +1829,7 @@
          Int j;
          char* opt = & arg[16];
    
-         if (5 != VG_(strlen)(opt)) {
+         if (8 != VG_(strlen)(opt)) {
             VG_(message)(Vg_UserMsg, 
                          "--trace-codegen argument must have 8 digits");
             VG_(bad_option)(arg);
@@ -2164,40 +2166,6 @@
 /*=== baseBlock: definition + setup                                ===*/
 /*====================================================================*/
 
-/* This is the actual defn of baseblock. */
-UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS];
-
-/* Words. */
-static Int baB_off = 0;
-
-
-/* Returns the offset, in words. */
-Int VG_(alloc_BaB) ( Int words )
-{
-   Int off = baB_off;
-   baB_off += words;
-   if (baB_off >= VG_BASEBLOCK_WORDS)
-      VG_(core_panic)( "VG_(alloc_BaB): baseBlock is too small");
-
-   return off;   
-}
-
-/* Align offset, in *bytes* */
-void VG_(align_BaB) ( UInt align )
-{
-   vg_assert(2 == align || 4 == align || 8 == align || 16 == align);
-   baB_off +=  (align-1);
-   baB_off &= ~(align-1);
-}
-
-/* Allocate 1 word in baseBlock and set it to the given value. */
-Int VG_(alloc_BaB_1_set) ( Addr a )
-{
-   Int off = VG_(alloc_BaB)(1);
-   VG_(baseBlock)[off] = (UInt)a;
-   return off;
-}
-
 Bool VG_(need_to_handle_SP_assignment)(void)
 {
    return ( VG_(defined_new_mem_stack_4)()  ||
@@ -2215,14 +2183,6 @@
           );
 }
 
-// The low/high split is for x86, so that the more common helpers can be
-// in the first 128 bytes of the start, which allows the use of a more
-// compact addressing mode.
-static void init_baseBlock ( Addr client_eip, Addr sp_at_startup )
-{
-   VGA_(init_baseBlock)(client_eip, sp_at_startup);
-}
-
 
 /*====================================================================*/
 /*===  Initialise program data/text, etc.                          ===*/
@@ -2652,14 +2612,6 @@
    }
 
    //--------------------------------------------------------------
-   // Set up baseBlock
-   //   p: {pre,post}_clo_init()  [for tool helper registration]
-   //      load_client()          [for 'client_eip']
-   //      setup_client_stack()   [for 'sp_at_startup']
-   //--------------------------------------------------------------
-   init_baseBlock(client_eip, sp_at_startup);
-
-   //--------------------------------------------------------------
    // Search for file descriptors that are inherited from our parent
    //   p: process_cmd_line_options  [for VG_(clo_track_fds)]
    //--------------------------------------------------------------
@@ -2674,6 +2626,15 @@
    VG_(scheduler_init)();
 
    //--------------------------------------------------------------
+   // Set up state of thread 1
+   //   p: {pre,post}_clo_init()  [for tool helper registration]
+   //      load_client()          [for 'client_eip']
+   //      setup_client_stack()   [for 'sp_at_startup']
+   //      setup_scheduler()      [for the rest of state 1 stuff]
+   //--------------------------------------------------------------
+   VGA_(init_thread1state)(client_eip, sp_at_startup, &VG_(threads)[1].arch );
+
+   //--------------------------------------------------------------
    // Set up the ProxyLWP machinery
    //   p: VG_(scheduler_init)()?  [XXX: subtle dependency?]
    //--------------------------------------------------------------
diff --git a/coregrind/vg_memory.c b/coregrind/vg_memory.c
index 6a884b8..a65c290 100644
--- a/coregrind/vg_memory.c
+++ b/coregrind/vg_memory.c
@@ -667,10 +667,9 @@
 /* This function gets called if new_mem_stack and/or die_mem_stack are
    tracked by the tool, and one of the specialised cases (eg. new_mem_stack_4)
    isn't used in preference */
-REGPARM(1)
-void VG_(unknown_SP_update)(Addr new_SP)
+REGPARM(2)
+void VG_(unknown_SP_update)( Addr old_SP, Addr new_SP )
 {
-   Addr old_SP = BASEBLOCK_STACK_PTR;
    Word delta  = (Word)new_SP - (Word)old_SP;
 
    if (delta < -(VG_HUGE_DELTA) || VG_HUGE_DELTA < delta) {
diff --git a/coregrind/vg_proxylwp.c b/coregrind/vg_proxylwp.c
index 8a97585..dde25f7 100644
--- a/coregrind/vg_proxylwp.c
+++ b/coregrind/vg_proxylwp.c
@@ -715,7 +715,7 @@
       message gets sent back, thus making the signal synchronous. */
    if (sig != 0 && 
        !VG_(is_sig_ign)(sig) &&
-       tid == VG_(get_current_or_recent_tid)() && 
+       tid == VG_(get_current_tid)() && 
        !VG_(sigismember)(&tst->eff_sig_mask, sig)) {
       /* If the LWP is actually blocked in a sigtimedwait, then it
 	 will eat the signal rather than make it pending and deliver
diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c
index 8a2f1bd..4179360 100644
--- a/coregrind/vg_scheduler.c
+++ b/coregrind/vg_scheduler.c
@@ -49,13 +49,10 @@
 static ForkHandlerEntry vg_fhstack[VG_N_FORKHANDLERSTACK];
 
 
-/* The tid of the thread currently in VG_(baseBlock). */
-static ThreadId vg_tid_currently_in_baseBlock = VG_INVALID_THREADID;
+/* The tid of the thread currently running, or VG_INVALID_THREADID if
+   none. */
+static ThreadId vg_tid_currently_running = VG_INVALID_THREADID;
 
-/* The tid either currently in baseBlock, or was in baseBlock before
-   was saved it out; this is only updated when a new thread is loaded
-   into the baseBlock */
-static ThreadId vg_tid_last_in_baseBlock = VG_INVALID_THREADID;
 
 /* vg_oursignalhandler() might longjmp().  Here's the jmp_buf. */
 static jmp_buf scheduler_jmpbuf;
@@ -138,8 +135,7 @@
 
 /* For constructing error messages only: try and identify a thread
    whose stack satisfies the predicate p, or return VG_INVALID_THREADID
-   if none do.  A small complication is dealing with any currently
-   VG_(baseBlock)-resident thread. 
+   if none do.
 */
 ThreadId VG_(first_matching_thread_stack)
               ( Bool (*p) ( Addr stack_min, Addr stack_max, void* d ),
@@ -149,17 +145,6 @@
 
    tid_to_skip = VG_INVALID_THREADID;
 
-   /* First check to see if there's a currently-loaded thread in
-      VG_(baseBlock). */
-   if (vg_tid_currently_in_baseBlock != VG_INVALID_THREADID) {
-      tid = vg_tid_currently_in_baseBlock;
-      if ( p ( BASEBLOCK_STACK_PTR, 
-               VG_(threads)[tid].stack_highest_word, d ) )
-         return tid;
-      else
-         tid_to_skip = tid;
-   }
-
    for (tid = 1; tid < VG_N_THREADS; tid++) {
       if (VG_(threads)[tid].status == VgTs_Empty) continue;
       if (tid == tid_to_skip) continue;
@@ -252,71 +237,48 @@
    return &VG_(threads)[tid];
 }
 
-Bool VG_(is_running_thread)(ThreadId tid)
+/* Return True precisely when get_current_tid can return
+   successfully. */
+Bool VG_(running_a_thread) ( void )
 {
-   ThreadId curr = VG_(get_current_tid)();
-   return (curr == tid && VG_INVALID_THREADID != tid);
+   if (vg_tid_currently_running == VG_INVALID_THREADID)
+      return False;
+   /* Otherwise, it must be a valid thread ID. */
+   vg_assert(VG_(is_valid_tid)(vg_tid_currently_running));
+   return True;
 }
 
 ThreadId VG_(get_current_tid) ( void )
 {
-   if (!VG_(is_valid_tid)(vg_tid_currently_in_baseBlock))
-      return VG_INVALID_THREADID;
-   return vg_tid_currently_in_baseBlock;
+   if (vg_tid_currently_running == VG_INVALID_THREADID)
+      VG_(core_panic)("VG_(get_current_tid): not running generated code");
+   /* Otherwise, it must be a valid thread ID. */
+   vg_assert(VG_(is_valid_tid)(vg_tid_currently_running));
+   return vg_tid_currently_running;
 }
 
-ThreadId VG_(get_current_or_recent_tid) ( void )
-{
-   vg_assert(vg_tid_currently_in_baseBlock == vg_tid_last_in_baseBlock ||
-	     vg_tid_currently_in_baseBlock == VG_INVALID_THREADID);
-   vg_assert(VG_(is_valid_tid)(vg_tid_last_in_baseBlock));
-
-   return vg_tid_last_in_baseBlock;
-}
-
-/* Copy the saved state of a thread into VG_(baseBlock), ready for it
-   to be run. */
-static void load_thread_state ( ThreadId tid )
-{
-   vg_assert(vg_tid_currently_in_baseBlock == VG_INVALID_THREADID);
-
-   VGA_(load_state)(&VG_(threads)[tid].arch, tid);
-
-   vg_tid_currently_in_baseBlock = tid;
-   vg_tid_last_in_baseBlock = tid;
-}
-
-
-/* Copy the state of a thread from VG_(baseBlock), presumably after it
-   has been descheduled.  For sanity-check purposes, fill the vacated
-   VG_(baseBlock) with garbage so as to make the system more likely to
-   fail quickly if we erroneously continue to poke around inside
-   VG_(baseBlock) without first doing a load_thread_state().  
-*/
-static void save_thread_state ( ThreadId tid )
-{
-   vg_assert(vg_tid_currently_in_baseBlock != VG_INVALID_THREADID);
-
-   VGA_(save_state)(&VG_(threads)[tid].arch, tid);
-
-   vg_tid_currently_in_baseBlock = VG_INVALID_THREADID;
-}
-
-
 void VG_(resume_scheduler)(Int sigNo, vki_siginfo_t *info)
 {
    if (scheduler_jmpbuf_valid) {
       /* Can't continue; must longjmp back to the scheduler and thus
          enter the sighandler immediately. */
+      vg_assert(vg_tid_currently_running != VG_INVALID_THREADID);
       VG_(memcpy)(&unresumable_siginfo, info, sizeof(vki_siginfo_t));
    
       longjmpd_on_signal = sigNo;
       __builtin_longjmp(scheduler_jmpbuf,1);
+   } else {
+      vg_assert(vg_tid_currently_running == VG_INVALID_THREADID);
    }
 }
 
 /* Run the thread tid for a while, and return a VG_TRC_* value to the
    scheduler indicating what happened. */
+void VG_(oynk) ( Int n )
+{
+  VG_(printf)("OYNK %d\n", n);
+}
+
 static
 UInt run_thread_for_a_while ( ThreadId tid )
 {
@@ -327,31 +289,32 @@
    vg_assert(VG_(is_valid_tid)(tid));
    vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
    vg_assert(!scheduler_jmpbuf_valid);
+   vg_assert(vg_tid_currently_running == VG_INVALID_THREADID);
 
    VGP_PUSHCC(VgpRun);
-   load_thread_state ( tid );
 
    /* there should be no undealt-with signals */
    vg_assert(unresumable_siginfo.si_signo == 0);
 
    if (__builtin_setjmp(scheduler_jmpbuf) == 0) {
       /* try this ... */
-      scheduler_jmpbuf_valid = True;
-      trc = VG_(run_innerloop)();
-      scheduler_jmpbuf_valid = False;
+      vg_tid_currently_running = tid;
+      scheduler_jmpbuf_valid   = True;
+      trc = VG_(run_innerloop)( &VG_(threads)[tid].arch.vex );
+      scheduler_jmpbuf_valid   = False;
+      vg_tid_currently_running = VG_INVALID_THREADID;
       /* We get here if the client didn't take a fault. */
    } else {
       /* We get here if the client took a fault, which caused our
          signal handler to longjmp. */
-      scheduler_jmpbuf_valid = False;
+      scheduler_jmpbuf_valid   = False;
+      vg_tid_currently_running = VG_INVALID_THREADID;
       vg_assert(trc == 0);
       trc = VG_TRC_UNRESUMABLE_SIGNAL;
    }
 
    vg_assert(!scheduler_jmpbuf_valid);
 
-   save_thread_state ( tid );
-
    done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 0;
 
    vg_assert(done_this_time >= 0);
@@ -400,8 +363,9 @@
 
 
 /* Initialise the scheduler.  Create a single "main" thread ready to
-   run, with special ThreadId of one.  This is called at startup; the
-   caller takes care to park the client's state in VG_(baseBlock).  
+   run, with special ThreadId of one.  This is called at startup.  The
+   caller subsequently initialises the guest state components of
+   this main thread, thread 1.  
 */
 void VG_(scheduler_init) ( void )
 {
@@ -423,27 +387,16 @@
 
    vg_fhstack_used = 0;
 
-   /* Assert this is thread zero, which has certain magic
+   /* Assert this is thread one, which has certain magic
       properties. */
    tid_main = vg_alloc_ThreadState();
    vg_assert(tid_main == 1); 
    VG_(threads)[tid_main].status = VgTs_Runnable;
 
-   /* Copy VG_(baseBlock) state to tid_main's slot. */
-   vg_tid_currently_in_baseBlock = tid_main;
-   vg_tid_last_in_baseBlock = tid_main;
-
-   VGA_(init_thread)(&VG_(threads)[tid_main].arch);
-   save_thread_state ( tid_main );
-
-   VG_(threads)[tid_main].stack_highest_word 
-      = VG_(clstk_end) - 4;
+   VG_(threads)[tid_main].stack_highest_word = VG_(clstk_end) - 4;
    VG_(threads)[tid_main].stack_base = VG_(clstk_base);
    VG_(threads)[tid_main].stack_size = VG_(client_rlimit_stack).rlim_cur;
 
-   /* So now ... */
-   vg_assert(vg_tid_currently_in_baseBlock == VG_INVALID_THREADID);
-
    /* Not running client code right now. */
    scheduler_jmpbuf_valid = False;
 
@@ -1802,13 +1755,9 @@
    mostly_clear_thread_record(tid);
    VG_(threads)[tid].status = VgTs_Runnable;
 
-   /* Copy the parent's CPU state into the child's, in a roundabout
-      way (via baseBlock). */
-   load_thread_state(parent_tid);
+   /* Copy the parent's CPU state into the child's. */
    VGA_(setup_child)( &VG_(threads)[tid].arch,
                       &VG_(threads)[parent_tid].arch );
-   save_thread_state(tid);
-   vg_tid_last_in_baseBlock = tid;
 
    /* Consider allocating the child a stack, if the one it already has
       is inadequate. */
@@ -2871,9 +2820,9 @@
    VGA_(set_thread_shadow_archreg)(tid, R_SYSCALL_RET, ret_shadow);
 }
 
-UInt VG_(get_exit_status_shadow) ( void )
+UInt VG_(get_exit_status_shadow) ( ThreadId tid )
 {
-   return VGA_(get_shadow_archreg)(R_SYSCALL_ARG1);
+   return VGA_(get_thread_shadow_archreg)(tid, R_SYSCALL_ARG1);
 }
 
 void VG_(intercept_libc_freeres_wrapper)(Addr addr)
@@ -2899,35 +2848,35 @@
    switch (req_no) {
 
       case VG_USERREQ__CLIENT_CALL0: {
-         UWord (*f)(void) = (void*)arg[1];
+         UWord (*f)(ThreadId) = (void*)arg[1];
 	 if (f == NULL)
 	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL0: func=%p\n", f);
 	 else
-	    SET_CLCALL_RETVAL(tid, f ( ), (Addr)f);
+	    SET_CLCALL_RETVAL(tid, f ( tid ), (Addr)f);
          break;
       }
       case VG_USERREQ__CLIENT_CALL1: {
-         UWord (*f)(UWord) = (void*)arg[1];
+         UWord (*f)(ThreadId, UWord) = (void*)arg[1];
 	 if (f == NULL)
 	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL1: func=%p\n", f);
 	 else
-	    SET_CLCALL_RETVAL(tid, f ( arg[2] ), (Addr)f );
+	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2] ), (Addr)f );
          break;
       }
       case VG_USERREQ__CLIENT_CALL2: {
-         UWord (*f)(UWord, UWord) = (void*)arg[1];
+         UWord (*f)(ThreadId, UWord, UWord) = (void*)arg[1];
 	 if (f == NULL)
 	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL2: func=%p\n", f);
 	 else
-	    SET_CLCALL_RETVAL(tid, f ( arg[2], arg[3] ), (Addr)f );
+	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2], arg[3] ), (Addr)f );
          break;
       }
       case VG_USERREQ__CLIENT_CALL3: {
-         UWord (*f)(UWord, UWord, UWord) = (void*)arg[1];
+         UWord (*f)(ThreadId, UWord, UWord, UWord) = (void*)arg[1];
 	 if (f == NULL)
 	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL3: func=%p\n", f);
 	 else
-	    SET_CLCALL_RETVAL(tid, f ( arg[2], arg[3], arg[4] ), (Addr)f );
+	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2], arg[3], arg[4] ), (Addr)f );
          break;
       }
 
@@ -2942,14 +2891,14 @@
       case VG_USERREQ__MALLOC:
          VG_(sk_malloc_called_by_scheduler) = True;
          SET_PTHREQ_RETVAL(
-            tid, (Addr)TL_(malloc) ( arg[1] ) 
+            tid, (Addr)TL_(malloc) ( tid, arg[1] ) 
          );
          VG_(sk_malloc_called_by_scheduler) = False;
          break;
 
       case VG_USERREQ__FREE:
          VG_(sk_malloc_called_by_scheduler) = True;
-         TL_(free) ( (void*)arg[1] );
+         TL_(free) ( tid, (void*)arg[1] );
          VG_(sk_malloc_called_by_scheduler) = False;
 	 SET_PTHREQ_RETVAL(tid, 0); /* irrelevant */
          break;
diff --git a/coregrind/vg_signals.c b/coregrind/vg_signals.c
index 78300b5..046ce2d 100644
--- a/coregrind/vg_signals.c
+++ b/coregrind/vg_signals.c
@@ -1155,27 +1155,17 @@
 
    vg_assert(sizeof(*regs) == sizeof(prs->pr_reg));
 
-   if (VG_(is_running_thread)(tst->tid)) {
-      VGA_(fill_elfregs_from_BB)(regs);
-   } else {
-      VGA_(fill_elfregs_from_tst)(regs, &tst->arch);
-   }
+   VGA_(fill_elfregs_from_tst)(regs, &tst->arch);
 }
 
 static void fill_fpu(const ThreadState *tst, vki_elf_fpregset_t *fpu)
 {
-   if (VG_(is_running_thread)(tst->tid))
-      VGA_(fill_elffpregs_from_BB)(fpu);
-   else
-      VGA_(fill_elffpregs_from_tst)(fpu, &tst->arch);
+   VGA_(fill_elffpregs_from_tst)(fpu, &tst->arch);
 }
 
 static void fill_xfpu(const ThreadState *tst, vki_elf_fpxregset_t *xfpu)
 {
-   if (VG_(is_running_thread)(tst->tid))
-      VGA_(fill_elffpxregs_from_BB)(xfpu);
-   else
-      VGA_(fill_elffpxregs_from_tst)(xfpu, &tst->arch);
+   VGA_(fill_elffpxregs_from_tst)(xfpu, &tst->arch);
 }
 
 static void make_coredump(ThreadId tid, const vki_siginfo_t *si, UInt max_size)
@@ -1656,7 +1646,7 @@
 }
 
 /* 
-   Recieve a sync signal from the host. 
+   Receive a sync signal from the host. 
 
    This should always be called from the main thread, though it may be
    called in a proxy LWP if someone sends an async version of one of
@@ -1668,6 +1658,19 @@
    Int           dummy_local;
 
    vg_assert(info != NULL);
+
+   if (VG_(clo_trace_signals)) {
+      VG_(message)(Vg_DebugMsg, "");
+      VG_(message)(Vg_DebugMsg, "signal %d arrived ... si_code = %d",
+                   sigNo, info->si_code );
+      if (VG_(running_a_thread)()) {
+         VG_(message)(Vg_DebugMsg, "   running thread %d", 
+                                   VG_(get_current_tid)());
+      } else {
+         VG_(message)(Vg_DebugMsg, "   not running a thread");
+      }
+   }
+
    vg_assert(info->si_signo == sigNo);
    vg_assert(sigNo == VKI_SIGSEGV ||
 	     sigNo == VKI_SIGBUS  ||
@@ -1696,11 +1699,6 @@
    }
    */
 
-   if (VG_(clo_trace_signals)) {
-      VG_(message)(Vg_DebugMsg, "");
-      VG_(message)(Vg_DebugMsg, "signal %d arrived ... si_code=%d",
-                   sigNo, info->si_code );
-   }
    vg_assert(sigNo >= 1 && sigNo <= _VKI_NSIG);
 
    /* Sanity check.  Ensure we're really running on the signal stack
@@ -1729,11 +1727,9 @@
       act upon and immediately restart the faulting instruction.
     */
    if (info->si_signo == VKI_SIGSEGV) {
-      ThreadId tid = VG_(get_current_or_recent_tid)();
+      ThreadId tid = VG_(get_current_tid)();
       Addr fault = (Addr)info->_sifields._sigfault._addr;
-      Addr esp = VG_(is_running_thread)(tid)
-	       ? BASEBLOCK_STACK_PTR
-               : ARCH_STACK_PTR(VG_(threads)[tid].arch);
+      Addr esp   =  ARCH_STACK_PTR(VG_(threads)[tid].arch);
       Segment *seg;
 
       seg = VG_(find_segment)(fault);
diff --git a/coregrind/vg_symtab2.c b/coregrind/vg_symtab2.c
index 9b29842..9b63a94 100644
--- a/coregrind/vg_symtab2.c
+++ b/coregrind/vg_symtab2.c
@@ -2004,10 +2004,7 @@
 {
    UInt *ret = 0;
 
-   if (VG_(is_running_thread)(tid))
-      ret = VGA_(reg_addr_from_BB)(regno);
-   else
-      ret = VGA_(reg_addr_from_tst)(regno, &VG_(threads)[tid].arch);
+   ret = VGA_(reg_addr_from_tst)(regno, &VG_(threads)[tid].arch);
 
    if (ret == 0) {
       Char file[100];
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index a6ee66e..d72eaa7 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -174,21 +174,23 @@
             default:  goto generic;
          }
       } else {
+         IRTemp old_SP;
         generic:
-         /* I don't know if it's really necessary to say that the call
-            reads the stack pointer.  But anyway, we do. */
-         dcall = unsafeIRDirty_0_N( 
-                    1/*regparms*/, 
-                    "VG_(unknown_SP_update)", &VG_(unknown_SP_update),
-                    mkIRExprVec_1(st->Ist.Put.data) 
-                 );
-         dcall->nFxState = 1;
-         dcall->fxState[0].fx     = Ifx_Read;
-         dcall->fxState[0].offset = layout->offset_SP;
-         dcall->fxState[0].size   = layout->sizeof_SP;
+         /* Pass both the old and new SP values to this helper. */
+         old_SP = newIRTemp(bb->tyenv, typeof_SP);
+         addStmtToIRBB( 
+            bb,
+            IRStmt_Tmp( old_SP, IRExpr_Get(offset_SP, typeof_SP) ) 
+         );
 
+         dcall = unsafeIRDirty_0_N( 
+                    2/*regparms*/, 
+                    "VG_(unknown_SP_update)", &VG_(unknown_SP_update),
+                    mkIRExprVec_2( IRExpr_Tmp(old_SP), st->Ist.Put.data ) 
+                 );
          addStmtToIRBB( bb, IRStmt_Dirty(dcall) );
-         addStmtToIRBB(bb,st);
+
+         addStmtToIRBB( bb, st );
 
          curr = st->Ist.Put.data->Iex.Tmp.tmp;
          delta = 0;
diff --git a/coregrind/x86-linux/ldt.c b/coregrind/x86-linux/ldt.c
index 3bcaef1..86173d1 100644
--- a/coregrind/x86-linux/ldt.c
+++ b/coregrind/x86-linux/ldt.c
@@ -168,6 +168,7 @@
 }
 
 
+#if 0
 /* Actually _DO_ the segment translation.  This is the whole entire
    point of this accursed, overcomplicated, baroque, pointless
    segment-override-and-LDT/GDT garbage foisted upon us all by Intel,
@@ -251,7 +252,7 @@
 
    return base + virtual_addr;
 }
-
+#endif
 
 /* Translate a struct modify_ldt_ldt_s to an VgLdtEntry, using the
    Linux kernel's logic (cut-n-paste of code in linux/kernel/ldt.c).  */
diff --git a/coregrind/x86-linux/syscalls.c b/coregrind/x86-linux/syscalls.c
index b2106f3..70d5758 100644
--- a/coregrind/x86-linux/syscalls.c
+++ b/coregrind/x86-linux/syscalls.c
@@ -129,7 +129,7 @@
    Assumes that the only thread state which matters is the contents of
    %eax-%ebp and the return value in %eax.
  */
-void VGA_(thread_syscall)(Int syscallno, arch_thread_t *arch, 
+void VGA_(thread_syscall)(Int syscallno, ThreadArchState *arch, 
                           enum PXState *state , enum PXState poststate)
 {
    do_thread_syscall(syscallno,            // syscall no.
@@ -147,7 +147,7 @@
 
 
 // Back up to restart a system call.
-void VGA_(restart_syscall)(arch_thread_t *arch)
+void VGA_(restart_syscall)(ThreadArchState *arch)
 {
    arch->vex.guest_EIP -= 2;             // sizeof(int $0x80)
 
diff --git a/coregrind/x86/core_arch.h b/coregrind/x86/core_arch.h
index dde329c..9a10dcd 100644
--- a/coregrind/x86/core_arch.h
+++ b/coregrind/x86/core_arch.h
@@ -41,15 +41,6 @@
    Interesting registers
    ------------------------------------------------------------------ */
 
-/* Generate a pointer into baseBlock via which we can prod the
-   Vex guest state. */
-#define BASEBLOCK_VEX  \
-   ((VexGuestX86State*)(&VG_(baseBlock)[VGOFF_(m_vex)]))
-
-/* Ditto the Vex shadow guest state. */
-#define BASEBLOCK_VEX_SHADOW  \
-   ((VexGuestX86State*)(&VG_(baseBlock)[VGOFF_(m_vex_shadow)]))
-
 // Accessors for the arch_thread_t
 #define ARCH_INSTR_PTR(regs)           ((regs).vex.guest_EIP)
 #define ARCH_STACK_PTR(regs)           ((regs).vex.guest_ESP)
@@ -71,11 +62,6 @@
 #define STACK_FRAME_RET(ebp)           (((UInt*)ebp)[1])
 #define STACK_FRAME_NEXT(ebp)          (((UInt*)ebp)[0])
 
-// Baseblock access to interesting registers
-#define BASEBLOCK_INSTR_PTR            BASEBLOCK_VEX->guest_EIP
-#define BASEBLOCK_STACK_PTR            BASEBLOCK_VEX->guest_ESP
-#define BASEBLOCK_FRAME_PTR            BASEBLOCK_VEX->guest_EBP
-
 // Get stack pointer and frame pointer
 #define ARCH_GET_REAL_STACK_PTR(esp) do {   \
    asm("movl %%esp, %0" : "=r" (esp));       \
@@ -85,28 +71,8 @@
    asm("movl %%ebp, %0" : "=r" (ebp));       \
 } while (0)
 
-
-/* -----------------------------------------------------
-   Read-write parts of baseBlock.
-   -------------------------------------------------- */
-
-/* State of the simulated CPU. */
-extern Int VGOFF_(m_vex);
-extern Int VGOFF_(m_vex_shadow);
-
-/* Reg-alloc spill area (VG_MAX_SPILLSLOTS words long). */
-extern Int VGOFF_(spillslots);
-
-
-/* -----------------------------------------------------
-   Read-only parts of baseBlock.
-   -------------------------------------------------- */
-
-/* This thread's LDT pointer. */
-extern Int VGOFF_(ldt);
-
-/* This thread's TLS pointer. */
-extern Int VGOFF_(tls_ptr);
+// So the dispatch loop can find %EIP
+extern Int VGOFF_(m_eip);
 
 
 /* ---------------------------------------------------------------------
@@ -151,35 +117,44 @@
     LdtEnt;
 } VgLdtEntry;
 
+
 /* ---------------------------------------------------------------------
-   Constants pertaining to the simulated CPU state, VG_(baseBlock),
-   which need to go here to avoid ugly circularities.
+   Architecture-specific part of a ThreadState
    ------------------------------------------------------------------ */
 
 // Architecture-specific part of a ThreadState
 // XXX: eventually this should be made abstract, ie. the fields not visible
 //      to the core...  then VgLdtEntry can be made non-visible to the core
 //      also.
-typedef struct {
-   /* Pointer to this thread's Local (Segment) Descriptor Table.
-      Starts out as NULL, indicating there is no table, and we hope to
-      keep it that way.  If the thread does __NR_modify_ldt to create
-      entries, we allocate a 8192-entry table at that point.  This is
-      a straight copy of the Linux kernel's scheme.  Don't forget to
-      deallocate this at thread exit. */
-   VgLdtEntry* ldt;
+typedef 
+   struct {
+      /* Pointer to this thread's Local (Segment) Descriptor Table.
+         Starts out as NULL, indicating there is no table, and we hope
+         to keep it that way.  If the thread does __NR_modify_ldt to
+         create entries, we allocate a 8192-entry table at that point.
+         This is a straight copy of the Linux kernel's scheme.  Don't
+         forget to deallocate this at thread exit. */
+      VgLdtEntry* ldt;
 
-   /* TLS table. This consists of a small number (currently 3) of
-      entries from the Global Descriptor Table. */
-   VgLdtEntry tls[VKI_GDT_ENTRY_TLS_ENTRIES];
+      /* TLS table. This consists of a small number (currently 3) of
+         entries from the Global Descriptor Table. */
+      VgLdtEntry tls[VKI_GDT_ENTRY_TLS_ENTRIES];
 
-   /* Saved machine context. */
-   VexGuestX86State vex;
+      /* --- BEGIN vex-mandated guest state --- */
 
-   /* Saved shadow context. */
-   VexGuestX86State vex_shadow;
-} 
-arch_thread_t;
+      /* Saved machine context. */
+      VexGuestX86State vex;
+
+      /* Saved shadow context. */
+      VexGuestX86State vex_shadow;
+
+      /* Spill area. */
+      UChar vex_spill[LibVEX_N_SPILL_BYTES];
+
+      /* --- END vex-mandated guest state --- */
+   } 
+   ThreadArchState;
+
 
 /* ---------------------------------------------------------------------
    libpthread stuff
diff --git a/coregrind/x86/core_arch_asm.h b/coregrind/x86/core_arch_asm.h
index 0f932f6..9e08991 100644
--- a/coregrind/x86/core_arch_asm.h
+++ b/coregrind/x86/core_arch_asm.h
@@ -42,19 +42,6 @@
 /* Offset of code in a TCEntry */
 #define VG_CODE_OFFSET		(8 + VG_MAX_JUMPS * 2)
 
-/* Debugging hack for assembly code ... sigh. */
-#if 0
-#define OYNK(nnn) pushal;  pushl $nnn; call VG_(oynk) ; addl $4,%esp; popal
-#else
-#define OYNK(nnn)
-#endif
-
-#if 0
-#define OYNNK(nnn) pushal;  pushl $nnn; call VG_(oynk) ; addl $4,%esp; popal
-#else
-#define OYNNK(nnn)
-#endif
-
 
 #endif   // __X86_CORE_ARCH_ASM_H
 
diff --git a/coregrind/x86/dispatch.S b/coregrind/x86/dispatch.S
index bd6a183..db4a9a8 100644
--- a/coregrind/x86/dispatch.S
+++ b/coregrind/x86/dispatch.S
@@ -65,10 +65,12 @@
    signal, for example SIGSEGV.  It then longjmp()s back past here.
 */
 
+/* signature: UInt VG_(run_innerloop) ( void* guest_state ) */
+
 .globl VG_(run_innerloop)
 VG_(run_innerloop):
-	/* OYNK(1000) */
-
+	/* 4(%esp) holds guest_state */
+	
 	/* ----- entry point to VG_(run_innerloop) ----- */
 	pushl	%ebx
 	pushl	%ecx
@@ -76,20 +78,12 @@
 	pushl	%esi
 	pushl	%edi
 	pushl	%ebp
-
-	/* check to see if we're doing pointer checking */
-	movb	VG_(clo_pointercheck), %al
-	testb	%al,%al
-	jz	1f
 	
-	pushl	%fs						/* save %fs     */
-	mov	$(VG_POINTERCHECK_SEGIDX << 3) + 7, %eax	/* load new %fs */
-	movw	%ax,%fs
+	/* 28(%esp) holds guest_state */
 
-1:	
 	/* Set up the baseBlock pointer */
-	movl	$VG_(baseBlock), %ebp
-
+	movl	28(%esp), %ebp
+	
 	/* fetch m_eip into %eax */
 	movl	VGOFF_(m_eip), %esi
 	movl	(%ebp, %esi, 4), %eax
@@ -97,7 +91,7 @@
 	/* fall into main loop */
 
 dispatch_boring:
-	/* save the jump address at VG_(baseBlock)[VGOFF_(m_eip)] */
+	/* save the jump address in the guest state */
 	movl	VGOFF_(m_eip), %esi
 	movl	%eax, (%ebp, %esi, 4)
 
@@ -117,7 +111,7 @@
 	   %ebp indicates further details of the control transfer
 	   requested to the address in %eax.
 	
-	   If ebp == & VG_(baseBlock), just jump next to %eax.
+	   If ebp is unchanged (== * 28(%esp)), just jump next to %eax.
 	 
 	   If ebp == VG_EBP_JMP_SYSCALL, do a system call before 
 	   continuing at eax.
@@ -128,7 +122,7 @@
 	   If %ebp has any other value, we panic.
 	*/
 
-	cmpl	$VG_(baseBlock), %ebp
+	cmpl	28(%esp), %ebp
 	jz	dispatch_boring
 
 	jmp	dispatch_exceptional
@@ -147,14 +141,7 @@
 	jmp	run_innerloop_exit
 	
 run_innerloop_exit:
-	movb	VG_(clo_pointercheck), %bl
-	testb	%bl,%bl
-	jz	1f
-
-	/* restore %fs */
-	popl	%fs
-	
-1:	popl	%ebp
+	popl	%ebp
 	popl	%edi
 	popl	%esi
 	popl	%edx
@@ -174,7 +161,8 @@
 
 	/* save %eax in %EIP and defer to sched */
 	movl	VGOFF_(m_eip), %esi
-	movl	%eax, VG_(baseBlock)(,%esi, 4)
+	movl	28(%esp), %edi
+	movl	%eax, (%edi, %esi, 4)
 	movl	%ebp, %eax
 	jmp	run_innerloop_exit
 
diff --git a/coregrind/x86/signal.c b/coregrind/x86/signal.c
index bd16a7f..59cf02b 100644
--- a/coregrind/x86/signal.c
+++ b/coregrind/x86/signal.c
@@ -299,31 +299,8 @@
 /*--- Making coredumps                                     ---*/
 /*------------------------------------------------------------*/
 
-void VGA_(fill_elfregs_from_BB)(struct vki_user_regs_struct* regs)
-{
-   regs->eflags = LibVEX_GuestX86_get_eflags(BASEBLOCK_VEX);
-   regs->esp    = BASEBLOCK_VEX->guest_ESP;
-   regs->eip    = BASEBLOCK_VEX->guest_EIP;
-
-   regs->ebx    = BASEBLOCK_VEX->guest_EBX;
-   regs->ecx    = BASEBLOCK_VEX->guest_ECX;
-   regs->edx    = BASEBLOCK_VEX->guest_EDX;
-   regs->esi    = BASEBLOCK_VEX->guest_ESI;
-   regs->edi    = BASEBLOCK_VEX->guest_EDI;
-   regs->ebp    = BASEBLOCK_VEX->guest_EBP;
-   regs->eax    = BASEBLOCK_VEX->guest_EAX;
-
-   regs->cs     = BASEBLOCK_VEX->guest_CS;
-   regs->ds     = BASEBLOCK_VEX->guest_DS;
-   regs->ss     = BASEBLOCK_VEX->guest_SS;
-   regs->es     = BASEBLOCK_VEX->guest_ES;
-   regs->fs     = BASEBLOCK_VEX->guest_FS;
-   regs->gs     = BASEBLOCK_VEX->guest_GS;
-}
-
-
 void VGA_(fill_elfregs_from_tst)(struct vki_user_regs_struct* regs, 
-                                 arch_thread_t* arch)
+                                 ThreadArchState* arch)
 {
    regs->eflags = LibVEX_GuestX86_get_eflags(&arch->vex);
    regs->esp    = arch->vex.guest_ESP;
@@ -371,7 +348,7 @@
 }
 
 void VGA_(fill_elffpregs_from_tst)( vki_elf_fpregset_t* fpu,
-                                    const arch_thread_t* arch)
+                                    const ThreadArchState* arch)
 {
   //fill_fpu(fpu, (const Char *)&arch->m_sse);
 }
@@ -382,7 +359,7 @@
 }
 
 void VGA_(fill_elffpxregs_from_tst) ( vki_elf_fpxregset_t* xfpu,
-                                      const arch_thread_t* arch )
+                                      const ThreadArchState* arch )
 {
   //VG_(memcpy)(xfpu, arch->m_sse, sizeof(*xfpu));
 }
diff --git a/coregrind/x86/state.c b/coregrind/x86/state.c
index e9a2185..3f710a9 100644
--- a/coregrind/x86/state.c
+++ b/coregrind/x86/state.c
@@ -38,49 +38,45 @@
 /*--- baseBlock setup and operations                       ---*/
 /*------------------------------------------------------------*/
 
-/* The variables storing offsets. */
-Int VGOFF_(m_vex) = INVALID_OFFSET;
-Int VGOFF_(m_vex_shadow) = INVALID_OFFSET;
-
-Int VGOFF_(ldt)   = INVALID_OFFSET;
-Int VGOFF_(tls_ptr) = INVALID_OFFSET;
 Int VGOFF_(m_eip) = INVALID_OFFSET;
 
-Int VGOFF_(spillslots) = INVALID_OFFSET;
 
-
-
-/* Here we assign actual offsets.  VEX dictates the layout (see
-   comment at the end of libvex.h).  
+/* Given a pointer to the ThreadArchState for thread 1 (the root
+   thread), initialise the VEX guest state, and copy in essential
+   starting values.
 */
-void VGA_(init_baseBlock) ( Addr client_eip, Addr esp_at_startup )
+void VGA_(init_thread1state) ( Addr client_eip, 
+                               Addr esp_at_startup,
+			       /*MOD*/ ThreadArchState* arch )
 {
    vg_assert(0 == sizeof(VexGuestX86State) % 8);
 
-   /* First the guest state. */
-   VGOFF_(m_vex) = VG_(alloc_BaB)( sizeof(VexGuestX86State) / 4 );
-
-   /* Then equal sized shadow state. */
-   VGOFF_(m_vex_shadow) = VG_(alloc_BaB)( sizeof(VexGuestX86State) / 4 );
-
-   /* Finally the spill area. */
-   VGOFF_(spillslots) = VG_(alloc_BaB)( LibVEX_N_SPILL_BYTES/4 );
-   if (0) VG_(printf)("SPILL SLOTS start at %d\n", VGOFF_(spillslots));
-
    /* Zero out the initial state, and set up the simulated FPU in a
       sane way. */
-   LibVEX_GuestX86_initialise(BASEBLOCK_VEX);
+   LibVEX_GuestX86_initialise(&arch->vex);
 
    /* Zero out the shadow area. */
-   VG_(memset)(BASEBLOCK_VEX_SHADOW, 0, sizeof(VexGuestX86State));
+   VG_(memset)(&arch->vex_shadow, 0, sizeof(VexGuestX86State));
 
    /* Put essential stuff into the new state. */
-   BASEBLOCK_VEX->guest_ESP = esp_at_startup;
-   BASEBLOCK_VEX->guest_EIP = client_eip;
+   /* initialise %cs, %ds and %ss to point at the operating systems
+      default code, data and stack segments */
+   arch->vex.guest_ESP = esp_at_startup;
+   arch->vex.guest_EIP = client_eip;
 
-   /* The dispatch loop needs to be able to find %EIP. */
-   VGOFF_(m_eip)
-      = VGOFF_(m_vex) + offsetof(VexGuestX86State,guest_EIP)/4;
+   asm volatile("movw %%cs, %0"
+                :
+                : "m" (arch->vex.guest_CS));
+   asm volatile("movw %%ds, %0"
+                :
+                : "m" (arch->vex.guest_DS));
+   asm volatile("movw %%ss, %0"
+                :
+                : "m" (arch->vex.guest_SS));
+
+   /* The dispatch loop needs to be able to find %EIP given a pointer
+      to the start of the .vex field. */
+   VGOFF_(m_eip) = offsetof(VexGuestX86State,guest_EIP)/4;
 
    if (VG_(needs).shadow_regs) {
       VG_TRACK( post_regs_write_init );
@@ -97,112 +93,8 @@
       else
          VG_(printf)("Looks like a MMX-only CPU\n");
    }
-
-   /* LDT pointer: pretend the root thread has an empty LDT to start with. */
-   VGOFF_(ldt)   = VG_(alloc_BaB_1_set)((UInt)NULL);
-
-   /* TLS pointer: pretend the root thread has no TLS array for now. */
-   VGOFF_(tls_ptr) = VG_(alloc_BaB_1_set)((UInt)NULL);
-
-   /* initialise %cs, %ds and %ss to point at the operating systems
-      default code, data and stack segments */
-   asm volatile("movw %%cs, %0"
-                :
-                : "m" (BASEBLOCK_VEX->guest_CS));
-   asm volatile("movw %%ds, %0"
-                :
-                : "m" (BASEBLOCK_VEX->guest_DS));
-   asm volatile("movw %%ss, %0"
-                :
-                : "m" (BASEBLOCK_VEX->guest_SS));
 }
 
-/* Junk to fill up a thread's shadow regs with when shadow regs aren't
-   being used. */
-#define VG_UNUSED_SHADOW_REG_VALUE  0x27182818
-
-void VGA_(load_state) ( arch_thread_t* arch, ThreadId tid )
-{
-   VG_(baseBlock)[VGOFF_(ldt)]  = (UInt)arch->ldt;
-   VG_(baseBlock)[VGOFF_(tls_ptr)]  = (UInt)arch->tls;
-
-   *BASEBLOCK_VEX = arch->vex;
-
-   if (VG_(needs).shadow_regs) {
-      *BASEBLOCK_VEX_SHADOW = arch->vex_shadow;
-   } else {
-      /* Fields shouldn't be used -- check their values haven't changed. */
-     /* ummm ...
-      vg_assert(
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_eax &&
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_ebx &&
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_ecx &&
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_edx &&
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_esi &&
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_edi &&
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_ebp &&
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_esp &&
-         VG_UNUSED_SHADOW_REG_VALUE == arch->sh_eflags);
-     */
-   }
-}
-
-void VGA_(save_state)( arch_thread_t *arch, ThreadId tid )
-{
-   Int i;
-   const UInt junk = 0xDEADBEEF;
-
-   /* We don't copy out the LDT entry, because it can never be changed
-      by the normal actions of the thread, only by the modify_ldt
-      syscall, in which case we will correctly be updating
-      VG_(threads)[tid].ldt.  This printf happens iff the following
-      assertion fails. */
-   if ((void*)arch->ldt != (void*)VG_(baseBlock)[VGOFF_(ldt)])
-      VG_(printf)("VG_(threads)[%d].ldt=%p  VG_(baseBlock)[VGOFF_(ldt)]=%p\n",
-                 tid, (void*)arch->ldt,
-                       (void*)VG_(baseBlock)[VGOFF_(ldt)]);
-
-   vg_assert((void*)arch->ldt == (void*)VG_(baseBlock)[VGOFF_(ldt)]);
-
-   /* We don't copy out the TLS entry, because it can never be changed
-      by the normal actions of the thread, only by the set_thread_area
-      syscall, in which case we will correctly be updating
-      arch->tls.  This printf happens iff the following
-      assertion fails. */
-   if ((void*)arch->tls != (void*)VG_(baseBlock)[VGOFF_(tls_ptr)])
-      VG_(printf)("VG_(threads)[%d].tls=%p  VG_(baseBlock)[VGOFF_(tls_ptr)]=%p\
-n",
-                 tid, (void*)arch->tls,
-                       (void*)VG_(baseBlock)[VGOFF_(tls_ptr)]);
-
-   vg_assert((void*)arch->tls
-             == (void*)VG_(baseBlock)[VGOFF_(tls_ptr)]);
-
-   arch->vex = *BASEBLOCK_VEX;
-
-   if (VG_(needs).shadow_regs) {
-      arch->vex_shadow = *BASEBLOCK_VEX_SHADOW;
-   } else {
-      /* Fill with recognisable junk */
-      /* can't easily do this ...
-      arch->sh_eax =
-      arch->sh_ebx =
-      arch->sh_ecx =
-      arch->sh_edx =
-      arch->sh_esi =
-      arch->sh_edi =
-      arch->sh_ebp =
-      arch->sh_esp =
-      arch->sh_eflags = VG_UNUSED_SHADOW_REG_VALUE;
-      */
-   }
-   /* Fill it up with junk. */
-   VG_(baseBlock)[VGOFF_(ldt)] = junk;
-   VG_(baseBlock)[VGOFF_(tls_ptr)] = junk;
-
-   for (i = 0; i < (3 + sizeof(VexGuestX86State)) / 4; i++)
-      VG_(baseBlock)[VGOFF_(m_vex) + i] = junk;
-}
 
 /*------------------------------------------------------------*/
 /*--- Register access stuff                                ---*/
@@ -254,48 +146,18 @@
    }
 }
 
-/* Return the baseBlock index for the specified shadow register */
-static Int shadow_reg_index ( Int arch )
-{
-   if (0)
-   VG_(printf)("shadow_reg_index(%d)\n",
-               arch);
-   switch (arch) {
-      case R_EAX: return VGOFF_(m_vex_shadow) + offsetof(VexGuestX86State,guest_EAX)/4;
-      case R_ECX: return VGOFF_(m_vex_shadow) + offsetof(VexGuestX86State,guest_ECX)/4;
-      case R_EDX: return VGOFF_(m_vex_shadow) + offsetof(VexGuestX86State,guest_EDX)/4;
-      case R_EBX: return VGOFF_(m_vex_shadow) + offsetof(VexGuestX86State,guest_EBX)/4;
-      case R_ESP: return VGOFF_(m_vex_shadow) + offsetof(VexGuestX86State,guest_ESP)/4;
-      case R_EBP: return VGOFF_(m_vex_shadow) + offsetof(VexGuestX86State,guest_EBP)/4;
-      case R_ESI: return VGOFF_(m_vex_shadow) + offsetof(VexGuestX86State,guest_ESI)/4;
-      case R_EDI: return VGOFF_(m_vex_shadow) + offsetof(VexGuestX86State,guest_EDI)/4;
-      default:    VG_(core_panic)( "shadow_reg_index");
-   }
-}
-
-/* Accessing shadow arch. registers */
-UInt VGA_(get_shadow_archreg) ( UInt archreg )
-{
-   return VG_(baseBlock)[ shadow_reg_index(archreg) ];
-}
-
 
 /*------------------------------------------------------------*/
 /*--- Thread stuff                                         ---*/
 /*------------------------------------------------------------*/
 
-void VGA_(clear_thread)( arch_thread_t *arch )
+void VGA_(clear_thread)( ThreadArchState *arch )
 {
    arch->ldt = NULL;
    VG_(clear_TLS_for_thread)(arch->tls);
 }  
 
-void VGA_(init_thread)( arch_thread_t *arch )
-{
-   VG_(baseBlock)[VGOFF_(tls_ptr)] = (UInt)arch->tls;
-}  
-
-void VGA_(cleanup_thread) ( arch_thread_t *arch )
+void VGA_(cleanup_thread) ( ThreadArchState *arch )
 {  
    /* Deallocate its LDT, if it ever had one. */
    VG_(deallocate_LDT_for_thread)( arch->ldt ); 
@@ -305,21 +167,19 @@
    VG_(clear_TLS_for_thread)( arch->tls );
 }  
 
-void VGA_(setup_child) ( arch_thread_t *regs, arch_thread_t *parent_regs )
+void VGA_(setup_child) ( ThreadArchState *arch, ThreadArchState *parent_arch )
 {  
    /* We inherit our parent's LDT. */
-   if (parent_regs->ldt == NULL) {
+   if (parent_arch->ldt == NULL) {
       /* We hope this is the common case. */
-      VG_(baseBlock)[VGOFF_(ldt)] = 0;
+      arch->ldt = NULL;
    } else {
       /* No luck .. we have to take a copy of the parent's. */
-      regs->ldt = VG_(allocate_LDT_for_thread)( parent_regs->ldt );
-      VG_(baseBlock)[VGOFF_(ldt)] = (UInt) regs->ldt;
+      arch->ldt = VG_(allocate_LDT_for_thread)( parent_arch->ldt );
    }
 
    /* Initialise the thread's TLS array */
-   VG_(clear_TLS_for_thread)( regs->tls );
-   VG_(baseBlock)[VGOFF_(tls_ptr)] = (UInt) regs->tls;
+   VG_(clear_TLS_for_thread)( arch->tls );
 }  
 
 void VGA_(set_arg_and_bogus_ret)( ThreadId tid, UWord arg, Addr ret )
@@ -327,7 +187,8 @@
    /* Push the arg, and mark it as readable. */
    SET_PTHREQ_ESP(tid, VG_(threads)[tid].arch.vex.guest_ESP - sizeof(UWord));
    * (UInt*)(VG_(threads)[tid].arch.vex.guest_ESP) = arg;
-   VG_TRACK( post_mem_write, VG_(threads)[tid].arch.vex.guest_ESP, sizeof(void*) );
+   VG_TRACK( post_mem_write, VG_(threads)[tid].arch.vex.guest_ESP, 
+                             sizeof(void*) );
 
    /* Don't mark the pushed return address as readable; any attempt to read
       this is an internal valgrind bug since thread_exit_wrapper() should not
@@ -360,22 +221,7 @@
 /*--- Symtab stuff                                         ---*/
 /*------------------------------------------------------------*/
 
-UInt *VGA_(reg_addr_from_BB)(Int regno)
-{
-   switch (regno) {
-   case R_EAX: return &(BASEBLOCK_VEX->guest_EAX);
-   case R_ECX: return &(BASEBLOCK_VEX->guest_ECX);
-   case R_EDX: return &(BASEBLOCK_VEX->guest_EDX);
-   case R_EBX: return &(BASEBLOCK_VEX->guest_EBX);
-   case R_ESP: return &(BASEBLOCK_VEX->guest_ESP);
-   case R_EBP: return &(BASEBLOCK_VEX->guest_EBP);
-   case R_ESI: return &(BASEBLOCK_VEX->guest_ESI);
-   case R_EDI: return &(BASEBLOCK_VEX->guest_EDI);
-   default:    return NULL;
-   }
-}
-
-UInt *VGA_(reg_addr_from_tst)(Int regno, arch_thread_t *arch)
+UInt *VGA_(reg_addr_from_tst)(Int regno, ThreadArchState *arch)
 {
    switch (regno) {
    case R_EAX: return &arch->vex.guest_EAX;
@@ -422,31 +268,7 @@
 /*--- Debugger-related operations                          ---*/
 /*------------------------------------------------------------*/
 
-Int VGA_(ptrace_setregs_from_BB)(Int pid)
-{
-   struct vki_user_regs_struct regs;
-
-   regs.cs     = BASEBLOCK_VEX->guest_CS;
-   regs.ss     = BASEBLOCK_VEX->guest_SS;
-   regs.ds     = BASEBLOCK_VEX->guest_DS;
-   regs.es     = BASEBLOCK_VEX->guest_ES;
-   regs.fs     = BASEBLOCK_VEX->guest_FS;
-   regs.gs     = BASEBLOCK_VEX->guest_GS;
-   regs.eax    = BASEBLOCK_VEX->guest_EAX;
-   regs.ebx    = BASEBLOCK_VEX->guest_EBX;
-   regs.ecx    = BASEBLOCK_VEX->guest_ECX;
-   regs.edx    = BASEBLOCK_VEX->guest_EDX;
-   regs.esi    = BASEBLOCK_VEX->guest_ESI;
-   regs.edi    = BASEBLOCK_VEX->guest_EDI;
-   regs.ebp    = BASEBLOCK_VEX->guest_EBP;
-   regs.esp    = BASEBLOCK_VEX->guest_ESP;
-   regs.eflags = LibVEX_GuestX86_get_eflags(BASEBLOCK_VEX);
-   regs.eip    = BASEBLOCK_VEX->guest_EIP;
-
-   return ptrace(PTRACE_SETREGS, pid, NULL, &regs);
-}
-
-Int VGA_(ptrace_setregs_from_tst)(Int pid, arch_thread_t* arch)
+Int VGA_(ptrace_setregs_from_tst)(Int pid, ThreadArchState* arch)
 {
    struct vki_user_regs_struct regs;