Merge in a somewhat modified version of Jeremy Fitzhardinge's
translation chaining patch.

47-chained-bb

This implements basic-block chaining. Rather than always going through
the dispatch loop, a BB may jump directly to a successor BB if that
successor is already present in the translation cache.
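
As a rough illustration, the "is the successor already translated?"
test is just a masked index into the fast translation table followed by
a compare against the entry's orig_addr field.  The C sketch below is
illustrative only (simplified names, an assumed table size); the real
code is the TT_LOOKUP macro in vg_dispatch.S and the tt_fast handling
in vg_transtab.c.

    #include <stdint.h>

    typedef uint32_t Addr;

    /* Miniature stand-in for a TCEntry; the real layout is in
       vg_transtab.c. */
    typedef struct { Addr orig_addr; /* translated code follows */ } TCEntry;

    #define TT_FAST_SIZE 0x2000              /* assumed; must be a power of two */
    #define TT_FAST_MASK (TT_FAST_SIZE - 1)

    extern TCEntry* tt_fast[TT_FAST_SIZE];   /* stands in for VG_(tt_fast) */

    /* Return the cached translation for eip, or NULL, in which case the
       caller must fall back to the dispatch loop / translator. */
    static TCEntry* fast_lookup ( Addr eip )
    {
       TCEntry* tce = tt_fast[eip & TT_FAST_MASK];
       return (tce != NULL && tce->orig_addr == eip) ? tce : NULL;
    }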

When the BB's code is first generated, the jumps to its successor BBs
are filled with undefined instructions. When the BB is inserted into
the translation cache, those undefined instructions are replaced with a
call to VG_(patch_me). When VG_(patch_me) is called, it looks up the
desired target address in the fast translation cache. If the target is
present, it backpatches the call to VG_(patch_me) with a jump to the
translated target BB. If the fast lookup fails, it falls back to the
normal dispatch loop.
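
The backpatch itself is tiny: the 5-byte "call VG_(patch_me)" at the
jump site is overwritten in place with a 5-byte pc-relative jmp to the
translated target.  Below is a hedged C sketch of that rewrite (the
real thing is the assembly in VG_(patch_me) in vg_dispatch.S;
VG_(unchain_jumpsite) in vg_from_ucode.c performs the inverse):

    #include <stdint.h>

    typedef uint32_t Addr;           /* 32-bit address, as on x86 here */

    #define PATCHME_CALLSZ 5         /* mirrors VG_PATCHME_CALLSZ */

    /* Overwrite the call at jump site 'site' with "jmp rel32" to
       'target'; both addresses lie inside the translation cache.
       Illustrative only. */
    static void chain_jumpsite ( Addr site, Addr target )
    {
       unsigned char* cp = (unsigned char*)site;
       /* rel32 is measured from the end of the 5-byte jmp */
       int32_t delta = (int32_t)(target - (site + PATCHME_CALLSZ));

       cp[0] = 0xE9;                 /* opcode: jmp rel32 */
       cp[1] = (delta >>  0) & 0xff;
       cp[2] = (delta >>  8) & 0xff;
       cp[3] = (delta >> 16) & 0xff;
       cp[4] = (delta >> 24) & 0xff;
    }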

When part of the translation cache is discarded, any chained jumps
pointing into the discarded code are unchained, so as to ensure we
don't have direct jumps to code which has been thrown away.
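
Concretely, before a sector is thrown away the surviving sectors are
walked, and every jump site whose chained destination lands inside the
doomed sector is rewritten back into a call to VG_(patch_me).  A sketch
of the per-site check, in terms of the helpers this patch adds (the
surrounding iteration is unchain_sector() in vg_transtab.c):

    /* Undo chaining for one jump site if its destination points into
       the sector [doomed_base, doomed_base+doomed_len) that is about
       to be discarded.  Sketch only; Addr, UInt and the VG_() helpers
       are the ones declared in vg_include.h by this patch. */
    static void maybe_unchain_site ( Addr jumpsite, Addr doomed_base, UInt doomed_len )
    {
       Addr dest = VG_(get_jmp_dest)(jumpsite);    /* 0 if not a chained jmp */
       if (dest >= doomed_base && dest < doomed_base + doomed_len)
          VG_(unchain_jumpsite)(jumpsite);         /* back to "call VG_(patch_me)" */
    }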

This optimisation only affects direct jumps; indirect jumps (including
returns) still go through the dispatch loop.  The -v stats indicate a
worst-case rate of about 16% of jumps having to go via the slow
mechanism; these will be a combination of function returns and genuine
indirect jumps.

Some of the dispatch loop's work has to move into each basic block,
namely updating the virtual EIP and decrementing the basic-block
(timeslice) counter.
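
In C terms, each generated block now begins by spending one unit of
timeslice (bailing out to the scheduler with VG_TRC_INNER_COUNTERZERO
when it reaches zero) and writes the target eip into the baseBlock
before every chained jump.  The sketch below is purely illustrative;
the real versions are the x86 sequences emitted by VG_(emit_code) and
synth_jmp_lit in vg_from_ucode.c:

    typedef unsigned int Addr;
    typedef unsigned int UInt;

    #define TRC_INNER_COUNTERZERO 29      /* VG_TRC_INNER_COUNTERZERO */

    extern UInt dispatch_ctr;             /* stands in for VG_(dispatch_ctr) */
    extern UInt baseBlock[];              /* stands in for VG_(baseBlock) */
    extern int  off_m_eip;                /* stands in for VGOFF_(m_eip) */

    /* Emitted at the top of every translated block: spend one unit of
       timeslice; if it is exhausted, return to the dispatch loop so
       the scheduler can run. */
    static int bb_entry_check ( void )
    {
       if (--dispatch_ctr == 0)
          return TRC_INNER_COUNTERZERO;
       return 0;
    }

    /* Emitted just before each chained (direct) jump: keep the
       simulated EIP current, since the dispatch loop no longer sees
       this control transfer. */
    static void bb_update_eip ( Addr next_eip )
    {
       baseBlock[off_m_eip] = next_eip;
    }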

At present, basic block chaining seems to improve performance by up to
25% with --skin=none.  Gains for skins adding more instrumentation
will be correspondingly smaller.

There is a command line option: --chain-bb=yes|no (defaults to yes).


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1336 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_constants.h b/coregrind/vg_constants.h
index 926fa01..4bc4351 100644
--- a/coregrind/vg_constants.h
+++ b/coregrind/vg_constants.h
@@ -53,6 +53,17 @@
 #define VG_TRC_INNER_COUNTERZERO  29 /* TRC only; means bb ctr == 0 */
 #define VG_TRC_UNRESUMABLE_SIGNAL 37 /* TRC only; got sigsegv/sigbus */
 
+/* size of call instruction put into generated code at jump sites */
+#define VG_PATCHME_CALLSZ	5
+
+/* size of jmp instruction which overwrites the call */
+#define VG_PATCHME_JMPSZ	5
+
+/* maximum number of normal jumps which can appear in a basic block */
+#define VG_MAX_JUMPS		2
+
+/* Offset of code in a TCEntry */
+#define VG_CODE_OFFSET		(8 + VG_MAX_JUMPS * 2)
 
 /* Debugging hack for assembly code ... sigh. */
 #if 0
diff --git a/coregrind/vg_dispatch.S b/coregrind/vg_dispatch.S
index 195e290..08ae86b 100644
--- a/coregrind/vg_dispatch.S
+++ b/coregrind/vg_dispatch.S
@@ -50,13 +50,20 @@
 */
 
 	
+#define TT_LOOKUP(reg, fail)				\
+	movl %eax, reg;					\
+	andl $VG_TT_FAST_MASK, reg;			\
+	movl VG_(tt_fast)(,reg,4), reg;			\
+	cmpl %eax, (reg);				\
+	jnz  fail
+	
 /* The C world needs a way to get started simulating.  So we provide
    a function void vg_run_innerloop ( void ), which starts running
    from vg_m_eip, and exits when the counter reaches zero.  This loop
    can also exit if vg_oursignalhandler() catches a non-resumable
    signal, for example SIGSEGV.  It then longjmp()s back past here.
 */
-	
+
 .globl VG_(run_innerloop)
 VG_(run_innerloop):
 	/* OYNK(1000) */
@@ -101,47 +108,16 @@
 	/* save the jump address at VG_(baseBlock)[VGOFF_(m_eip)] */
 	movl	VGOFF_(m_eip), %esi
 	movl	%eax, (%ebp, %esi, 4)
-	/* Are we out of timeslice?  If yes, defer to scheduler. */
-	decl	VG_(dispatch_ctr)
-	jz	counter_is_zero
-	/* try a fast lookup in the translation cache */
-	movl	%eax, %ebx
-	andl	$VG_TT_FAST_MASK, %ebx	
-	/* ebx = tt_fast index */
-	movl	VG_(tt_fast)(,%ebx,4), %ebx	
-	/* ebx points at a tc entry
-	   now compare target with the tce.orig_addr field (+0) */
-	cmpl	%eax, (%ebx)
-	jnz	fast_lookup_failed
-
-	/* Found a match.  Call the tce.payload field (+8) */
-	addl	$8, %ebx
-	call	*%ebx
-	
-	cmpl	$VG_(baseBlock), %ebp
-	jnz	dispatch_exceptional
-
-dispatch_boring_unroll2:
-	/* save the jump address at VG_(baseBlock)[VGOFF_(m_eip)] */
-	movl	VGOFF_(m_eip), %esi
-	movl	%eax, (%ebp, %esi, 4)
 
 	/* Are we out of timeslice?  If yes, defer to scheduler. */
-	decl	VG_(dispatch_ctr)
+	cmpl	$0, VG_(dispatch_ctr)
 	jz	counter_is_zero
-
 	/* try a fast lookup in the translation cache */
-	movl	%eax, %ebx
-	andl	$VG_TT_FAST_MASK, %ebx	
-	/* ebx = tt_fast index */
-	movl	VG_(tt_fast)(,%ebx,4), %ebx	
-	/* ebx points at a tc entry
-	   now compare target with the tce.orig_addr field (+0) */
-	cmpl	%eax, (%ebx)
-	jnz	fast_lookup_failed
+	TT_LOOKUP(%ebx, fast_lookup_failed)
 
-	/* Found a match.  Call the tce.payload field (+8) */
-	addl	$8, %ebx
+	/* Found a match.  Call the tce.payload field (+VG_CODE_OFFSET) */
+	addl	$VG_CODE_OFFSET, %ebx
+	incl	VG_(unchained_jumps_done)	      /* update stats */
 	call	*%ebx
 	
 	cmpl	$VG_(baseBlock), %ebp
@@ -180,7 +156,9 @@
 	jz	dispatch_syscall
 	cmpl	$VG_TRC_EBP_JMP_CLIENTREQ, %ebp
 	jz	dispatch_clientreq
-
+	cmpl	$VG_TRC_INNER_COUNTERZERO, %ebp
+	jz	counter_is_zero
+	
 	/* ebp has an invalid value ... crap out. */
 	pushl	$panic_msg_ebp
 	call	VG_(core_panic)
@@ -202,6 +180,34 @@
 	movl	$VG_TRC_EBP_JMP_CLIENTREQ, %eax
 	jmp	run_innerloop_exit
 
+
+/* This is the translation chainer, our run-time linker, if you like.
+	
+   This is entered with %eax holding the next eip we want.  If
+   we've already compiled that eip (i.e. we get a fast hit), we
+   backpatch the call instruction with a jump, and jump there.
+   Otherwise, we do a slow hit/compile through the normal path
+   (and get to do a backpatch next time through). 
+*/
+.globl VG_(patch_me)
+VG_(patch_me):
+	/* try a fast lookup in the translation cache */
+	TT_LOOKUP(%ebx, 1f)
+
+	/* Patch call instruction at callsite into a chained jmp */
+	popl	%eax	    /* eax = just after (VG_PATCHME_CALLSZ byte) call */
+	addl	$VG_CODE_OFFSET, %ebx	/* ebx = target eip */
+	subl	%eax, %ebx		/* ebx = delta */
+	movb	$0xE9, -(VG_PATCHME_CALLSZ-0)(%eax)		/* 0xe9 = jmp */
+	movl	%ebx,  -(VG_PATCHME_CALLSZ-1)(%eax)	       /* store delta */
+	addl	%eax, %ebx
+	incl	VG_(bb_enchain_count)			      /* update stats */
+	jmp	*%ebx					       /* jmp to dest */
+
+	/* tt_fast miss: return into main dispatch loop */
+1:	addl	$4, %esp	/* remove our call address */
+	ret			/* return into main dispatch loop above */
+	
 .data
 panic_msg_ebp:
 .ascii	"vg_dispatch: %ebp has invalid value!"
diff --git a/coregrind/vg_errcontext.c b/coregrind/vg_errcontext.c
index f00e6b6..004110a 100644
--- a/coregrind/vg_errcontext.c
+++ b/coregrind/vg_errcontext.c
@@ -416,7 +416,7 @@
 
       if ((i+1 == VG_(clo_dump_error))) {
 	VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to below NULLs */,
-                         p_min->where->eips[0], NULL, NULL, NULL );
+                         p_min->where->eips[0], NULL, NULL, NULL, NULL );
       }
 
       p_min->count = 1 << 30;
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index feff35a..11843ee 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -69,6 +69,10 @@
 static Int    emitted_code_used;
 static Int    emitted_code_size;
 
+/* offsets (in bytes into the basic block) of the chainable jump sites */
+static UShort jumps[VG_MAX_JUMPS];
+static Int    jumpidx;
+
 /* Statistics about C functions called from generated code. */
 static UInt ccalls                 = 0;
 static UInt ccall_reg_saves        = 0;
@@ -1040,6 +1044,110 @@
       VG_(printf)("\n\t\tret\n");
 }
 
+/* Predicate used in sanity checks elsewhere - returns True if the
+   jump site at a is an actual chained jump */
+Bool VG_(is_chained_jumpsite)(Addr a)
+{
+   UChar *cp = (UChar *)a;
+
+   return (*cp == 0xE9);		/* 0xE9 -- jmp */
+}
+
+/* Predicate used in sanity checks elsewhere - returns True if the
+   jump site at a is an unchained call to VG_(patch_me) */
+Bool VG_(is_unchained_jumpsite)(Addr a)
+{
+   UChar *cp = (UChar *)a;
+   Int delta = ((Addr)&VG_(patch_me)) - (a + VG_PATCHME_CALLSZ);
+   Int idelta;
+
+   if (*cp++ != 0xE8)	/* 0xE8 == call */
+      return False;
+
+   idelta  = (*cp++) <<  0;
+   idelta |= (*cp++) <<  8;
+   idelta |= (*cp++) << 16;
+   idelta |= (*cp++) << 24;
+      
+   return idelta == delta;
+}
+
+/* Return target address for a direct jmp */
+Addr VG_(get_jmp_dest)(Addr a)
+{
+   Int delta;
+   UChar *cp = (UChar *)a;
+
+   if (*cp++ != 0xE9)	/* 0xE9 == jmp */
+      return 0;
+
+   delta  = (*cp++) <<  0;
+   delta |= (*cp++) <<  8;
+   delta |= (*cp++) << 16;
+   delta |= (*cp++) << 24;
+
+   return a + VG_PATCHME_JMPSZ + delta;
+}
+
+/* unchain a BB by generating a call to VG_(patch_me) */
+void VG_(unchain_jumpsite)(Addr a)
+{
+   Int delta = ((Addr)&VG_(patch_me)) - (a + VG_PATCHME_CALLSZ);
+   UChar *cp = (UChar *)a;
+
+   if (VG_(is_unchained_jumpsite)(a))
+      return;			/* don't write unnecessarily */
+
+   *cp++ = 0xE8;		/* call */
+   *cp++ = (delta >>  0) & 0xff;
+   *cp++ = (delta >>  8) & 0xff;
+   *cp++ = (delta >> 16) & 0xff;
+   *cp++ = (delta >> 24) & 0xff;
+   VG_(bb_dechain_count)++;     /* update stats */
+}
+
+/* This doesn't actually generate a call to VG_(patch_me), but
+   reserves enough space in the instruction stream for it to happen
+   and records the offset into the jump table.  This is because call
+   is a relative jump, and so will be affected when this code gets
+   moved about.  The translation table will "unchain" this basic block
+   on insertion (with VG_(unchain_jumpsite)()), and thereby generate a
+   proper call instruction. */
+static void emit_call_patchme( void )
+{
+   vg_assert(VG_PATCHME_CALLSZ == 5);
+
+   VG_(new_emit)();
+
+   if (jumpidx >= VG_MAX_JUMPS) {
+      /* If there are too many jumps in this basic block, fall back to
+	 the dispatch loop.  We still need to keep it the same size as the
+	 call sequence. */
+      VG_(emitB) ( 0xC3 );	/* ret */
+      VG_(emitB) ( 0x90 );	/* nop */
+      VG_(emitB) ( 0x90 );	/* nop */
+      VG_(emitB) ( 0x90 );	/* nop */
+      VG_(emitB) ( 0x90 );	/* nop */
+
+      if (dis)
+	 VG_(printf)("\n\t\tret; nop; nop; nop; nop\n");
+
+      if (0 && VG_(clo_verbosity))
+	 VG_(message)(Vg_DebugMsg, "too many chained jumps in basic-block");
+   } else {
+      jumps[jumpidx++] = emitted_code_used;
+      
+      VG_(emitB) ( 0x0F );		/* UD2 - undefined instruction */
+      VG_(emitB) ( 0x0B );
+      VG_(emitB) ( 0x0F );		/* UD2 - undefined instruction */
+      VG_(emitB) ( 0x0B );
+      VG_(emitB) ( 0x90 );		/* NOP */
+
+      if (dis)
+	 VG_(printf)("\n\t\tud2; ud2; nop\n");
+   }
+}   
+
 void VG_(emit_pushal) ( void )
 {
    VG_(new_emit)();
@@ -1410,13 +1518,20 @@
    emit_ret();
 }
 
+static void synth_mov_reg_offregmem ( Int size, Int reg, Int off, Int areg );
 
 /* Same deal as synth_jmp_reg. */
 static void synth_jmp_lit ( Addr addr, JmpKind jmpkind )
 {
-   load_ebp_from_JmpKind ( jmpkind );
    VG_(emit_movv_lit_reg) ( 4, addr, R_EAX );
-   emit_ret();
+
+   if (VG_(clo_chain_bb) && (jmpkind == JmpBoring || jmpkind == JmpCall)) {
+      synth_mov_reg_offregmem(4, R_EAX, 4*VGOFF_(m_eip), R_EBP); /* update EIP */
+      emit_call_patchme();
+   } else {
+      load_ebp_from_JmpKind ( jmpkind );
+      emit_ret();
+   }
 }
 
 
@@ -1436,7 +1551,23 @@
    6                    xyxyxy:
   */
    emit_get_eflags();
-   VG_(emit_jcondshort_delta) ( invertCondition(cond), 5+1 );
+   if (VG_(clo_chain_bb)) {
+      /* When using BB chaining, the jump sequence is:
+        jmp short if not cond to xyxyxy
+        addr -> eax
+        eax -> VGOFF_(m_eip)(%ebp)
+        call VG_(patch_me)/jmp target
+        xyxyxy
+	 
+		je     1f
+		mov    $0x4000d190,%eax			// 5
+		mov    %eax, VGOFF_(m_eip)(%ebp)	// 3
+		call   0x40050f9a <vgPlain_patch_me>	// 5
+	1:	mov    $0x4000d042,%eax
+		call   0x40050f9a <vgPlain_patch_me>
+      */
+      VG_(emit_jcondshort_delta) ( invertCondition(cond), 5+3+5 );
+   } else
+      VG_(emit_jcondshort_delta) ( invertCondition(cond), 5+1 );
    synth_jmp_lit ( addr, JmpBoring );
 }
 
@@ -1450,7 +1581,10 @@
       next:
    */
    VG_(emit_cmpl_zero_reg) ( reg );
-   VG_(emit_jcondshort_delta) ( CondNZ, 5+1 );
+   if (VG_(clo_chain_bb))
+      VG_(emit_jcondshort_delta) ( CondNZ, 5+3+5 );
+   else
+      VG_(emit_jcondshort_delta) ( CondNZ, 5+1 );
    synth_jmp_lit ( addr, JmpBoring );
 }
 
@@ -1965,8 +2099,8 @@
    there is any chance at all that the code generated for a UInstr
    will change the real FPU state.  
 */
-static Bool emitUInstr ( UCodeBlock* cb, Int i, RRegSet regs_live_before, 
-                         Bool fplive )
+static Bool emitUInstr ( UCodeBlock* cb, Int i, 
+                         RRegSet regs_live_before, Bool fplive )
 {
    Int     old_emitted_code_used;
    UInstr* u = &cb->instrs[i];
@@ -2466,18 +2600,31 @@
 
 /* Emit x86 for the ucode in cb, returning the address of the
    generated code and setting *nbytes to its size. */
-UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes )
+UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes, UShort j[VG_MAX_JUMPS] )
 {
    Int i;
    UChar regs_live_before = 0;   /* No regs live at BB start */
    Bool fplive;
-
+   
    emitted_code_used = 0;
    emitted_code_size = 500; /* reasonable initial size */
    emitted_code = VG_(arena_malloc)(VG_AR_JITTER, emitted_code_size);
+   jumpidx = 0;
 
    if (dis) VG_(printf)("Generated x86 code:\n");
 
+   /* Generate decl VG_(dispatch_ctr) and drop into dispatch if we hit
+      zero.  We have to do this regardless of whether we're t-chaining
+      or not. */
+   VG_(new_emit)();
+   VG_(emitB) (0xFF);	/* decl */
+   emit_amode_litmem_reg((Addr)&VG_(dispatch_ctr), 1);
+   if (dis)
+      VG_(printf)("\n\t\tdecl (%p)\n", &VG_(dispatch_ctr));
+   VG_(emit_jcondshort_delta)(CondNZ, 5+1);
+   VG_(emit_movv_lit_reg) ( 4, VG_TRC_INNER_COUNTERZERO, R_EBP );
+   emit_ret();
+
    fplive = False;
    for (i = 0; i < cb->used; i++) {
       UInstr* u = &cb->instrs[i];
@@ -2497,6 +2644,12 @@
    if (dis) VG_(printf)("\n");
    vg_assert(!fplive);	/* FPU state must be saved by end of BB */
 
+   if (j != NULL) {
+      vg_assert(jumpidx <= VG_MAX_JUMPS);
+      for(i = 0; i < jumpidx; i++)
+	 j[i] = jumps[i];
+   }
+
    /* Returns a pointer to the emitted code.  This will have to be
       copied by the caller into the translation cache, and then freed */
    *nbytes = emitted_code_used;
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index 5fa88b4..00fa4ba 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -244,6 +244,8 @@
    is ignored.  Ie if a skin says no, I don't want this to run, that
    cannot be overridden from the command line. */
 extern Bool  VG_(clo_run_libc_freeres);
+/* Use the basic-block chaining optimisation */
+extern Bool VG_(clo_chain_bb);
 
 
 /* ---------------------------------------------------------------------
@@ -1045,11 +1047,16 @@
    Exports of vg_from_ucode.c
    ------------------------------------------------------------------ */
 
-extern UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes );
+extern UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes, UShort jumps[VG_MAX_JUMPS] );
 
 extern void   VG_(print_ccall_stats)      ( void );
 extern void   VG_(print_UInstr_histogram) ( void );
 
+extern void   VG_(unchain_jumpsite)	  ( Addr jumpsite );
+extern Addr   VG_(get_jmp_dest)           ( Addr jumpsite );
+extern Bool   VG_(is_unchained_jumpsite)  ( Addr jumpsite );
+extern Bool   VG_(is_chained_jumpsite)    ( Addr jumpsite );
+
 /* ---------------------------------------------------------------------
    Exports of vg_to_ucode.c
    ------------------------------------------------------------------ */
@@ -1062,6 +1069,7 @@
 
 /* Expandable arrays of uinstrs. */
 struct _UCodeBlock { 
+   Addr	   orig_eip;
    Int     used; 
    Int     size; 
    UInstr* instrs;
@@ -1074,7 +1082,8 @@
                                Addr  orig_addr,
                                UInt* orig_size,
                                Addr* trans_addr,
-                               UInt* trans_size );
+                               UInt* trans_size,
+			       UShort jumps[VG_MAX_JUMPS]);
 
 extern Char* VG_(nameCondcode)        ( Condcode cond );
 extern Bool  VG_(saneUInstr)          ( Bool beforeRA, Bool beforeLiveness,
@@ -1369,9 +1378,14 @@
 extern UInt VG_(overall_out_count);
 extern UInt VG_(overall_out_osize);
 extern UInt VG_(overall_out_tsize);
-
 /* The number of discards of TT/TC. */
 extern UInt VG_(number_of_tc_discards);
+/* Counts of chain and unchain operations done. */
+extern UInt VG_(bb_enchain_count);
+extern UInt VG_(bb_dechain_count);
+/* Number of unchained jumps performed. */
+extern UInt VG_(unchained_jumps_done);
+
 
 /* Counts pertaining to the register allocator. */
 
@@ -1445,7 +1459,8 @@
 extern void VG_(get_tt_tc_used) ( UInt* tt_used, UInt* tc_used );
 
 extern void VG_(add_to_trans_tab) ( Addr orig_addr,  Int orig_size,
-                                    Addr trans_addr, Int trans_size );
+                                    Addr trans_addr, Int trans_size,
+				    UShort jumps[VG_MAX_JUMPS]);
 
 extern void VG_(invalidate_translations) ( Addr start, UInt range );
 
@@ -1482,6 +1497,9 @@
    which means we need to defer to the scheduler. */
 extern UInt VG_(run_innerloop) ( void );
 
+/* The patching routine called when a BB wants to chain itself to
+   another. */
+extern UInt VG_(patch_me);
 
 /* ---------------------------------------------------------------------
    Exports of vg_helpers.S
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index b9792b4..4fed580 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -403,9 +403,13 @@
 UInt VG_(overall_out_count) = 0;
 UInt VG_(overall_out_osize) = 0;
 UInt VG_(overall_out_tsize) = 0;
-
 /* The number of discards of TT/TC. */
 UInt VG_(number_of_tc_discards) = 0;
+/* Counts of chain and unchain operations done. */
+UInt VG_(bb_enchain_count) = 0;
+UInt VG_(bb_dechain_count) = 0;
+/* Number of unchained jumps performed. */
+UInt VG_(unchained_jumps_done) = 0;
 
 
 /* Counts pertaining to the register allocator. */
@@ -468,6 +472,7 @@
 Int    VG_(clo_backtrace_size) = 4;
 Char*  VG_(clo_weird_hacks)    = NULL;
 Bool   VG_(clo_run_libc_freeres) = True;
+Bool   VG_(clo_chain_bb)       = True;
 
 /* This Bool is needed by wrappers in vg_clientmalloc.c to decide how
    to behave.  Initially we say False. */
@@ -558,6 +563,7 @@
 "    --single-step=no|yes      translate each instr separately? [no]\n"
 "    --optimise=no|yes         improve intermediate code? [yes]\n"
 "    --profile=no|yes          profile? (skin must be built for it) [no]\n"
+"    --chain-bb=no|yes         do basic-block chaining? [yes]\n"
 "    --trace-codegen=<XXXXX>   show generated code? (X = 0|1) [00000]\n"
 "    --trace-syscalls=no|yes   show all system calls? [no]\n"
 "    --trace-signals=no|yes    show signal handling details? [no]\n"
@@ -833,6 +839,11 @@
       else if (STREQ(argv[i], "--profile=no"))
          VG_(clo_profile) = False;
 
+      else if (STREQ(argv[i], "--chain-bb=yes"))
+	 VG_(clo_chain_bb) = True;
+      else if (STREQ(argv[i], "--chain-bb=no"))
+	 VG_(clo_chain_bb) = False;
+
       else if (STREQ(argv[i], "--single-step=yes"))
          VG_(clo_single_step) = True;
       else if (STREQ(argv[i], "--single-step=no"))
@@ -1174,6 +1185,9 @@
 		"    TT/TC: %d tc sectors discarded.",
                 VG_(number_of_tc_discards) );
    VG_(message)(Vg_DebugMsg,
+                "           %d chainings, %d unchainings.",
+                VG_(bb_enchain_count), VG_(bb_dechain_count) );
+   VG_(message)(Vg_DebugMsg,
                 "translate: new     %d (%d -> %d; ratio %d:10)",
                 VG_(overall_in_count),
                 VG_(overall_in_osize),
@@ -1186,10 +1200,19 @@
                 VG_(overall_out_tsize),
                 safe_idiv(10*VG_(overall_out_tsize), VG_(overall_out_osize)));
    VG_(message)(Vg_DebugMsg,
-      " dispatch: %lu basic blocks, %d/%d sched events, %d tt_fast misses.", 
-      VG_(bbs_done), VG_(num_scheduling_events_MAJOR), 
+      " dispatch: %lu jumps (bb entries), of which %u (%lu%%) were unchained.", 
+      VG_(bbs_done), 
+      VG_(unchained_jumps_done),
+      ((ULong)(100) * (ULong)(VG_(unchained_jumps_done)))
+         / ( VG_(bbs_done)==0 ? 1 : VG_(bbs_done) )
+   );
+
+   VG_(message)(Vg_DebugMsg,
+      "           %d/%d major/minor sched events.  %d tt_fast misses.", 
+                     VG_(num_scheduling_events_MAJOR), 
                      VG_(num_scheduling_events_MINOR), 
                      VG_(tt_fast_misses));
+
    VG_(message)(Vg_DebugMsg, 
                 "reg-alloc: %d t-req-spill, "
                 "%d+%d orig+spill uis, %d total-reg-r.",
diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c
index f11ed79..4e6561b 100644
--- a/coregrind/vg_scheduler.c
+++ b/coregrind/vg_scheduler.c
@@ -316,12 +316,17 @@
 static
 void create_translation_for ( ThreadId tid, Addr orig_addr )
 {
-   Addr trans_addr;
-   Int  orig_size, trans_size;
+   Addr   trans_addr;
+   Int    orig_size, trans_size;
+   UShort jumps[VG_MAX_JUMPS];
+   Int    i;
+
+   for(i = 0; i < VG_MAX_JUMPS; i++)
+      jumps[i] = (UShort)-1;
 
    /* Make a translation, into temporary storage. */
    VG_(translate)( &VG_(threads)[tid],
-                   orig_addr, &orig_size, &trans_addr, &trans_size );
+                   orig_addr, &orig_size, &trans_addr, &trans_size, jumps );
 
    /* Copy data at trans_addr into the translation cache. */
    /* Since the .orig_size and .trans_size fields are
@@ -329,7 +334,7 @@
    vg_assert(orig_size > 0 && orig_size < 65536);
    vg_assert(trans_size > 0 && trans_size < 65536);
 
-   VG_(add_to_trans_tab)( orig_addr, orig_size, trans_addr, trans_size );
+   VG_(add_to_trans_tab)( orig_addr, orig_size, trans_addr, trans_size, jumps );
 
    /* Free the intermediary -- was allocated by VG_(emit_code). */
    VG_(arena_free)( VG_AR_JITTER, (void*)trans_addr );
@@ -1579,7 +1584,7 @@
    VG_(printf)(
       "======vvvvvvvv====== LAST TRANSLATION ======vvvvvvvv======\n");
    VG_(translate)( &VG_(threads)[tid], 
-                   VG_(threads)[tid].m_eip, NULL, NULL, NULL );
+                   VG_(threads)[tid].m_eip, NULL, NULL, NULL, NULL );
    VG_(printf)("\n");
    VG_(printf)(
       "======^^^^^^^^====== LAST TRANSLATION ======^^^^^^^^======\n");
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 8f5ac3d..6904c56 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -55,6 +55,7 @@
 UCodeBlock* VG_(setup_UCodeBlock) ( UCodeBlock* cb_in )
 {
    UCodeBlock* cb = VG_(arena_malloc)(VG_AR_CORE, sizeof(UCodeBlock));
+   cb->orig_eip = cb_in->orig_eip;
    cb->used = cb->size = 0;
    cb->nextTemp = cb_in->nextTemp;
    cb->instrs = NULL;
@@ -1762,6 +1763,7 @@
    /* Resulting code goes here.  We generate it all in a forwards
       pass. */
    c2 = VG_(alloc_UCodeBlock)();
+   c2->orig_eip = c1->orig_eip;
 
    /* At the start, no TempRegs are assigned to any real register.
       Correspondingly, all temps claim to be currently resident in
@@ -2017,7 +2019,8 @@
 		      /*IN*/  Addr  orig_addr,  
                       /*OUT*/ UInt* orig_size,
                       /*OUT*/ Addr* trans_addr, 
-                      /*OUT*/ UInt* trans_size )
+                      /*OUT*/ UInt* trans_size,
+		      /*OUT*/ UShort jumps[VG_MAX_JUMPS])
 {
    Int         n_disassembled_bytes, final_code_size;
    Bool        debugging_translation;
@@ -2032,6 +2035,7 @@
       VG_TRACK( pre_mem_read, Vg_CoreTranslate, tst, "", orig_addr, 1 );
 
    cb = VG_(alloc_UCodeBlock)();
+   cb->orig_eip = orig_addr;
 
    /* If doing any code printing, print a basic block start marker */
    if (VG_(clo_trace_codegen)) {
@@ -2088,7 +2092,7 @@
    VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(5);
 
    VGP_PUSHCC(VgpFromUcode);
-   final_code = VG_(emit_code)(cb, &final_code_size );
+   final_code = VG_(emit_code)(cb, &final_code_size, jumps );
    VGP_POPCC(VgpFromUcode);
    VG_(free_UCodeBlock)(cb);
 
diff --git a/coregrind/vg_transtab.c b/coregrind/vg_transtab.c
index 8372d6c..630e48a 100644
--- a/coregrind/vg_transtab.c
+++ b/coregrind/vg_transtab.c
@@ -30,6 +30,7 @@
 */
 
 #include "vg_include.h"
+#include <stddef.h>
 
 /* #define DEBUG_TRANSTAB */
 
@@ -58,6 +59,10 @@
 
 /*------------------ TYPES ------------------*/
 
+#define CODE_ALIGNMENT	16	/* alignment of TCEntries */
+#define CODE_ALIGN(a)	(((a)+CODE_ALIGNMENT-1) & ~(CODE_ALIGNMENT-1))
+#define IS_ALIGNED(a)	(((a) & (CODE_ALIGNMENT-1)) == 0)
+
 /* An entry in TC.  Payload is always padded out to a 4-aligned
    quantity so that these structs are always word-aligned. */
 typedef
@@ -65,7 +70,8 @@
       /* +0 */ Addr   orig_addr;
       /* +4 */ UShort orig_size;
       /* +6 */ UShort trans_size;
-      /* +8 */ UChar  payload[0];
+      /* +8 */ UShort jump_sites[VG_MAX_JUMPS];
+      /* +VG_CODE_OFFSET */ UChar  payload[0];
    }
    TCEntry;
 
@@ -130,6 +136,50 @@
    vg_dispatch.S. */
 Addr /* TCEntry*, really */ VG_(tt_fast)[VG_TT_FAST_SIZE];
 
+static void for_each_tc(Int sector, void (*fn)(TCEntry *));
+
+
+/*------------------ T-CHAINING HELPERS ------------------*/
+
+static
+void for_each_jumpsite(TCEntry *tce, void (*fn)(Addr))
+{
+   Int i;
+   for(i = 0; i < VG_MAX_JUMPS; i++) {
+      Addr a;
+      UShort idx = tce->jump_sites[i];
+
+      if (idx == (UShort)-1)
+	 continue;
+      
+      a = (Addr)&tce->payload[idx];
+
+      (*fn)(a);
+   }
+}
+
+static inline
+void unchain_tce(TCEntry *tce)
+{
+   for_each_jumpsite(tce, VG_(unchain_jumpsite));
+}
+
+/* Unchain any jumps pointing to a sector we're about to free */
+static
+void unchain_sector(Int s, Addr base, UInt len)
+{
+   void unchain_site(Addr a) {
+      Addr jmp = VG_(get_jmp_dest)(a);
+      if (jmp >= base && jmp < (base+len))
+	 VG_(unchain_jumpsite)(a);
+   }
+   void _unchain_tce(TCEntry *tce) {
+      for_each_jumpsite(tce, unchain_site);
+   }
+
+   for_each_tc(s, _unchain_tce);
+}
+
 
 /*------------------ TT HELPERS ------------------*/
 
@@ -176,6 +226,7 @@
       if (i == VG_TT_SIZE) 
          i = 0;
    }
+
    vg_tt[i].orig_addr = tce->orig_addr;
    vg_tt[i].tcentry = tce;
    vg_tt_used++;
@@ -221,24 +272,13 @@
 void rebuild_TT ( void )
 {
    Int      s;
-   UChar*   pc;
-   UChar*   pc_lim;
-   TCEntry* tce;
 
    /* Throw away TT. */
    initialise_tt();
    
    /* Rebuild TT from the remaining quarters. */
    for (s = 0; s < VG_TC_N_SECTORS; s++) {
-      pc     = &(vg_tc[s][0]);
-      pc_lim = &(vg_tc[s][vg_tc_used[s]]);
-      while (True) {
-         if (pc >= pc_lim) break;
-         tce = (TCEntry*)pc;
-         pc += sizeof(TCEntry) + tce->trans_size;
-         if (tce->orig_addr != VG_TTE_DELETED)
-            add_tt_entry(tce);
-      }
+      for_each_tc(s, add_tt_entry);
    }
    pp_tt_tc_status ( "after  rebuild of TC" );
 }
@@ -246,6 +286,24 @@
 
 /*------------------ TC HELPERS ------------------*/
 
+static
+void for_each_tc(Int s, void (*fn)(TCEntry *))
+{
+   UChar *pc;
+   UChar *pc_lim;
+   TCEntry *tce;
+
+   pc     = &(vg_tc[s][0]);
+   pc_lim = &(vg_tc[s][vg_tc_used[s]]);
+   while (True) {
+      if (pc >= pc_lim) break;
+      tce = (TCEntry*)pc;
+      pc += sizeof(TCEntry) + tce->trans_size;
+      if (tce->orig_addr != VG_TTE_DELETED)
+	 (*fn)(tce);
+   }
+}
+
 /* Find the oldest non-NULL, non-empty sector, or -1 if none such. */
 static 
 Int find_oldest_sector ( void ) 
@@ -274,9 +332,17 @@
    Char msg[100];
    Int s = find_oldest_sector();
    if (s != -1) {
+      Int i;
+
       vg_assert(s >= 0 && s < VG_TC_N_SECTORS);
       VG_(sprintf)(msg, "before discard of sector %d (%d bytes)", 
                         s, vg_tc_used[s]);
+
+      for(i = 0; i < VG_TC_N_SECTORS; i++) {
+	 if (i != s && vg_tc[i] != NULL)
+	    unchain_sector(i, (Addr)vg_tc[s], vg_tc_used[s]);
+      }
+
       pp_tt_tc_status ( msg );
       VG_(overall_out_count) += vg_tc_stats_count[s];
       VG_(overall_out_osize) += vg_tc_stats_osize[s];
@@ -331,7 +397,7 @@
 {
    Int i;
 
-   vg_assert(0 == (nBytes & 3));
+   vg_assert(IS_ALIGNED(nBytes));
 
    /* Ensure the TT is still OK. */
    while (vg_tt_used >= VG_TT_LIMIT) {
@@ -421,7 +487,8 @@
    pointer, which is inserted here.
 */
 void VG_(add_to_trans_tab) ( Addr orig_addr,  Int orig_size,
-                             Addr trans_addr, Int trans_size )
+                             Addr trans_addr, Int trans_size,
+			     UShort jumps[VG_MAX_JUMPS])
 {
    Int i, nBytes, trans_size_aligned;
    TCEntry* tce;
@@ -431,12 +498,12 @@
                tte->trans_addr, tte->trans_size);
    */
 
+   vg_assert(offsetof(TCEntry, payload) == VG_CODE_OFFSET);
+
    /* figure out how many bytes we require. */
-   trans_size_aligned = trans_size;
-   while ((trans_size_aligned & 3) != 0) 
-      trans_size_aligned++;
-   nBytes = trans_size_aligned + sizeof(TCEntry);
-   vg_assert((nBytes & 3) == 0);
+   nBytes = CODE_ALIGN(trans_size + sizeof(TCEntry));
+   trans_size_aligned = nBytes-sizeof(TCEntry);
+   vg_assert(IS_ALIGNED(nBytes));
 
    tce = (TCEntry*)allocate(nBytes);
    /* VG_(printf)("allocate returned %p\n", tce); */
@@ -445,10 +512,14 @@
    tce->orig_addr  = orig_addr;
    tce->orig_size  = (UShort)orig_size;  /* what's the point of storing this? */
    tce->trans_size = (UShort)trans_size_aligned;
+   for (i = 0; i < VG_MAX_JUMPS; i++) {
+      tce->jump_sites[i] = jumps[i];
+   }
    for (i = 0; i < trans_size; i++) {
       tce->payload[i] = ((UChar*)trans_addr)[i];
    }
-
+   
+   unchain_tce(tce);
    add_tt_entry(tce);
 
    /* Update stats. */
@@ -553,6 +624,10 @@
 {
    Int s;
 
+   /* Otherwise we wind up with non-32-bit-aligned code in
+      TCEntries. */
+   vg_assert((VG_MAX_JUMPS % 2) == 0);
+
    /* Figure out how big each sector should be.  */
    vg_tc_sector_szB 
       = (VG_TT_LIMIT /* max TT entries we expect */