Merge in a somewhat modified version of Jeremy Fitzhardinge's
translation chaining patch.

47-chained-bb

This implements basic-block chaining. Rather than always going through
the dispatch loop, a BB may jump directly to a successor BB if it is
present in the translation cache.

When the BB's code is first generated, the jumps to the successor BBs
are filled with undefined instructions. When the BB is inserted into
the translation cache, the undefined instructions are replaced with a
call to VG_(patch_me). When VG_(patch_me) is called, it looks up the
desired target address in the fast translation cache. If present, it
backpatches the call to patch_me with a jump to the translated target
BB. If the fast lookup fails, it falls back into the normal dispatch
loop.

When parts of the translation cache are discarded, all translations
are unchained, so as to ensure we don't have direct jumps to code which
has been thrown away.

This optimisation only affects direct jumps; indirect jumps
(including returns) still go through the dispatch loop.  The -v stats
indicate a worst-case rate of about 16% of jumps having to go via the
slow mechanism.  This will be a combination of function returns and
genuine indirect jumps.

Certain parts of the dispatch loop's actions have to be moved into
each basic block; namely: updating the virtual EIP and keeping track
of the basic block counter.

At present, basic block chaining seems to improve performance by up to
25% with --skin=none.  Gains for skins adding more instrumentation
will be correspondingly smaller.

There is a command line option: --chain-bb=yes|no (defaults to yes).


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1336 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index 5fa88b4..00fa4ba 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -244,6 +244,8 @@
    is ignored.  Ie if a skin says no, I don't want this to run, that
    cannot be overridden from the command line. */
 extern Bool  VG_(clo_run_libc_freeres);
+/* Use the basic-block chaining optimisation */
+extern Bool VG_(clo_chain_bb);
 
 
 /* ---------------------------------------------------------------------
@@ -1045,11 +1047,16 @@
    Exports of vg_from_ucode.c
    ------------------------------------------------------------------ */
 
-extern UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes );
+extern UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes, UShort jumps[VG_MAX_JUMPS] );
 
 extern void   VG_(print_ccall_stats)      ( void );
 extern void   VG_(print_UInstr_histogram) ( void );
 
+extern void   VG_(unchain_jumpsite)	  ( Addr jumpsite );
+extern Addr   VG_(get_jmp_dest)           ( Addr jumpsite );
+extern Bool   VG_(is_unchained_jumpsite)  ( Addr jumpsite );
+extern Bool   VG_(is_chained_jumpsite)    ( Addr jumpsite );
+
 /* ---------------------------------------------------------------------
    Exports of vg_to_ucode.c
    ------------------------------------------------------------------ */
@@ -1062,6 +1069,7 @@
 
 /* Expandable arrays of uinstrs. */
 struct _UCodeBlock { 
+   Addr	   orig_eip;
    Int     used; 
    Int     size; 
    UInstr* instrs;
@@ -1074,7 +1082,8 @@
                                Addr  orig_addr,
                                UInt* orig_size,
                                Addr* trans_addr,
-                               UInt* trans_size );
+                               UInt* trans_size,
+			       UShort jumps[VG_MAX_JUMPS]);
 
 extern Char* VG_(nameCondcode)        ( Condcode cond );
 extern Bool  VG_(saneUInstr)          ( Bool beforeRA, Bool beforeLiveness,
@@ -1369,9 +1378,14 @@
 extern UInt VG_(overall_out_count);
 extern UInt VG_(overall_out_osize);
 extern UInt VG_(overall_out_tsize);
-
 /* The number of discards of TT/TC. */
 extern UInt VG_(number_of_tc_discards);
+/* Counts of chain and unchain operations done. */
+extern UInt VG_(bb_enchain_count);
+extern UInt VG_(bb_dechain_count);
+/* Number of unchained jumps performed. */
+extern UInt VG_(unchained_jumps_done);
+
 
 /* Counts pertaining to the register allocator. */
 
@@ -1445,7 +1459,8 @@
 extern void VG_(get_tt_tc_used) ( UInt* tt_used, UInt* tc_used );
 
 extern void VG_(add_to_trans_tab) ( Addr orig_addr,  Int orig_size,
-                                    Addr trans_addr, Int trans_size );
+                                    Addr trans_addr, Int trans_size,
+				    UShort jumps[VG_MAX_JUMPS]);
 
 extern void VG_(invalidate_translations) ( Addr start, UInt range );
 
@@ -1482,6 +1497,9 @@
    which means we need to defer to the scheduler. */
 extern UInt VG_(run_innerloop) ( void );
 
+/* The patching routine called when a BB wants to chain itself to
+   another. */
+extern UInt VG_(patch_me);
 
 /* ---------------------------------------------------------------------
    Exports of vg_helpers.S