-----------------------------------------------------------------------------
overview
-----------------------------------------------------------------------------
This commit introduces an optimisation that speeds up Memcheck by roughly
-3 -- 28%, and Addrcheck by 1 -- 36%, at least for the SPEC2000 benchmarks on
my 1400MHz Athlon.

Basic idea: that handling of A/V bit updates on %esp-adjustments was quite
sub-optimal -- for each "PUT ESP", a function was called that computed the
delta from the old and new ESPs, and then called a looping function to deal
with it.

Improvements:

  1. most of the time, the delta can be seen from the code.  So there's no need
     to compute it.

  2. when the delta is known, we can directly call a skin function to handle it.

  3. we can specialise for certain common cases (eg. +/- 4, 8, 12, 16, 32),
     including having unrolled loops for these.

This slightly bloats UCode because of setting up args for the call, and for
updating ESP in code (previously was done in the called C function).  Eg. for
`date' the code expansion ratio goes from 14.2 --> 14.6.  But it's much faster.

Note that skins don't have to use the specialised cases, they can just
define the ordinary case if they want;  the specialised cases are only used
if present.

-----------------------------------------------------------------------------
details
-----------------------------------------------------------------------------
Removed addrcheck/ac_common.c, put its (minimal) contents in ac_main.c.

Updated the major interface version, because this change isn't binary
compatible with the old core/skin interface.

Removed the hooks {new,die}_mem_stack_aligned, replaced with the better
{new,die}_mem_stack_{4,8,12,16,32}.  Still have the generic {die,new}_mem_stack
hooks.  These are called directly from UCode, thanks to a new pass that occurs
between instrumentation and register allocation (but only if the skin uses
these stack-adjustment hooks).  VG_(unknown_esp_update)() is called from UCode
for the generic case;  it determines if it's a stack switch, and calls the
generic {new,die}_mem_stack hooks accordingly.  This meant
synth_handle_esp_assignment() could be removed.

The new %esp-delta computation phase is in vg_translate.c.

In Memcheck and Addrcheck, added functions for updating the A and V bits of a
single aligned word and a single aligned doubleword.  These are called from the
specialised functions new_mem_stack_4, etc.  Could remove the ones for the old
hooks new_mem_stack_aligned and die_mem_stack_aligned.

In mc_common.h, added a big macro containing the definitions of new_mem_stack_4
et al.  It's ``instantiated'' separately by Memcheck and Addrcheck.  The macro
is a bit klugey, but I did it that way because speed is vital for these
functions, so eg. a function pointer would have slowed things down.

Updated the built-in profiling events appropriately for the changes (removed
one old event, added a new one;  finding their names is left as an exercise for
the reader).

Fixed memory event profiling in {Addr,Mem}check, which had rotted.

A few other minor things.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1510 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/addrcheck/ac_main.c b/addrcheck/ac_main.c
index eb8a31d..5b3cc6b 100644
--- a/addrcheck/ac_main.c
+++ b/addrcheck/ac_main.c
@@ -34,6 +34,10 @@
 #include "memcheck.h"
 //#include "vg_profile.c"
 
+#include "mc_common.c"
+
+
+
 VG_DETERMINE_INTERFACE_VERSION
 
 /*------------------------------------------------------------*/
@@ -228,17 +232,6 @@
 #  undef STREQ
 
 
-/*------------------------------------------------------------*/
-/*--- Profiling events                                     ---*/
-/*------------------------------------------------------------*/
-
-typedef 
-   enum { 
-      VgpCheckMem = VgpFini+1,
-      VgpSetMem
-   } 
-   VgpSkinCC;
-
 #define DEBUG(fmt, args...) //VG_(printf)(fmt, ## args)
 
 /*------------------------------------------------------------*/
@@ -536,6 +529,82 @@
    set_address_range_perms ( a, len, VGM_BIT_VALID );
 }
 
+static __inline__
+void make_aligned_word_noaccess(Addr a)
+{
+   AcSecMap* sm;
+   UInt      sm_off;
+   UChar     mask;
+
+   VGP_PUSHCC(VgpESPAdj);
+   ENSURE_MAPPABLE(a, "make_aligned_word_noaccess");
+   sm     = primary_map[a >> 16];
+   sm_off = a & 0xFFFF;
+   mask = 0x0F;
+   mask <<= (a & 4 /* 100b */);   /* a & 4 is either 0 or 4 */
+   /* mask now contains 1s where we wish to make address bits invalid (1s). */
+   sm->abits[sm_off >> 3] |= mask;
+   VGP_POPCC(VgpESPAdj);
+}
+
+static __inline__
+void make_aligned_word_accessible(Addr a)
+{
+   AcSecMap* sm;
+   UInt      sm_off;
+   UChar     mask;
+
+   VGP_PUSHCC(VgpESPAdj);
+   ENSURE_MAPPABLE(a, "make_aligned_word_accessible");
+   sm     = primary_map[a >> 16];
+   sm_off = a & 0xFFFF;
+   mask = 0x0F;
+   mask <<= (a & 4 /* 100b */);   /* a & 4 is either 0 or 4 */
+   /* mask now contains 1s where we wish to make address bits
+      valid (0s). */
+   sm->abits[sm_off >> 3] &= ~mask;
+   VGP_POPCC(VgpESPAdj);
+}
+
+/* Nb: by "aligned" here we mean 8-byte aligned */
+static __inline__
+void make_aligned_doubleword_accessible(Addr a)
+{  
+   AcSecMap* sm;
+   UInt      sm_off;
+   
+   VGP_PUSHCC(VgpESPAdj);
+   ENSURE_MAPPABLE(a, "make_aligned_doubleword_accessible");
+   sm = primary_map[a >> 16];
+   sm_off = a & 0xFFFF;
+   sm->abits[sm_off >> 3] = VGM_BYTE_VALID;
+   VGP_POPCC(VgpESPAdj);
+}  
+   
+static __inline__
+void make_aligned_doubleword_noaccess(Addr a)
+{  
+   AcSecMap* sm;
+   UInt      sm_off;
+   
+   VGP_PUSHCC(VgpESPAdj);
+   ENSURE_MAPPABLE(a, "make_aligned_doubleword_noaccess");
+   sm = primary_map[a >> 16];
+   sm_off = a & 0xFFFF;
+   sm->abits[sm_off >> 3] = VGM_BYTE_INVALID;
+   VGP_POPCC(VgpESPAdj);
+}  
+   
+/* The %esp update handling functions */
+ESP_UPDATE_HANDLERS ( make_aligned_word_accessible,  
+                      make_aligned_word_noaccess,
+                      make_aligned_doubleword_accessible,
+                      make_aligned_doubleword_noaccess,
+                      ac_make_accessible,
+                      ac_make_noaccess 
+                    );
+
+
 /* Block-copy permissions (needed for implementing realloc()). */
 
 static void ac_copy_address_range_state ( Addr src, Addr dst, UInt len )
@@ -603,66 +672,6 @@
 /*--- Memory event handlers                                ---*/
 /*------------------------------------------------------------*/
 
-/* Setting permissions for aligned words.  This supports fast stack
-   operations. */
-
-static void ac_make_noaccess_aligned ( Addr a, UInt len )
-{
-   AcSecMap* sm;
-   UInt    sm_off;
-   UChar   mask;
-   Addr    a_past_end = a + len;
-
-   VGP_PUSHCC(VgpSetMem);
-
-   PROF_EVENT(50);
-#  ifdef VG_DEBUG_MEMORY
-   sk_assert(IS_ALIGNED4_ADDR(a));
-   sk_assert(IS_ALIGNED4_ADDR(len));
-#  endif
-
-   for ( ; a < a_past_end; a += 4) {
-      ENSURE_MAPPABLE(a, "ac_make_noaccess_aligned");
-      sm     = primary_map[a >> 16];
-      sm_off = a & 0xFFFF;
-      mask = 0x0F;
-      mask <<= (a & 4 /* 100b */);   /* a & 4 is either 0 or 4 */
-      /* mask now contains 1s where we wish to make address bits
-         invalid (1s). */
-      sm->abits[sm_off >> 3] |= mask;
-   }
-   VGP_POPCC(VgpSetMem);
-}
-
-static void ac_make_writable_aligned ( Addr a, UInt len )
-{
-   AcSecMap* sm;
-   UInt    sm_off;
-   UChar   mask;
-   Addr    a_past_end = a + len;
-
-   VGP_PUSHCC(VgpSetMem);
-
-   PROF_EVENT(51);
-#  ifdef VG_DEBUG_MEMORY
-   sk_assert(IS_ALIGNED4_ADDR(a));
-   sk_assert(IS_ALIGNED4_ADDR(len));
-#  endif
-
-   for ( ; a < a_past_end; a += 4) {
-      ENSURE_MAPPABLE(a, "ac_make_writable_aligned");
-      sm     = primary_map[a >> 16];
-      sm_off = a & 0xFFFF;
-      mask = 0x0F;
-      mask <<= (a & 4 /* 100b */);   /* a & 4 is either 0 or 4 */
-      /* mask now contains 1s where we wish to make address bits
-         invalid (0s). */
-      sm->abits[sm_off >> 3] &= ~mask;
-   }
-   VGP_POPCC(VgpSetMem);
-}
-
-
 static __inline__
 void ac_check_is_accessible ( CorePart part, ThreadState* tst,
                               Char* s, Addr base, UInt size, Bool isWrite )
@@ -1061,7 +1070,7 @@
             break;
 
          /* For memory-ref instrs, copy the data_addr into a temporary to be
-          * passed to the cachesim_* helper at the end of the instruction.
+          * passed to the helper at the end of the instruction.
           */
          case LOAD: 
             t_addr = u_in->val1; 
@@ -1320,12 +1329,17 @@
 
    VG_(track_new_mem_startup)      ( & ac_new_mem_startup );
    VG_(track_new_mem_heap)         ( & ac_new_mem_heap );
-   VG_(track_new_mem_stack)        ( & ac_make_accessible );
-   VG_(track_new_mem_stack_aligned)( & ac_make_writable_aligned );
    VG_(track_new_mem_stack_signal) ( & ac_make_accessible );
    VG_(track_new_mem_brk)          ( & ac_make_accessible );
    VG_(track_new_mem_mmap)         ( & ac_set_perms );
    
+   VG_(track_new_mem_stack_4)      ( & MC_(new_mem_stack_4)  );
+   VG_(track_new_mem_stack_8)      ( & MC_(new_mem_stack_8)  );
+   VG_(track_new_mem_stack_12)     ( & MC_(new_mem_stack_12) );
+   VG_(track_new_mem_stack_16)     ( & MC_(new_mem_stack_16) );
+   VG_(track_new_mem_stack_32)     ( & MC_(new_mem_stack_32) );
+   VG_(track_new_mem_stack)        ( & MC_(new_mem_stack)    );
+
    VG_(track_copy_mem_heap)        ( & ac_copy_address_range_state );
    VG_(track_copy_mem_remap)       ( & ac_copy_address_range_state );
    VG_(track_change_mem_mprotect)  ( & ac_set_perms );
@@ -1334,12 +1348,17 @@
    VG_(track_ban_mem_stack)        ( & ac_make_noaccess );
 
    VG_(track_die_mem_heap)         ( & ac_make_noaccess );
-   VG_(track_die_mem_stack)        ( & ac_make_noaccess );
-   VG_(track_die_mem_stack_aligned)( & ac_make_noaccess_aligned ); 
    VG_(track_die_mem_stack_signal) ( & ac_make_noaccess ); 
    VG_(track_die_mem_brk)          ( & ac_make_noaccess );
    VG_(track_die_mem_munmap)       ( & ac_make_noaccess ); 
 
+   VG_(track_die_mem_stack_4)      ( & MC_(die_mem_stack_4)  );
+   VG_(track_die_mem_stack_8)      ( & MC_(die_mem_stack_8)  );
+   VG_(track_die_mem_stack_12)     ( & MC_(die_mem_stack_12) );
+   VG_(track_die_mem_stack_16)     ( & MC_(die_mem_stack_16) );
+   VG_(track_die_mem_stack_32)     ( & MC_(die_mem_stack_32) );
+   VG_(track_die_mem_stack)        ( & MC_(die_mem_stack)    );
+   
    VG_(track_bad_free)             ( & MC_(record_free_error) );
    VG_(track_mismatched_free)      ( & MC_(record_freemismatch_error) );
 
@@ -1355,6 +1374,7 @@
 
    VGP_(register_profile_event) ( VgpSetMem,   "set-mem-perms" );
    VGP_(register_profile_event) ( VgpCheckMem, "check-mem-perms" );
+   VGP_(register_profile_event) ( VgpESPAdj,   "adjust-ESP" );
 
    init_shadow_memory();
    MC_(init_prof_mem)();