Files updated, added and removed in order to turn the ERASER branch into HEAD


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1086 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index 05f4186..b21815e 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -1,7 +1,7 @@
 
 /*--------------------------------------------------------------------*/
-/*--- The cache simulation framework: instrumentation, recording   ---*/
-/*--- and results printing.                                        ---*/
+/*--- The cache simulation skin: cache detection; instrumentation, ---*/
+/*--- recording and results printing.                              ---*/
 /*---                                                vg_cachesim.c ---*/
 /*--------------------------------------------------------------------*/
 
@@ -27,19 +27,32 @@
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307, USA.
 
-   The GNU General Public License is contained in the file LICENSE.
+   The GNU General Public License is contained in the file COPYING.
 */
 
-#include "vg_include.h"
+#include "vg_skin.h"
+//#include "vg_profile.c"
+
+/* For cache simulation */
+typedef struct {
+    int size;       /* bytes */ 
+    int assoc;
+    int line_size;  /* bytes */ 
+} cache_t;
 
 #include "vg_cachesim_L2.c"
 #include "vg_cachesim_I1.c"
 #include "vg_cachesim_D1.c"
 
+/*------------------------------------------------------------*/
+/*--- Constants                                            ---*/
+/*------------------------------------------------------------*/
 
 /* According to IA-32 Intel Architecture Software Developer's Manual: Vol 2 */
 #define MAX_x86_INSTR_SIZE              16
 
+#define MIN_LINE_SIZE   16
+
 /* Size of various buffers used for storing strings */
 #define FILENAME_LEN                    256
 #define FN_NAME_LEN                     256
@@ -48,33 +61,29 @@
 #define RESULTS_BUF_LEN                 128
 #define LINE_BUF_LEN                     64
 
-
 /*------------------------------------------------------------*/
-/*--- Generic utility stuff                                ---*/
+/*--- Profiling events                                     ---*/
 /*------------------------------------------------------------*/
 
-Int VG_(log2) ( Int x ) 
-{
-   Int i;
-   /* Any more than 32 and we overflow anyway... */
-   for (i = 0; i < 32; i++) {
-      if (1 << i == x) return i;
-   }
-   return -1;
-}
-
+typedef 
+   enum { 
+      VgpGetBBCC = VgpFini+1,
+      VgpCacheSimulate,
+      VgpCacheResults
+   } 
+   VgpSkinCC;
 
 /*------------------------------------------------------------*/
 /*--- Output file related stuff                            ---*/
 /*------------------------------------------------------------*/
 
-#define OUT_FILE        "cachegrind.out"
+Char cachegrind_out_file[FILENAME_LEN];
 
 static void file_err()
 {
    VG_(message)(Vg_UserMsg,
                 "error: can't open cache simulation output file `%s'",
-                OUT_FILE );
+                cachegrind_out_file );
    VG_(exit)(1);
 }
 
@@ -95,7 +104,15 @@
     cc->m2 = 0;
 }
 
-typedef enum { INSTR_CC, READ_CC, WRITE_CC, MOD_CC } CC_type;
+typedef 
+   enum {
+      InstrCC,         /* eg. mov %eax,   %ebx                      */
+      ReadCC,          /* eg. mov (%ecx), %esi                      */
+      WriteCC,         /* eg. mov %eax,   (%edx)                    */
+      ModCC,           /* eg. incl (%eax) (read+write one addr)     */
+      ReadWriteCC,     /* eg. call*l (%esi), pushl 0x4(%ebx), movsw 
+                               (read+write two different addrs)      */
+   } CC_type;
 
 /* Instruction-level cost-centres.  The typedefs for these structs are in
  * vg_include.c 
@@ -104,33 +121,53 @@
  *
  * This is because we use it to work out what kind of CC we're dealing with.
  */ 
-struct _iCC {
-   /* word 1 */
-   UChar tag;
-   UChar instr_size;
-   /* 2 bytes padding */
+typedef
+   struct {
+      /* word 1 */
+      UChar tag;
+      UChar instr_size;
+      /* 2 bytes padding */
 
-   /* words 2+ */
-   Addr instr_addr;
-   CC I;
-};
+      /* words 2+ */
+      Addr instr_addr;
+      CC I;
+   }
+   iCC;
 
-struct _idCC {
-   /* word 1 */
-   UChar tag;
-   UChar instr_size;
-   UChar data_size;
-   /* 1 byte padding */
+typedef
+   struct _idCC {
+      /* word 1 */
+      UChar tag;
+      UChar instr_size;
+      UChar data_size;
+      /* 1 byte padding */
 
-   /* words 2+ */
-   Addr instr_addr;
-   CC I;
-   CC D;
-};
+      /* words 2+ */
+      Addr instr_addr;
+      CC I;
+      CC D;
+   }
+   idCC;
+
+typedef
+   struct _iddCC {
+      /* word 1 */
+      UChar tag;
+      UChar instr_size;
+      UChar data_size;
+      /* 1 byte padding */
+
+      /* words 2+ */
+      Addr instr_addr;
+      CC I;
+      CC Da;
+      CC Db;
+   }
+   iddCC;
 
 static void init_iCC(iCC* cc, Addr instr_addr, UInt instr_size)
 {
-   cc->tag        = INSTR_CC;
+   cc->tag        = InstrCC;
    cc->instr_size = instr_size;
    cc->instr_addr = instr_addr;
    initCC(&cc->I);
@@ -147,6 +184,18 @@
    initCC(&cc->D);
 }
 
+static void init_iddCC(iddCC* cc, Addr instr_addr,
+                       UInt instr_size, UInt data_size)
+{
+   cc->tag        = ReadWriteCC;
+   cc->instr_size = instr_size;
+   cc->data_size  = data_size;
+   cc->instr_addr = instr_addr;
+   initCC(&cc->I);
+   initCC(&cc->Da);
+   initCC(&cc->Db);
+}
+
 #define ADD_CC_TO(CC_type, cc, total)           \
    total.a  += ((CC_type*)BBCC_ptr)->cc.a;      \
    total.m1 += ((CC_type*)BBCC_ptr)->cc.m1;     \
@@ -193,6 +242,22 @@
 #endif
 }
 
+static __inline__ void sprint_read_write_CC(Char buf[BUF_LEN], iddCC* cc)
+{
+#if PRINT_INSTR_ADDRS
+   VG_(sprintf)(buf, "%llu %llu %llu %llu %llu %llu # %x\n",
+                      cc->I.a,  cc->I.m1,  cc->I.m2, 
+                      cc->Da.a, cc->Da.m1, cc->Da.m2,
+                      cc->Db.a, cc->Db.m1, cc->Db.m2, cc->instr_addr);
+#else
+   VG_(sprintf)(buf, "%llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+                      cc->I.a,  cc->I.m1,  cc->I.m2, 
+                      cc->Da.a, cc->Da.m1, cc->Da.m2,
+                      cc->Db.a, cc->Db.m1, cc->Db.m2);
+#endif
+}
+
+
 /*------------------------------------------------------------*/
 /*--- BBCC hash table stuff                                ---*/
 /*------------------------------------------------------------*/
@@ -257,11 +322,11 @@
 static void get_debug_info(Addr instr_addr, Char filename[FILENAME_LEN],
                            Char fn_name[FN_NAME_LEN], Int* line_num)
 {
-   Bool found1, found2, no_demangle = False;
+   Bool found1, found2;
 
-   found1 = VG_(what_line_is_this)(instr_addr, filename,
-                                   FILENAME_LEN, line_num);
-   found2 = VG_(what_fn_is_this)(no_demangle, instr_addr, fn_name, FN_NAME_LEN);
+   found1 = VG_(get_filename_linenum)(instr_addr, filename,
+                                      FILENAME_LEN, line_num);
+   found2 = VG_(get_fnname)(instr_addr, fn_name, FN_NAME_LEN);
 
    if (!found1 && !found2) {
       no_debug_BBs++;
@@ -290,8 +355,8 @@
 file_node* new_file_node(Char filename[FILENAME_LEN], file_node* next)
 {
    Int i;
-   file_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(file_node));
-   new->filename  = VG_(strdup)(VG_AR_PRIVATE, filename);
+   file_node* new = VG_(malloc)(sizeof(file_node));
+   new->filename  = VG_(strdup)(filename);
    for (i = 0; i < N_FN_ENTRIES; i++) {
       new->fns[i] = NULL;
    }
@@ -303,8 +368,8 @@
 fn_node* new_fn_node(Char fn_name[FILENAME_LEN], fn_node* next)
 {
    Int i;
-   fn_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(fn_node));
-   new->fn_name = VG_(strdup)(VG_AR_PRIVATE, fn_name);
+   fn_node* new = VG_(malloc)(sizeof(fn_node));
+   new->fn_name = VG_(strdup)(fn_name);
    for (i = 0; i < N_BBCC_ENTRIES; i++) {
       new->BBCCs[i] = NULL;
    }
@@ -318,7 +383,7 @@
    Int BBCC_array_size = compute_BBCC_array_size(cb);
    BBCC* new;
 
-   new = (BBCC*)VG_(malloc)(VG_AR_PRIVATE, sizeof(BBCC) + BBCC_array_size);
+   new = (BBCC*)VG_(malloc)(sizeof(BBCC) + BBCC_array_size);
    new->orig_addr  = bb_orig_addr;
    new->array_size = BBCC_array_size;
    new->next = next;
@@ -352,7 +417,7 @@
 
    get_debug_info(bb_orig_addr, filename, fn_name, &dummy_line_num);
 
-   VGP_PUSHCC(VgpCacheGetBBCC);
+   VGP_PUSHCC(VgpGetBBCC);
    filename_hash = hash(filename, N_FILE_ENTRIES);
    curr_file_node = BBCC_table[filename_hash];
    while (NULL != curr_file_node && 
@@ -410,7 +475,7 @@
           BB_retranslations++;
       }
    }
-   VGP_POPCC;
+   VGP_POPCC(VgpGetBBCC);
    return curr_BBCC;
 }
 
@@ -418,11 +483,12 @@
 /*--- Cache simulation instrumentation phase               ---*/
 /*------------------------------------------------------------*/
 
+// SSS: do something about all these...
 #define uInstr1   VG_(newUInstr1)
 #define uInstr2   VG_(newUInstr2)
 #define uInstr3   VG_(newUInstr3)
-#define dis       VG_(disassemble)
 #define uLiteral  VG_(setLiteralField)
+#define uCCall    VG_(setCCallFields)
 #define newTemp   VG_(getNewTemp)
 
 static Int compute_BBCC_array_size(UCodeBlock* cb)
@@ -430,12 +496,12 @@
    UInstr* u_in;
    Int     i, CC_size, BBCC_size = 0;
    Bool    is_LOAD, is_STORE, is_FPU_R, is_FPU_W;
+   Int     t_read, t_write;
     
    is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
+   t_read = t_write = INVALID_TEMPREG;
 
    for (i = 0; i < cb->used; i++) {
-      /* VG_(ppUInstr)(0, &cb->instrs[i]); */
-
       u_in = &cb->instrs[i];
       switch(u_in->opcode) {
 
@@ -449,8 +515,13 @@
 
             case_for_end_of_instr:
 
-            CC_size = (is_LOAD || is_STORE || is_FPU_R || is_FPU_W 
-                      ? sizeof(idCC) : sizeof(iCC));
+            if (((is_LOAD && is_STORE) || (is_FPU_R && is_FPU_W)) && 
+                 t_read != t_write)
+               CC_size = sizeof(iddCC);
+            else if (is_LOAD || is_STORE || is_FPU_R || is_FPU_W)
+               CC_size = sizeof(idCC);
+            else
+               CC_size = sizeof(iCC);
 
             BBCC_size += CC_size;
             is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
@@ -461,22 +532,26 @@
             /* Also, a STORE can come after a LOAD for bts/btr/btc */
             vg_assert(/*!is_LOAD &&*/ /* !is_STORE && */ 
                       !is_FPU_R && !is_FPU_W);
+            t_read = u_in->val1;
             is_LOAD = True;
             break;
 
          case STORE:
             /* Multiple STOREs are possible for 'pushal' */
             vg_assert(            /*!is_STORE &&*/ !is_FPU_R && !is_FPU_W);
+            t_write = u_in->val2;
             is_STORE = True;
             break;
 
          case FPU_R:
             vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            t_read = u_in->val2;
             is_FPU_R = True;
             break;
 
          case FPU_W:
             vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            t_write = u_in->val2;
             is_FPU_W = True;
             break;
 
@@ -488,41 +563,153 @@
    return BBCC_size;
 }
 
-/* Use this rather than eg. -1 because it's stored as a UInt. */
+static __attribute__ ((regparm (1)))
+void log_1I_0D_cache_access(iCC* cc)
+{
+   //VG_(printf)("1I_0D: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
+   cc->I.a++;
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+/* Difference between this function and log_1I_0D_cache_access() is that
+   this one can be passed any kind of CC, not just an iCC.  So we have to
+   be careful to make sure we don't make any assumptions about CC layout.
+   (As it stands, they would be safe, but this will avoid potential heartache
+   if anyone else changes CC layout.)  
+   Note that we only do the switch for the JIFZ version because if we always
+   called this switching version, things would run about 5% slower. */
+static __attribute__ ((regparm (1)))
+void log_1I_0D_cache_access_JIFZ(iCC* cc)
+{
+   UChar instr_size;
+   Addr instr_addr;
+   CC* I;
+
+   //VG_(printf)("1I_0D: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+
+   switch(cc->tag) {
+       case InstrCC:
+           instr_size = cc->instr_size;
+           instr_addr = cc->instr_addr;
+           I = &(cc->I);
+           break;
+       case ReadCC:
+       case WriteCC:
+       case ModCC:
+           instr_size = ((idCC*)cc)->instr_size;
+           instr_addr = ((idCC*)cc)->instr_addr;
+           I = &( ((idCC*)cc)->I );
+           break;
+       case ReadWriteCC:
+           instr_size = ((iddCC*)cc)->instr_size;
+           instr_addr = ((iddCC*)cc)->instr_addr;
+           I = &( ((iddCC*)cc)->I );
+           break;
+       default:
+           VG_(panic)("Unknown CC type in log_1I_0D_cache_access_JIFZ()\n");
+           break;
+   }
+   cachesim_I1_doref(instr_addr, instr_size, &I->m1, &I->m2);
+   I->a++;
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+__attribute__ ((regparm (2))) static 
+void log_0I_1D_cache_access(idCC* cc, Addr data_addr)
+{
+   //VG_(printf)("0I_1D: CCaddr=%p, iaddr=%p, isize=%u, daddr=%p, dsize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_D1_doref(data_addr,      cc->data_size,  &cc->D.m1, &cc->D.m2);
+   cc->D.a++;
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+__attribute__ ((regparm (2))) static
+void log_1I_1D_cache_access(idCC* cc, Addr data_addr)
+{
+   //VG_(printf)("1I_1D: CCaddr=%p, iaddr=%p, isize=%u, daddr=%p, dsize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
+   cc->I.a++;
+
+   cachesim_D1_doref(data_addr,      cc->data_size,  &cc->D.m1, &cc->D.m2);
+   cc->D.a++;
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+__attribute__ ((regparm (3))) static 
+void log_0I_2D_cache_access(iddCC* cc, Addr data_addr1, Addr data_addr2)
+{
+   //VG_(printf)("0I_2D: CCaddr=%p, iaddr=%p, isize=%u, daddr1=0x%x, daddr2=%p, size=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size, data_addr1, data_addr2, cc->data_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_D1_doref(data_addr1, cc->data_size,  &cc->Da.m1, &cc->Da.m2);
+   cc->Da.a++;
+   cachesim_D1_doref(data_addr2, cc->data_size,  &cc->Db.m1, &cc->Db.m2);
+   cc->Db.a++;
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+__attribute__ ((regparm (3))) static
+void log_1I_2D_cache_access(iddCC* cc, Addr data_addr1, Addr data_addr2)
+{
+   //VG_(printf)("1I_2D: CCaddr=%p, iaddr=%p, isize=%u, daddr1=%p, daddr2=%p, dsize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size, data_addr1, data_addr2, cc->data_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1,  &cc->I.m2);
+   cc->I.a++;
+
+   cachesim_D1_doref(data_addr1,     cc->data_size,  &cc->Da.m1, &cc->Da.m2);
+   cc->Da.a++;
+   cachesim_D1_doref(data_addr2,     cc->data_size,  &cc->Db.m1, &cc->Db.m2);
+   cc->Db.a++;
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+UCodeBlock* SK_(instrument)(UCodeBlock* cb_in, Addr orig_addr)
+{
+/* Use this rather than eg. -1 because it's a UInt. */
 #define INVALID_DATA_SIZE   999999
 
-UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr)
-{
    UCodeBlock* cb;
    Int         i;
    UInstr*     u_in;
    BBCC*       BBCC_node;
-   Int         t_CC_addr, t_read_addr, t_write_addr, t_data_addr;
+   Int         t_CC_addr, t_read_addr, t_write_addr, t_data_addr1,
+               t_data_addr2, t_read, t_write;
    Int         CC_size = -1;    /* Shut gcc warnings up */
-   Addr        instr_addr = orig_addr;
-   UInt        instr_size, data_size = INVALID_DATA_SIZE;
-   Int         helper = -1;     /* Shut gcc warnings up */
+   Addr        x86_instr_addr = orig_addr;
+   UInt        x86_instr_size, data_size = INVALID_DATA_SIZE;
+   Addr        helper;
+   Int         argc;
    UInt        stack_used;
-   Bool        BB_seen_before       = False;
-   Bool        prev_instr_was_Jcond = False;
+   Bool        BB_seen_before     = False;
+   Bool        instrumented_Jcond = False;
+   Bool        has_rep_prefix     = False;
    Addr        BBCC_ptr0, BBCC_ptr; 
 
    /* Get BBCC (creating if necessary -- requires a counting pass over the BB
     * if it's the first time it's been seen), and point to start of the 
     * BBCC array.  */
-   BBCC_node = get_BBCC(orig_addr, cb_in, False, &BB_seen_before);
+   BBCC_node = get_BBCC(orig_addr, cb_in, /*remove=*/False, &BB_seen_before);
    BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
 
    cb = VG_(allocCodeBlock)();
    cb->nextTemp = cb_in->nextTemp;
 
-   t_CC_addr = t_read_addr = t_write_addr = t_data_addr = INVALID_TEMPREG;
+   t_CC_addr = t_read_addr = t_write_addr = t_data_addr1 = t_data_addr2 =
+               t_read = t_write = INVALID_TEMPREG;
 
    for (i = 0; i < cb_in->used; i++) {
       u_in = &cb_in->instrs[i];
 
-      //VG_(ppUInstr)(0, u_in);
-
       /* What this is all about:  we want to instrument each x86 instruction 
        * translation.  The end of these are marked in three ways.  The three
        * ways, and the way we instrument them, are as follows:
@@ -531,144 +718,33 @@
        * 2. UCode, Juncond        --> UCode, Instrumentation, Juncond
        * 3. UCode, Jcond, Juncond --> UCode, Instrumentation, Jcond, Juncond
        *
-       * We must put the instrumentation before the jumps so that it is always
+       * The last UInstr in a basic block is always a Juncond.  Jconds,
+       * when they appear, are always second last.  We check this with 
+       * various assertions.
+       *
+       * We must put the instrumentation before any jumps so that it is always
        * executed.  We don't have to put the instrumentation before the INCEIP
        * (it could go after) but we do so for consistency.
        *
-       * Junconds are always the last instruction in a basic block.  Jconds are
-       * always the 2nd last, and must be followed by a Jcond.  We check this
-       * with various assertions.
+       * x86 instruction sizes are obtained from INCEIPs (for case 1) or
+       * from .extra4b field of the final JMP (for case 2 & 3).
        *
-       * Note that in VG_(disBB) we patched the `extra4b' field of the first
-       * occurring JMP in a block with the size of its x86 instruction.  This
-       * is used now.
-       *
-       * Note that we don't have to treat JIFZ specially;  unlike JMPs, JIFZ
-       * occurs in the middle of a BB and gets an INCEIP after it.
+       * Note that JIFZ is treated differently.
        *
        * The instrumentation is just a call to the appropriate helper function,
        * passing it the address of the instruction's CC.
        */
-      if (prev_instr_was_Jcond) vg_assert(u_in->opcode == JMP);
+      if (instrumented_Jcond) vg_assert(u_in->opcode == JMP);
 
       switch (u_in->opcode) {
-
-         case INCEIP:
-            instr_size = u_in->val1;
-            goto case_for_end_of_x86_instr;
-
-         case JMP:
-            if (u_in->cond == CondAlways) {
-               vg_assert(i+1 == cb_in->used); 
-
-               /* Don't instrument if previous instr was a Jcond. */
-               if (prev_instr_was_Jcond) {
-                  vg_assert(0 == u_in->extra4b);
-                  VG_(copyUInstr)(cb, u_in);
-                  break;
-               }
-               prev_instr_was_Jcond = False;
-
-            } else {
-               vg_assert(i+2 == cb_in->used);  /* 2nd last instr in block */
-               prev_instr_was_Jcond = True;
-            }
-
-            /* Ah, the first JMP... instrument, please. */
-            instr_size = u_in->extra4b;
-            goto case_for_end_of_x86_instr;
-
-            /* Shared code that is executed at the end of an x86 translation
-             * block, marked by either an INCEIP or an unconditional JMP. */
-            case_for_end_of_x86_instr:
-
-#define IS_(X)      (INVALID_TEMPREG != t_##X##_addr)
-             
-            /* Initialise the CC in the BBCC array appropriately if it hasn't
-             * been initialised before.
-             * Then call appropriate sim function, passing it the CC address.
-             * Note that CALLM_S/CALL_E aren't required here;  by this point,
-             * the checking related to them has already happened. */
-            stack_used = 0;
-
-            vg_assert(instr_size >= 1 && instr_size <= MAX_x86_INSTR_SIZE);
-            vg_assert(0 != instr_addr);
-
-            if (!IS_(read) && !IS_(write)) {
-               iCC* CC_ptr = (iCC*)(BBCC_ptr);
-               vg_assert(INVALID_DATA_SIZE == data_size);
-               vg_assert(INVALID_TEMPREG == t_read_addr && 
-                         INVALID_TEMPREG == t_write_addr);
-               CC_size = sizeof(iCC);
-               if (!BB_seen_before)
-                   init_iCC(CC_ptr, instr_addr, instr_size);
-
-               /* 1st arg: CC addr */
-               t_CC_addr = newTemp(cb);
-               uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
-               uLiteral(cb, BBCC_ptr);
-
-               uInstr1(cb, CCALL_1_0, 0, TempReg, t_CC_addr);
-               uLiteral(cb, VGOFF_(cachesim_log_non_mem_instr));
-
-            } else { 
-               CC_type X_CC;
-               idCC* CC_ptr = (idCC*)(BBCC_ptr);
-                
-               vg_assert(4 == data_size || 2  == data_size || 1 == data_size || 
-                         8 == data_size || 10 == data_size);
-               
-               CC_size = sizeof(idCC);
-               helper = VGOFF_(cachesim_log_mem_instr);
-
-               if (IS_(read) && !IS_(write)) {
-                  X_CC = READ_CC;
-                  vg_assert(INVALID_TEMPREG != t_read_addr && 
-                            INVALID_TEMPREG == t_write_addr);
-                  t_data_addr = t_read_addr;
-
-               } else if (!IS_(read) && IS_(write)) {
-                  X_CC = WRITE_CC;
-                  vg_assert(INVALID_TEMPREG == t_read_addr && 
-                            INVALID_TEMPREG != t_write_addr);
-                  t_data_addr = t_write_addr;
-
-               } else {
-                  vg_assert(IS_(read) && IS_(write));
-                  X_CC = MOD_CC;
-                  vg_assert(INVALID_TEMPREG != t_read_addr && 
-                            INVALID_TEMPREG != t_write_addr);
-                  t_data_addr = t_read_addr;
-               }
-#undef IS_
-               if (!BB_seen_before)
-                  init_idCC(X_CC, CC_ptr, instr_addr, instr_size, data_size);
-
-               /* 1st arg: CC addr */
-               t_CC_addr = newTemp(cb);
-               uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
-               uLiteral(cb, BBCC_ptr);
-
-               uInstr2(cb, CCALL_2_0, 0, TempReg, t_CC_addr, 
-                                         TempReg, t_data_addr);
-               uLiteral(cb, VGOFF_(cachesim_log_mem_instr));
-            }
-
-            VG_(copyUInstr)(cb, u_in);
-
-            /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */
-            BBCC_ptr   += CC_size; 
-            instr_addr += instr_size;
-            t_CC_addr = t_read_addr = t_write_addr = 
-                                      t_data_addr  = INVALID_TEMPREG;
-            data_size = INVALID_DATA_SIZE;
+         case NOP:  case CALLM_E:  case CALLM_S:
             break;
 
-
          /* For memory-ref instrs, copy the data_addr into a temporary to be
-          * passed to the cachesim_log_function at the end of the instruction.
+          * passed to the cachesim_* helper at the end of the instruction.
           */
          case LOAD: 
+            t_read      = u_in->val1;
             t_read_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val1,  TempReg, t_read_addr);
             data_size = u_in->size;
@@ -676,26 +752,216 @@
             break;
 
          case FPU_R:
+            t_read      = u_in->val2;
             t_read_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val2,  TempReg, t_read_addr);
-            data_size = u_in->size;
+            data_size = ( u_in->size <= MIN_LINE_SIZE
+                        ? u_in->size
+                        : MIN_LINE_SIZE);
             VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Note that we must set t_write_addr even for mod instructions;
-          * that's how the code above determines whether it does a write;
-          * without it, it would think a mod instruction is a read.
+          * That's how the code above determines whether it does a write.
+          * Without it, it would think a mod instruction is a read.
           * As for the MOV, if it's a mod instruction it's redundant, but it's
           * not expensive and mod instructions are rare anyway. */
          case STORE:
          case FPU_W:
+            t_write      = u_in->val2;
             t_write_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_write_addr);
-            data_size = u_in->size;
+            /* 28 and 108 B data-sized instructions will be done
+             * inaccurately but they're very rare and this avoids errors
+             * from hitting more than two cache lines in the simulation. */
+            data_size = ( u_in->size <= MIN_LINE_SIZE
+                        ? u_in->size
+                        : MIN_LINE_SIZE);
             VG_(copyUInstr)(cb, u_in);
             break;
 
-         case NOP:  case CALLM_E:  case CALLM_S:
+
+         /* For rep-prefixed instructions, log a single I-cache access
+          * before the UCode loop that implements the repeated part, which
+          * is where the multiple D-cache accesses are logged. */
+         case JIFZ:
+            has_rep_prefix = True;
+
+            /* Setup 1st and only arg: CC addr */
+            t_CC_addr = newTemp(cb);
+            uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_CC_addr);
+            uLiteral(cb, BBCC_ptr);
+
+            /* Call helper */
+            uInstr1(cb, CCALL, 0, TempReg, t_CC_addr);
+            uCCall(cb, (Addr) & log_1I_0D_cache_access_JIFZ, 1, 1, False);
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+
+         /* INCEIP: insert instrumentation */
+         case INCEIP:
+            x86_instr_size = u_in->val1;
+            goto instrument_x86_instr;
+
+         /* JMP: insert instrumentation if the first JMP */
+         case JMP:
+            if (instrumented_Jcond) {
+               vg_assert(CondAlways == u_in->cond);
+               vg_assert(i+1 == cb_in->used);
+               VG_(copyUInstr)(cb, u_in);
+               instrumented_Jcond = False;    /* reset */
+               break;
+            }
+            /* The first JMP... instrument. */
+            if (CondAlways != u_in->cond) {
+               vg_assert(i+2 == cb_in->used);
+               instrumented_Jcond = True;
+            } else {
+               vg_assert(i+1 == cb_in->used);
+            }
+
+            /* Get x86 instr size from final JMP. */
+            x86_instr_size = LAST_UINSTR(cb_in).extra4b;
+            goto instrument_x86_instr;
+
+
+            /* Code executed at the end of each x86 instruction. */
+            instrument_x86_instr:
+             
+            /* Initialise the CC in the BBCC array appropriately if it
+             * hasn't been initialised before.  Then call appropriate sim
+             * function, passing it the CC address. */
+            stack_used = 0;
+
+            vg_assert(x86_instr_size >= 1 && 
+                      x86_instr_size <= MAX_x86_INSTR_SIZE);
+
+#define IS_(X)      (INVALID_TEMPREG != t_##X##_addr)
+
+            if (!IS_(read) && !IS_(write)) {
+               vg_assert(INVALID_DATA_SIZE == data_size);
+               vg_assert(INVALID_TEMPREG == t_read_addr  && 
+                         INVALID_TEMPREG == t_read       && 
+                         INVALID_TEMPREG == t_write_addr &&
+                         INVALID_TEMPREG == t_write);
+               CC_size = sizeof(iCC);
+               if (!BB_seen_before)
+                   init_iCC((iCC*)BBCC_ptr, x86_instr_addr, x86_instr_size);
+               helper = ( has_rep_prefix 
+                        ? (Addr)0      /* no extra log needed */
+                        : (Addr) & log_1I_0D_cache_access
+                        );
+               argc = 1;
+
+            } else { 
+               vg_assert(4 == data_size || 2  == data_size || 1 == data_size || 
+                         8 == data_size || 10 == data_size ||
+                         MIN_LINE_SIZE == data_size);
+               
+               if (IS_(read) && !IS_(write)) {
+                  CC_size = sizeof(idCC);
+                  /* If it uses 'rep', we've already logged the I-cache 
+                   * access at the JIFZ UInstr (see JIFZ case below) so
+                   * don't do it here */
+                  helper = ( has_rep_prefix 
+                           ? (Addr) & log_0I_1D_cache_access
+                           : (Addr) & log_1I_1D_cache_access
+                           );
+                  argc = 2;
+                  if (!BB_seen_before)
+                     init_idCC(ReadCC, (idCC*)BBCC_ptr, x86_instr_addr,
+                               x86_instr_size, data_size);
+                  vg_assert(INVALID_TEMPREG != t_read_addr  && 
+                            INVALID_TEMPREG != t_read       && 
+                            INVALID_TEMPREG == t_write_addr &&
+                            INVALID_TEMPREG == t_write);
+                  t_data_addr1 = t_read_addr;
+
+               } else if (!IS_(read) && IS_(write)) {
+                  CC_size = sizeof(idCC);
+                  helper = ( has_rep_prefix 
+                           ? (Addr) & log_0I_1D_cache_access
+                           : (Addr) & log_1I_1D_cache_access
+                           );
+                  argc = 2;
+                  if (!BB_seen_before)
+                     init_idCC(WriteCC, (idCC*)BBCC_ptr, x86_instr_addr,
+                               x86_instr_size, data_size);
+                  vg_assert(INVALID_TEMPREG == t_read_addr  && 
+                            INVALID_TEMPREG == t_read       && 
+                            INVALID_TEMPREG != t_write_addr &&
+                            INVALID_TEMPREG != t_write);
+                  t_data_addr1 = t_write_addr;
+
+               } else {
+                  vg_assert(IS_(read) && IS_(write));
+                  vg_assert(INVALID_TEMPREG != t_read_addr  && 
+                            INVALID_TEMPREG != t_read       && 
+                            INVALID_TEMPREG != t_write_addr &&
+                            INVALID_TEMPREG != t_write);
+                  if (t_read == t_write) {
+                     CC_size = sizeof(idCC);
+                     helper = ( has_rep_prefix 
+                              ? (Addr) & log_0I_1D_cache_access
+                              : (Addr) & log_1I_1D_cache_access
+                              );
+                     argc = 2;
+                     if (!BB_seen_before)
+                        init_idCC(ModCC, (idCC*)BBCC_ptr, x86_instr_addr,
+                                  x86_instr_size, data_size);
+                     t_data_addr1 = t_read_addr;
+                  } else {
+                     CC_size = sizeof(iddCC);
+                     helper = ( has_rep_prefix 
+                              ? (Addr) & log_0I_2D_cache_access
+                              : (Addr) & log_1I_2D_cache_access
+                              );
+                     argc = 3;
+                     if (!BB_seen_before)
+                        init_iddCC((iddCC*)BBCC_ptr, x86_instr_addr,
+                                    x86_instr_size, data_size);
+                     t_data_addr1 = t_read_addr;
+                     t_data_addr2 = t_write_addr;
+                  }
+               }
+#undef IS_
+            }
+
+            /* Call the helper, if necessary */
+            if ((Addr)0 != helper) {
+
+               /* Setup 1st arg: CC addr */
+               t_CC_addr = newTemp(cb);
+               uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
+               uLiteral(cb, BBCC_ptr);
+
+               /* Call the helper */
+               if      (1 == argc)
+                  uInstr1(cb, CCALL, 0, TempReg, t_CC_addr);
+               else if (2 == argc)
+                  uInstr2(cb, CCALL, 0, TempReg, t_CC_addr, 
+                                        TempReg, t_data_addr1);
+               else if (3 == argc)
+                  uInstr3(cb, CCALL, 0, TempReg, t_CC_addr, 
+                                        TempReg, t_data_addr1,
+                                        TempReg, t_data_addr2);
+               else
+                  VG_(panic)("argc... not 1 or 2 or 3?");
+               
+               uCCall(cb, helper, argc, argc, False);
+            }
+
+            /* Copy original UInstr (INCEIP or JMP) */
+            VG_(copyUInstr)(cb, u_in);
+
+            /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */
+            BBCC_ptr       += CC_size; 
+            x86_instr_addr += x86_instr_size;
+            t_CC_addr = t_read_addr = t_write_addr = t_data_addr1 = 
+                        t_data_addr2 = t_read = t_write = INVALID_TEMPREG;
+            data_size = INVALID_DATA_SIZE;
+            has_rep_prefix = False; 
             break;
 
          default:
@@ -709,19 +975,25 @@
 
    VG_(freeCodeBlock)(cb_in);
    return cb;
+
+#undef INVALID_DATA_SIZE
 }
 
 /*------------------------------------------------------------*/
-/*--- Cache simulation stuff                               ---*/
+/*--- Automagic cache initialisation stuff                 ---*/
 /*------------------------------------------------------------*/
 
-#define MIN_LINE_SIZE   16
-
 /* Total reads/writes/misses.  Calculated during CC traversal at the end. */
 static CC Ir_total;
 static CC Dr_total;
 static CC Dw_total;
 
+#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 }) 
+
+static cache_t clo_I1_cache = UNDEFINED_CACHE;
+static cache_t clo_D1_cache = UNDEFINED_CACHE;
+static cache_t clo_L2_cache = UNDEFINED_CACHE;
+
 /* All CPUID info taken from sandpile.org/a32/cpuid.htm */
 /* Probably only works for Intel and AMD chips, and probably only for some of
  * them. 
@@ -739,7 +1011,7 @@
 static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
 {
     VG_(message)(Vg_DebugMsg, 
-       "warning: Pentium with %d K micro_op instruction trace cache", 
+       "warning: Pentium with %d K micro-op instruction trace cache", 
        actual_size);
     VG_(message)(Vg_DebugMsg, 
        "         Simulating a %d KB cache with %d B lines", 
@@ -755,6 +1027,7 @@
 {
    UChar info[16];
    Int   i, trials;
+   Bool  L2_found = False;
 
    if (level < 2) {
       VG_(message)(Vg_DebugMsg, 
@@ -782,8 +1055,9 @@
       case 0x0:       /* ignore zeros */
           break;
           
-      case 0x01: case 0x02: case 0x03: case 0x04:     /* TLB info, ignore */
-      case 0x90: case 0x96: case 0x9b:
+      /* TLB info, ignore */
+      case 0x01: case 0x02: case 0x03: case 0x04:
+      case 0x50: case 0x51: case 0x52: case 0x5b: case 0x5c: case 0x5d:
           break;      
 
       case 0x06: *I1c = (cache_t) {  8, 4, 32 }; break;
@@ -792,22 +1066,35 @@
       case 0x0a: *D1c = (cache_t) {  8, 2, 32 }; break;
       case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
 
+      /* IA-64 info -- panic! */
+      case 0x10: case 0x15: case 0x1a: 
+      case 0x88: case 0x89: case 0x8a: case 0x8d:
+      case 0x90: case 0x96: case 0x9b:
+         VG_(message)(Vg_DebugMsg,
+            "error: IA-64 cache stats!  Cachegrind doesn't run on IA-64...");
+         VG_(panic)("IA-64 detected");
+
       case 0x22: case 0x23: case 0x25: case 0x29: 
-      case 0x88: case 0x89: case 0x8a:
           VG_(message)(Vg_DebugMsg, 
              "warning: L3 cache detected but ignored\n");
           break;
 
-      case 0x40: 
-          VG_(message)(Vg_DebugMsg, 
-             "warning: L2 cache not installed, ignore L2 results.");
+      /* These are sectored, whatever that means */
+      case 0x39: *L2c = (cache_t) {  128, 4, 64 }; L2_found = True; break;
+      case 0x3c: *L2c = (cache_t) {  256, 4, 64 }; L2_found = True; break;
+
+      /* If a P6 core, this means "no L2 cache".  
+         If a P4 core, this means "no L3 cache".
+         We don't know what core it is, so don't issue a warning.  To detect
+         a missing L2 cache, we use 'L2_found'. */
+      case 0x40:
           break;
 
-      case 0x41: *L2c = (cache_t) {  128, 4, 32 };    break;
-      case 0x42: *L2c = (cache_t) {  256, 4, 32 };    break;
-      case 0x43: *L2c = (cache_t) {  512, 4, 32 };    break;
-      case 0x44: *L2c = (cache_t) { 1024, 4, 32 };    break;
-      case 0x45: *L2c = (cache_t) { 2048, 4, 32 };    break;
+      case 0x41: *L2c = (cache_t) {  128, 4, 32 }; L2_found = True; break;
+      case 0x42: *L2c = (cache_t) {  256, 4, 32 }; L2_found = True; break;
+      case 0x43: *L2c = (cache_t) {  512, 4, 32 }; L2_found = True; break;
+      case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
+      case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
 
       /* These are sectored, whatever that means */
       case 0x66: *D1c = (cache_t) {  8, 4, 64 };  break;      /* sectored */
@@ -832,24 +1119,31 @@
          micro_ops_warn(32, 32, 32); 
          break;  
 
-      case 0x79: *L2c = (cache_t) {  128, 8, 64 };    break;  /* sectored */
-      case 0x7a: *L2c = (cache_t) {  256, 8, 64 };    break;  /* sectored */
-      case 0x7b: *L2c = (cache_t) {  512, 8, 64 };    break;  /* sectored */
-      case 0x7c: *L2c = (cache_t) { 1024, 8, 64 };    break;  /* sectored */
+      /* These are sectored, whatever that means */
+      case 0x79: *L2c = (cache_t) {  128, 8,  64 }; L2_found = True;  break;
+      case 0x7a: *L2c = (cache_t) {  256, 8,  64 }; L2_found = True;  break;
+      case 0x7b: *L2c = (cache_t) {  512, 8,  64 }; L2_found = True;  break;
+      case 0x7c: *L2c = (cache_t) { 1024, 8,  64 }; L2_found = True;  break;
+      case 0x7e: *L2c = (cache_t) {  256, 8, 128 }; L2_found = True;  break;
 
-      case 0x81: *L2c = (cache_t) {  128, 8, 32 };    break;
-      case 0x82: *L2c = (cache_t) {  256, 8, 32 };    break;
-      case 0x83: *L2c = (cache_t) {  512, 8, 32 };    break;
-      case 0x84: *L2c = (cache_t) { 1024, 8, 32 };    break;
-      case 0x85: *L2c = (cache_t) { 2048, 8, 32 };    break;
+      case 0x81: *L2c = (cache_t) {  128, 8, 32 };  L2_found = True;  break;
+      case 0x82: *L2c = (cache_t) {  256, 8, 32 };  L2_found = True;  break;
+      case 0x83: *L2c = (cache_t) {  512, 8, 32 };  L2_found = True;  break;
+      case 0x84: *L2c = (cache_t) { 1024, 8, 32 };  L2_found = True;  break;
+      case 0x85: *L2c = (cache_t) { 2048, 8, 32 };  L2_found = True;  break;
 
       default:
           VG_(message)(Vg_DebugMsg, 
              "warning: Unknown Intel cache config value "
-             "(0x%x), ignoring\n", info[i]);
+             "(0x%x), ignoring", info[i]);
           break;
       }
    }
+
+   if (!L2_found)
+      VG_(message)(Vg_DebugMsg, 
+         "warning: L2 cache not installed, ignore L2 results.");
+
    return 0;
 }
 
@@ -871,12 +1165,16 @@
  * #3  The AMD K7 processor's L2 cache must be configured prior to relying 
  *     upon this information. (Whatever that means -- njn)
  *
+ * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model
+ * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
+ * so we detect that.
+ * 
  * Returns 0 on success, non-zero on failure.
  */
 static
 Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
 {
-   Int dummy, ext_level;
+   Int dummy, model, ext_level;
    Int I1i, D1i, L2i;
    
    cpuid(0x80000000, &ext_level, &dummy, &dummy, &dummy);
@@ -891,6 +1189,16 @@
    cpuid(0x80000005, &dummy, &dummy, &D1i, &I1i);
    cpuid(0x80000006, &dummy, &dummy, &L2i, &dummy);
 
+   cpuid(0x1, &model, &dummy, &dummy, &dummy);
+   /*VG_(message)(Vg_UserMsg,"CPU model %04x",model);*/
+
+   /* Check for Duron bug */
+   if (model == 0x630) {
+      VG_(message)(Vg_UserMsg,
+         "Buggy Duron stepping A0. Assuming L2 size=65536 bytes");
+      L2i = (64 << 16) | (L2i & 0xffff);
+   }
+
    D1c->size      = (D1i >> 24) & 0xff;
    D1c->assoc     = (D1i >> 16) & 0xff;
    D1c->line_size = (D1i >>  0) & 0xff;
@@ -1044,14 +1352,14 @@
    cache_t D1_dflt = (cache_t) {  65536, 2, 64 };
    cache_t L2_dflt = (cache_t) { 262144, 8, 64 };
 
-#define CMD_LINE_DEFINED(L)                 \
-   (-1 != VG_(clo_##L##_cache).size  ||     \
-    -1 != VG_(clo_##L##_cache).assoc ||     \
-    -1 != VG_(clo_##L##_cache).line_size)
+#define CMD_LINE_DEFINED(L)            \
+   (-1 != clo_##L##_cache.size  ||     \
+    -1 != clo_##L##_cache.assoc ||     \
+    -1 != clo_##L##_cache.line_size)
 
-   *I1c = VG_(clo_I1_cache);
-   *D1c = VG_(clo_D1_cache);
-   *L2c = VG_(clo_L2_cache);
+   *I1c = clo_I1_cache;
+   *D1c = clo_D1_cache;
+   *L2c = clo_L2_cache;
 
    /* If any undefined on command-line, try CPUID */
    if (! CMD_LINE_DEFINED(I1) ||
@@ -1061,9 +1369,9 @@
       /* Overwrite CPUID result for any cache defined on command-line */
       if (0 == get_caches_from_CPUID(I1c, D1c, L2c)) {
    
-         if (CMD_LINE_DEFINED(I1)) *I1c = VG_(clo_I1_cache);
-         if (CMD_LINE_DEFINED(D1)) *D1c = VG_(clo_D1_cache);
-         if (CMD_LINE_DEFINED(L2)) *L2c = VG_(clo_L2_cache);
+         if (CMD_LINE_DEFINED(I1)) *I1c = clo_I1_cache;
+         if (CMD_LINE_DEFINED(D1)) *D1c = clo_D1_cache;
+         if (CMD_LINE_DEFINED(L2)) *L2c = clo_L2_cache;
 
       /* CPUID failed, use defaults for each undefined by command-line */
       } else {
@@ -1071,9 +1379,9 @@
                       "Couldn't detect cache configuration, using one "
                       "or more defaults ");
 
-         *I1c = (CMD_LINE_DEFINED(I1) ? VG_(clo_I1_cache) : I1_dflt);
-         *D1c = (CMD_LINE_DEFINED(D1) ? VG_(clo_D1_cache) : D1_dflt);
-         *L2c = (CMD_LINE_DEFINED(L2) ? VG_(clo_L2_cache) : L2_dflt);
+         *I1c = (CMD_LINE_DEFINED(I1) ? clo_I1_cache : I1_dflt);
+         *D1c = (CMD_LINE_DEFINED(D1) ? clo_D1_cache : D1_dflt);
+         *L2c = (CMD_LINE_DEFINED(L2) ? clo_L2_cache : L2_dflt);
       }
    }
 #undef CMD_LINE_DEFINED
@@ -1093,65 +1401,8 @@
    }
 }
 
-void VG_(init_cachesim)(void)
-{
-   cache_t I1c, D1c, L2c; 
-
-   /* Make sure the output file can be written. */
-   Int fd = VG_(open_write)(OUT_FILE);
-   if (-1 == fd) { 
-      fd = VG_(create_and_write)(OUT_FILE);
-      if (-1 == fd) {
-         file_err(); 
-      }
-   }
-   VG_(close)(fd);
-
-   initCC(&Ir_total);
-   initCC(&Dr_total);
-   initCC(&Dw_total);
-   
-   initCC(&Ir_discards);
-   initCC(&Dr_discards);
-   initCC(&Dw_discards);
-
-   get_caches(&I1c, &D1c, &L2c);
-
-   cachesim_I1_initcache(I1c);
-   //cachesim_I1_initcache();
-   cachesim_D1_initcache(D1c);
-   //cachesim_D1_initcache();
-   cachesim_L2_initcache(L2c);
-   //cachesim_L2_initcache();
-
-   init_BBCC_table();
-}
-
-void VG_(cachesim_log_non_mem_instr)(iCC* cc)
-{
-   //VG_(printf)("sim  I: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
-   //            cc, cc->instr_addr, cc->instr_size)
-   VGP_PUSHCC(VgpCacheSimulate);
-   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
-   cc->I.a++;
-   VGP_POPCC;
-}
-
-void VG_(cachesim_log_mem_instr)(idCC* cc, Addr data_addr)
-{
-   //VG_(printf)("sim  D: CCaddr=0x%x, iaddr=0x%x, isize=%u, daddr=0x%x, dsize=%u\n",
-   //            cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size)
-   VGP_PUSHCC(VgpCacheSimulate);
-   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
-   cc->I.a++;
-
-   cachesim_D1_doref(data_addr,      cc->data_size,  &cc->D.m1, &cc->D.m2);
-   cc->D.a++;
-   VGP_POPCC;
-}
-
 /*------------------------------------------------------------*/
-/*--- Printing of output file and summary stats            ---*/
+/*--- SK_(fini)() and related function                     ---*/
 /*------------------------------------------------------------*/
 
 static void fprint_BBCC(Int fd, BBCC* BBCC_node, Char *first_instr_fl, 
@@ -1181,15 +1432,15 @@
       Addr instr_addr;
       switch ( ((iCC*)BBCC_ptr)->tag ) {
 
-         case INSTR_CC:
+         case InstrCC:
             instr_addr = ((iCC*)BBCC_ptr)->instr_addr;
             sprint_iCC(buf, (iCC*)BBCC_ptr);
             ADD_CC_TO(iCC, I, Ir_total);
             BBCC_ptr += sizeof(iCC);
             break;
 
-         case READ_CC:
-         case  MOD_CC:
+         case ReadCC:
+         case  ModCC:
             instr_addr = ((idCC*)BBCC_ptr)->instr_addr;
             sprint_read_or_mod_CC(buf, (idCC*)BBCC_ptr);
             ADD_CC_TO(idCC, I, Ir_total);
@@ -1197,7 +1448,7 @@
             BBCC_ptr += sizeof(idCC);
             break;
 
-         case WRITE_CC:
+         case WriteCC:
             instr_addr = ((idCC*)BBCC_ptr)->instr_addr;
             sprint_write_CC(buf, (idCC*)BBCC_ptr);
             ADD_CC_TO(idCC, I, Ir_total);
@@ -1205,6 +1456,15 @@
             BBCC_ptr += sizeof(idCC);
             break;
 
+         case ReadWriteCC:
+            instr_addr = ((iddCC*)BBCC_ptr)->instr_addr;
+            sprint_read_write_CC(buf, (iddCC*)BBCC_ptr);
+            ADD_CC_TO(iddCC, I,  Ir_total);
+            ADD_CC_TO(iddCC, Da, Dr_total);
+            ADD_CC_TO(iddCC, Db, Dw_total);
+            BBCC_ptr += sizeof(iddCC);
+            break;
+
          default:
             VG_(panic)("Unknown CC type in fprint_BBCC()\n");
             break;
@@ -1223,7 +1483,7 @@
 
       /* If the function name for this instruction doesn't match that of the
        * first instruction in the BB, print warning. */
-      if (VG_(clo_trace_symtab) && 0 != VG_(strcmp)(fn_buf, first_instr_fn)) {
+      if (VG_(clo_verbosity > 2) && 0 != VG_(strcmp)(fn_buf, first_instr_fn)) {
          VG_(printf)("Mismatched function names\n");
          VG_(printf)("  filenames: BB:%s, instr:%s;"
                      "  fn_names:  BB:%s, instr:%s;"
@@ -1251,8 +1511,7 @@
    vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
 }
 
-static void fprint_BBCC_table_and_calc_totals(Int client_argc, 
-                                              Char** client_argv)
+static void fprint_BBCC_table_and_calc_totals(void)
 {
    Int        fd;
    Char       buf[BUF_LEN];
@@ -1261,8 +1520,8 @@
    BBCC      *curr_BBCC;
    Int        i,j,k;
 
-   VGP_PUSHCC(VgpCacheDump);
-   fd = VG_(open_write)(OUT_FILE);
+   VGP_PUSHCC(VgpCacheResults);
+   fd = VG_(open)(cachegrind_out_file, VKI_O_WRONLY|VKI_O_TRUNC, 0);
    if (-1 == fd) { file_err(); }
 
    /* "desc:" lines (giving I1/D1/L2 cache configuration) */
@@ -1276,8 +1535,8 @@
    /* "cmd:" line */
    VG_(strcpy)(buf, "cmd:");
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-   for (i = 0; i < client_argc; i++) {
-       VG_(sprintf)(buf, " %s", client_argv[i]);
+   for (i = 0; i < VG_(client_argc); i++) {
+       VG_(sprintf)(buf, " %s", VG_(client_argv)[i]);
        VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
    }
    /* "events:" line */
@@ -1395,6 +1654,7 @@
    VG_(sprintf)(buf, "%d.%d%%", n / pow, n % pow);
    len = VG_(strlen)(buf);
    space = field_width - len;
+   if (space < 0) space = 0;     /* Allow for v. small field_width */
    i = len;
 
    /* Right justify in field */
@@ -1402,7 +1662,7 @@
    for (i = 0; i < space; i++)  buf[i] = ' ';
 }
 
-void VG_(do_cachesim_results)(Int client_argc, Char** client_argv)
+void SK_(fini)(void)
 {
    CC D_total;
    ULong L2_total_m, L2_total_mr, L2_total_mw,
@@ -1413,7 +1673,7 @@
    Int l1, l2, l3;
    Int p;
 
-   fprint_BBCC_table_and_calc_totals(client_argc, client_argv);
+   fprint_BBCC_table_and_calc_totals();
 
    if (VG_(clo_verbosity) == 0) 
       return;
@@ -1431,6 +1691,7 @@
 
    p = 100;
 
+   if (0 == Ir_total.a) Ir_total.a = 1;
    percentify(Ir_total.m1 * 100 * p / Ir_total.a, p, l1+1, buf1);
    VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);
                 
@@ -1464,6 +1725,9 @@
 
    p = 10;
    
+   if (0 == D_total.a)   D_total.a = 1;
+   if (0 == Dr_total.a) Dr_total.a = 1;
+   if (0 == Dw_total.a) Dw_total.a = 1;
    percentify( D_total.m1 * 100 * p / D_total.a,  p, l1+1, buf1);
    percentify(Dr_total.m1 * 100 * p / Dr_total.a, p, l2+1, buf2);
    percentify(Dw_total.m1 * 100 * p / Dw_total.a, p, l3+1, buf3);
@@ -1525,7 +1789,7 @@
        VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations);
        VG_(message)(Vg_DebugMsg, "Distinct instrs:  %d", distinct_instrs);
    }
-   VGP_POPCC;
+   VGP_POPCC(VgpCacheResults);
 }
 
 
@@ -1534,19 +1798,18 @@
  *
  * Finds the BBCC in the table, removes it, adds the counts to the discard
  * counters, and then frees the BBCC. */
-void VG_(cachesim_notify_discard) ( TTEntry* tte )
+void SK_(discard_basic_block_info) ( Addr a, UInt size )
 {
    BBCC *BBCC_node;
    Addr BBCC_ptr0, BBCC_ptr;
    Bool BB_seen_before;
     
    if (0)
-   VG_(printf)( "cachesim_notify_discard: %p for %d\n", 
-                tte->orig_addr, (Int)tte->orig_size);
+      VG_(printf)( "discard_basic_block_info: addr %p, size %u\n", a, size);
 
    /* 2nd arg won't be used since BB should have been seen before (assertions
     * ensure this). */
-   BBCC_node = get_BBCC(tte->orig_addr, NULL, True, &BB_seen_before);
+   BBCC_node = get_BBCC(a, NULL, /*remove=*/True, &BB_seen_before);
    BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
 
    vg_assert(True == BB_seen_before);
@@ -1559,33 +1822,182 @@
 
       switch ( ((iCC*)BBCC_ptr)->tag ) {
 
-         case INSTR_CC:
+         case InstrCC:
             ADD_CC_TO(iCC, I, Ir_discards);
             BBCC_ptr += sizeof(iCC);
             break;
 
-         case READ_CC:
-         case  MOD_CC:
+         case ReadCC:
+         case  ModCC:
             ADD_CC_TO(idCC, I, Ir_discards);
             ADD_CC_TO(idCC, D, Dr_discards);
             BBCC_ptr += sizeof(idCC);
             break;
 
-         case WRITE_CC:
+         case WriteCC:
             ADD_CC_TO(idCC, I, Ir_discards);
             ADD_CC_TO(idCC, D, Dw_discards);
             BBCC_ptr += sizeof(idCC);
             break;
 
+         case ReadWriteCC:
+            ADD_CC_TO(iddCC, I, Ir_discards);
+            ADD_CC_TO(iddCC, Da, Dr_discards);
+            ADD_CC_TO(iddCC, Db, Dw_discards);
+            BBCC_ptr += sizeof(iddCC);
+            break;
+
          default:
-            VG_(panic)("Unknown CC type in VG_(cachesim_notify_discard)()\n");
+            VG_(panic)("Unknown CC type in VG_(discard_basic_block_info)()\n");
             break;
       }
    }
-
-   VG_(free)(VG_AR_PRIVATE, BBCC_node);
+   VG_(free)(BBCC_node);
 }
 
 /*--------------------------------------------------------------------*/
+/*--- Command line processing                                      ---*/
+/*--------------------------------------------------------------------*/
+
+static void parse_cache_opt ( cache_t* cache, char* orig_opt, int opt_len )
+{
+   int   i1, i2, i3;
+   int   i;
+   char *opt = VG_(strdup)(orig_opt);
+
+   i = i1 = opt_len;
+
+   /* Option looks like "--I1=65536,2,64".
+    * Find commas, replace with NULs to make three independent 
+    * strings, then extract numbers.  Yuck. */
+   while (VG_(isdigit)(opt[i])) i++;
+   if (',' == opt[i]) {
+      opt[i++] = '\0';
+      i2 = i;
+   } else goto bad;
+   while (VG_(isdigit)(opt[i])) i++;
+   if (',' == opt[i]) {
+      opt[i++] = '\0';
+      i3 = i;
+   } else goto bad;
+   while (VG_(isdigit)(opt[i])) i++;
+   if ('\0' != opt[i]) goto bad;
+
+   cache->size      = (Int)VG_(atoll)(opt + i1);
+   cache->assoc     = (Int)VG_(atoll)(opt + i2);
+   cache->line_size = (Int)VG_(atoll)(opt + i3);
+
+   VG_(free)(opt);
+
+   return;
+
+  bad:
+   VG_(bad_option)(orig_opt);
+}
+
+Bool SK_(process_cmd_line_option)(Char* arg)
+{
+   /* 5 is length of "--I1=" */
+   if      (0 == VG_(strncmp)(arg, "--I1=", 5))
+      parse_cache_opt(&clo_I1_cache, arg,   5);
+   else if (0 == VG_(strncmp)(arg, "--D1=", 5))
+      parse_cache_opt(&clo_D1_cache, arg,   5);
+   else if (0 == VG_(strncmp)(arg, "--L2=", 5))
+      parse_cache_opt(&clo_L2_cache, arg,   5);
+   else
+      return False;
+
+   return True;
+}
+
+Char* SK_(usage)(void)
+{
+   return 
+"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
+"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
+"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n";
+}
+
+/*--------------------------------------------------------------------*/
+/*--- Setup                                                        ---*/
+/*--------------------------------------------------------------------*/
+
+void SK_(pre_clo_init)(VgNeeds* needs, VgTrackEvents* not_used) 
+{
+   needs->name                    = "cachegrind";
+   needs->description             = "an I1/D1/L2 cache profiler";
+
+   needs->basic_block_discards    = True;
+   needs->command_line_options    = True;
+
+   VG_(register_compact_helper)((Addr) & log_1I_0D_cache_access);
+   VG_(register_compact_helper)((Addr) & log_1I_0D_cache_access_JIFZ);
+   VG_(register_compact_helper)((Addr) & log_0I_1D_cache_access);
+   VG_(register_compact_helper)((Addr) & log_1I_1D_cache_access);
+   VG_(register_compact_helper)((Addr) & log_0I_2D_cache_access);
+   VG_(register_compact_helper)((Addr) & log_1I_2D_cache_access);
+}
+
+void SK_(post_clo_init)(void)
+{
+   cache_t I1c, D1c, L2c; 
+   Int fd;
+
+   /* Set output file name: cachegrind.<pid>.out */
+   VG_(sprintf)(cachegrind_out_file, "cachegrind.out.%d", VG_(getpid)());
+
+   /* Make sure the output file can be written. */
+   fd = VG_(open)(cachegrind_out_file, VKI_O_WRONLY|VKI_O_TRUNC, 0);
+   if (-1 == fd) { 
+      fd = VG_(open)(cachegrind_out_file, VKI_O_CREAT|VKI_O_WRONLY,
+                                          VKI_S_IRUSR|VKI_S_IWUSR);
+      if (-1 == fd) {
+         file_err(); 
+      }
+   }
+   VG_(close)(fd);
+
+   initCC(&Ir_total);
+   initCC(&Dr_total);
+   initCC(&Dw_total);
+   
+   initCC(&Ir_discards);
+   initCC(&Dr_discards);
+   initCC(&Dw_discards);
+
+   get_caches(&I1c, &D1c, &L2c);
+
+   cachesim_I1_initcache(I1c);
+   cachesim_D1_initcache(D1c);
+   cachesim_L2_initcache(L2c);
+
+   VGP_(register_profile_event)(VgpGetBBCC,       "get-BBCC");
+   VGP_(register_profile_event)(VgpCacheSimulate, "cache-simulate");
+   VGP_(register_profile_event)(VgpCacheResults,  "cache-results");
+   
+   init_BBCC_table();
+}
+
+#if 0
+Bool SK_(cheap_sanity_check)(void) { return True; }
+
+extern TTEntry* vg_tt;
+
+Bool SK_(expensive_sanity_check)(void)
+{ 
+   Int i;
+   Bool dummy;
+   for (i = 0; i < 200191; i++) {
+      if (vg_tt[i].orig_addr != (Addr)1 &&
+          vg_tt[i].orig_addr != (Addr)3) {
+         VG_(printf)(".");
+         get_BBCC(vg_tt[i].orig_addr, NULL, /*remove=*/True, &dummy);
+      }
+   }
+   return True;
+}
+#endif
+
+/*--------------------------------------------------------------------*/
 /*--- end                                            vg_cachesim.c ---*/
 /*--------------------------------------------------------------------*/