New files:
  - vg_cachesim.c
  - vg_cachesim_{I1,D1,L2}.c
  - vg_annotate.in
  - vg_cachegen.in

Changes to existing files:

  - valgrind/valgrind.in, added option:

        --cachesim=no|yes       [no]

  - Makefile/Makefile.am:
        * added vg_cachesim.c to valgrind_so_SOURCES var
        * added vg_cachesim_I1.c, vg_cachesim_D1.c, vg_cachesim_L2.c to
          noinst_HEADERS var
        * added vg_annotate, vg_cachegen to 'bin_SCRIPTS' var, and added empty
          targets for them

  - vg_main.c:
        * added two offsets for cache sim functions (put in positions 17a,17b)
        * added option handling (detection of --cachesim=yes, which turns off
          --instrument);
        * added calls to cachesim initialisation/finalisation functions

  - vg_mylibc: added some system call wrappers (for chmod, open_write, etc) for
    file writing

  - vg_symtab2.c:
        * allow it to read symbols if either of --instrument or --cachesim is
          used
        * made vg_symtab2.c:vg_what_{line,fn}_is_this extern, renaming it as
          VG_(what_line_is_this) (and added to vg_include.h)
        * completely rewrote the read loop in vg_read_lib_symbols, fixing
          several bugs.  Much better now, although probably not perfect.  It's
          also relatively fragile -- I'm using the "die immediately if anything
          unexpected happens" approach.

  - vg_to_ucode.c:
        * in VG_(disBB), patching in x86 instruction size into extra4b field of
          JMP instructions at the end of basic blocks if --cachesim=yes.
          Shifted things around to do this;  also had to fiddle around with
          single-step stuff to get this to work, by not sticking extra JMPs on
          the end of the single-instruction block if there was already one
          there (to avoid breaking an assertion in vg_cachesim.c).  Did a
          similar thing to avoid an extra JMP on huge basic blocks that are
          split.

  - vg_translate.c:
        * if --cachesim=yes call the cachesim instrumentation phase
        * made some functions extern and renamed:
                allocCodeBlock() --> VG_(allocCodeBlock)()
                freeCodeBlock()  --> VG_(freeCodeBlock)()
                copyUInstr()     --> VG_(copyUInstr)()
          (added to vg_include.h too)

  - vg_include.h: declared
        * cachesim offsets
        * exports of vg_cachesim.c
        * added four new profiling events (increasing VGP_M_CCS to 24 -- I kept
          the spare ones)
        * added comment about UInstr.extra4b field being used for instr size in
          JMPs for cache simulation

  - docs/manual.html:
        * Added --cachesim option to section 2.5.
        * Added cache profiling stuff as section 7.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@168 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
new file mode 100644
index 0000000..ea0cb41
--- /dev/null
+++ b/cachegrind/cg_main.c
@@ -0,0 +1,1068 @@
+/*--------------------------------------------------------------------*/
+/*--- The cache simulation framework: instrumentation, recording   ---*/
+/*--- and results printing.                                        ---*/
+/*---                                                vg_cachesim.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, an x86 protected-mode emulator 
+   designed for debugging and profiling binaries on x86-Unixes.
+
+   Copyright (C) 2000-2002 Julian Seward 
+      jseward@acm.org
+      Julian_Seward@muraroa.demon.co.uk
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file LICENSE.
+*/
+
+#include <string.h>
+
+#include "vg_include.h"
+
+#include "vg_cachesim_L2.c"
+#include "vg_cachesim_I1.c"
+#include "vg_cachesim_D1.c"
+
+
+/* According to IA-32 Intel Architecture Software Developer's Manual: Vol 2 */
+#define MAX_x86_INSTR_SIZE  16
+
+/* Size of various buffers used for storing strings */
+#define FILENAME_LEN                      256
+#define FN_NAME_LEN                       256
+#define BUF_LEN                           512
+#define COMMIFY_BUF_LEN                   128
+#define RESULTS_BUF                       128
+
+/*------------------------------------------------------------*/
+/*--- Output file related stuff                            ---*/
+/*------------------------------------------------------------*/
+
+#define OUT_FILE        "cachegrind.out"
+
+/* Report that the cache simulation output file (OUT_FILE) could not be
+ * opened, then call VG_(exit) with status 1. */
+static void file_err()
+{
+   const char* const out_name = OUT_FILE;
+
+   VG_(message)(Vg_UserMsg,
+                "FATAL: can't open cache simulation output file `%s'",
+                out_name);
+   VG_(exit)(1);
+}
+
+/*------------------------------------------------------------*/
+/*--- Cost center types, operations                        ---*/
+/*------------------------------------------------------------*/
+
+typedef struct _CC CC;
+struct _CC {
+   ULong a;     /* reference count; bumped once per call of a cachesim_log_* function */
+   ULong m1;    /* first miss counter handed to cachesim_*_doref; printed in the I1mr/D1mr/D1mw columns */
+   ULong m2;    /* second miss counter handed to cachesim_*_doref; printed in the I2mr/D2mr/D2mw columns */
+};
+
+/* Reset all three counters of the cost centre *cc to zero. */
+static __inline__ void initCC(CC* cc) {
+    cc->a = cc->m1 = cc->m2 = 0;
+}
+
+
+typedef enum { INSTR_CC, READ_CC, WRITE_CC, MOD_CC } CC_type;  /* value stored in the 'tag' byte of every CC */
+
+/* Instruction-level cost-centres.  The typedefs for these structs (iCC, idCC)
+ * are not in this file; the original note says they are in vg_include.c
+ * (presumably vg_include.h, the only project header included here -- TODO
+ * confirm).
+ *
+ * WARNING:  the 'tag' field *must* be the first byte of both CC types.
+ *           the 'instr_addr' *must* be the second word of both CC types.
+ *
+ * This is because we use them when we don't know what type of CC we're dealing
+ * with:  fprint_BBCC() casts every entry to iCC* to read 'tag' and
+ * 'instr_addr' before it knows which struct it is really looking at.
+ */ 
+struct _iCC {
+   /* word 1 */
+   UChar tag;          /* INSTR_CC, set by init_iCC() */
+   UChar instr_size;   /* x86 instruction length; the UInt argument of init_iCC() is narrowed to a byte */
+
+   /* words 2+ */
+   Addr instr_addr;    /* address of the x86 instruction this CC belongs to */
+   CC I;               /* instruction-fetch counts */
+};
+
+struct _idCC {
+   /* word 1 */
+   UChar tag;          /* READ_CC, WRITE_CC or MOD_CC, set by init_idCC() */
+   UChar instr_size;   /* x86 instruction length, narrowed to a byte */
+   UChar data_size;    /* size of the data access, narrowed to a byte */
+
+   /* words 2+ */
+   Addr instr_addr;    /* address of the x86 instruction this CC belongs to */
+   CC I;               /* instruction-fetch counts */
+   CC D;               /* data-access counts (read or write, according to 'tag') */
+};
+
+/* Set up *cc as an instruction-only cost centre (tag INSTR_CC) for the
+ * instruction at instr_addr of length instr_size, with zeroed counts. */
+static void init_iCC(iCC* cc, Addr instr_addr, UInt instr_size)
+{
+   initCC(&cc->I);
+   cc->instr_addr = instr_addr;
+   cc->instr_size = instr_size;
+   cc->tag        = INSTR_CC;
+}
+
+/* Set up *cc as an instruction+data cost centre.  X_CC becomes its tag;
+ * both the instruction and the data counts start at zero. */
+static void init_idCC(CC_type X_CC, idCC* cc, Addr instr_addr,
+                      UInt instr_size, UInt data_size)
+{
+   initCC(&cc->D);
+   initCC(&cc->I);
+   cc->instr_addr = instr_addr;
+   cc->data_size  = data_size;
+   cc->instr_size = instr_size;
+   cc->tag        = X_CC;
+}
+
+/* Format an instruction-only cost centre into buf as one output line:
+ * the line number ln followed by the three instruction counts. */
+static __inline__ void sprint_iCC(Char buf[BUF_LEN], UInt ln, iCC* cc)
+{
+   CC* instr = &cc->I;
+
+   VG_(sprintf)(buf, "%u %llu %llu %llu\n",
+                ln, instr->a, instr->m1, instr->m2);
+}
+
+/* Format a read (or modify) cost centre into buf as one output line:
+ * the line number ln, the three instruction counts, then the three data
+ * counts. */
+static __inline__ void sprint_read_or_mod_CC(Char buf[BUF_LEN], UInt ln, 
+                                             idCC* cc)
+{
+   CC* instr = &cc->I;
+   CC* data  = &cc->D;
+
+   VG_(sprintf)(buf, "%u %llu %llu %llu %llu %llu %llu\n",
+                ln, instr->a, instr->m1, instr->m2,
+                    data->a,  data->m1,  data->m2);
+}
+
+/* Format a write cost centre into buf as one output line:  the line number
+ * ln, the three instruction counts, three "." placeholders, then the three
+ * data counts. */
+static __inline__ void sprint_write_CC(Char buf[BUF_LEN], UInt ln, idCC* cc)
+{
+   CC* instr = &cc->I;
+   CC* data  = &cc->D;
+
+   VG_(sprintf)(buf, "%u %llu %llu %llu . . . %llu %llu %llu\n",
+                ln, instr->a, instr->m1, instr->m2,
+                    data->a,  data->m1,  data->m2);
+}
+
+/*------------------------------------------------------------*/
+/*--- BBCC hash table stuff                                ---*/
+/*------------------------------------------------------------*/
+
+/* The table of BBCCs is of the form hash(filename, hash(fn_name,
+ * hash(BBCCs))).  Each hash table is separately chained.  The sizes below work
+ * fairly well for Konqueror. */
+
+#define N_FILE_ENTRIES        251
+#define   N_FN_ENTRIES         53
+#define N_BBCC_ENTRIES         37
+
+/* The cost centres for a basic block are stored in a contiguous array.
+ * They are distinguishable by their tag field.  The array holds iCC and idCC
+ * records packed back to back;  it is walked by byte offset, stepping by
+ * sizeof(iCC) or sizeof(idCC) per entry.  new_BBCC() allocates the array but
+ * does not initialise its contents. */
+typedef struct _BBCC BBCC;
+struct _BBCC {
+   Addr  orig_addr;     /* lookup key: the bb_orig_addr given to new_BBCC() */
+   UInt  array_size;    /* byte-size of variable length array */
+   BBCC* next;          /* next BBCC on the same hash chain */
+   Addr  array[0];      /* variable length array */
+};
+
+typedef struct _fn_node fn_node;
+struct _fn_node {
+   Char*    fn_name;                 /* private copy made with VG_(strdup) in new_fn_node() */
+   BBCC*    BBCCs[N_BBCC_ENTRIES];   /* chained hash table of BBCCs, indexed by address % N_BBCC_ENTRIES */
+   fn_node* next;                    /* next function node on the same hash chain */
+};
+
+typedef struct _file_node file_node;
+struct _file_node {
+   Char*      filename;             /* private copy made with VG_(strdup) in new_file_node() */
+   fn_node*   fns[N_FN_ENTRIES];    /* chained hash table of function nodes, indexed by hash(fn_name) */
+   file_node* next;                 /* next file node on the same hash chain */
+};
+
+/* BBCC_table structure:  list(filename, list(fn_name, list(BBCC))) 
+ * Indexed by hash(filename, N_FILE_ENTRIES);  each slot is a chain of
+ * file_nodes.  The counters below are statistics only. */
+file_node *BBCC_table[N_FILE_ENTRIES];
+
+Int  distinct_files      = 0;   /* file_nodes created by get_BBCC() */
+Int  distinct_fns        = 0;   /* fn_nodes created by get_BBCC() */
+
+Int  distinct_instrs     = 0;   /* cost centres visited by fprint_BBCC() */
+Int  full_debug_BBs      = 0;   /* get_file_fn_names() calls that found both file and function */
+Int  file_line_debug_BBs = 0;   /* ... that found the file only */
+Int  fn_name_debug_BBs   = 0;   /* ... that found the function only */
+Int  no_debug_BBs        = 0;   /* ... that found neither */
+
+Int  BB_retranslations   = 0;   /* get_BBCC() calls that found an existing BBCC */
+
+/* Empty every slot of the top-level (per-filename) hash table. */
+static void init_BBCC_table()
+{
+   file_node** slot = BBCC_table;
+   file_node** end  = BBCC_table + N_FILE_ENTRIES;
+
+   while (slot != end) {
+      *slot = NULL;
+      slot++;
+   }
+}
+
+/* Look up the source file name and (undemangled) function name for
+ * instr_addr.  Whichever of the two lookups fails has its buffer set to
+ * "???".  Exactly one of the four *_debug_BBs counters is incremented per
+ * call, according to which lookups succeeded. */
+static void get_file_fn_names(Addr instr_addr, Char filename[FILENAME_LEN],
+                       Char fn_name[FN_NAME_LEN])
+{
+   UInt unused_line_num;
+   Bool have_file;
+   Bool have_fn;
+
+   have_file = VG_(what_line_is_this)(instr_addr, filename,
+                                      FILENAME_LEN, &unused_line_num);
+   have_fn   = VG_(what_fn_is_this)(False /* no_demangle */, instr_addr,
+                                    fn_name, FN_NAME_LEN);
+
+   if (have_file) {
+      if (have_fn) {
+         full_debug_BBs++;
+      } else {
+         file_line_debug_BBs++;
+         VG_(strcpy)(fn_name, "???");
+      }
+      return;
+   }
+
+   /* No file information from here on. */
+   VG_(strcpy)(filename, "???");
+   if (have_fn) {
+      fn_name_debug_BBs++;
+   } else {
+      no_debug_BBs++;
+      VG_(strcpy)(fn_name, "???");
+   }
+}
+
+/* Forward declaration. */
+static Int compute_BBCC_array_size(UCodeBlock* cb);
+
+/* Allocate a file_node holding a private copy of filename, with an empty
+ * function table, linked in front of next. */
+static __inline__
+file_node* new_file_node(Char filename[FILENAME_LEN], file_node* next)
+{
+   file_node* node = VG_(malloc)(VG_AR_PRIVATE, sizeof(file_node));
+   Int        slot;
+
+   node->filename = VG_(strdup)(VG_AR_PRIVATE, filename);
+   node->next     = next;
+   for (slot = N_FN_ENTRIES - 1; slot >= 0; slot--)
+      node->fns[slot] = NULL;
+
+   return node;
+}
+
+/* Allocate a fn_node holding a private copy of fn_name, with an empty BBCC
+ * table, linked in front of next.
+ *
+ * The parameter is a function name, so its declared bound is FN_NAME_LEN
+ * (it was FILENAME_LEN), matching get_file_fn_names().  The bound of an
+ * array parameter is documentation only, so callers are unaffected. */
+static __inline__ 
+fn_node* new_fn_node(Char fn_name[FN_NAME_LEN], fn_node* next)
+{
+   Int i;
+   fn_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(fn_node));
+   new->fn_name = VG_(strdup)(VG_AR_PRIVATE, fn_name);
+   for (i = 0; i < N_BBCC_ENTRIES; i++) {
+      new->BBCCs[i] = NULL;
+   }
+   new->next    = next;
+   return new;
+}
+
+/* Allocate a BBCC for the basic block at bb_orig_addr, with a trailing
+ * array sized by compute_BBCC_array_size(cb), linked in front of next.
+ * The array contents are left uninitialised here. */
+static __inline__
+BBCC* new_BBCC(Addr bb_orig_addr, UCodeBlock* cb, BBCC* next)
+{
+   Int   n_array_bytes = compute_BBCC_array_size(cb);
+   BBCC* node;
+
+   node = (BBCC*)VG_(malloc)(VG_AR_PRIVATE, sizeof(BBCC) + n_array_bytes);
+   node->next       = next;
+   node->array_size = n_array_bytes;
+   node->orig_addr  = bb_orig_addr;
+
+   return node;
+}
+
+#define HASH_CONSTANT   256
+
+/* Hash the NUL-terminated string s:  each character is folded in as
+ * (HASH_CONSTANT * previous + character) % table_size.  An empty string
+ * hashes to 0. */
+static UInt hash(Char *s, UInt table_size)
+{
+    int   hash_value = 0;
+    Char* p          = s;
+
+    while (*p) {
+        hash_value = (HASH_CONSTANT * hash_value + *p) % table_size;
+        p++;
+    }
+    return hash_value;
+}
+
+/* Do a three step traversal: by filename, then fn_name, then instr_addr.
+ * In all cases prepends new nodes to their chain.  Returns a pointer to the
+ * cost centre.  Also sets BB_seen_before by reference. 
+ *
+ *   bb_orig_addr    address of the basic block;  used both to look up the
+ *                   file/function names and as the key in the innermost table.
+ *   cb              the block's UCode;  only used (via new_BBCC) to size the
+ *                   cost-centre array when the block has not been seen before.
+ *   BB_seen_before  set to False if a new BBCC was created, True if an
+ *                   existing one was found (a retranslation).
+ *
+ * String comparisons go through VG_(strcmp), as elsewhere in this file,
+ * rather than libc's strcmp.
+ */ 
+static __inline__ BBCC* get_BBCC(Addr bb_orig_addr, UCodeBlock* cb, 
+                                 Bool *BB_seen_before)
+{
+   file_node *curr_file_node;
+   fn_node   *curr_fn_node;
+   BBCC      *curr_BBCC;
+   Char       filename[FILENAME_LEN], fn_name[FN_NAME_LEN];
+   UInt       filename_hash, fnname_hash, BBCC_hash;
+
+   get_file_fn_names(bb_orig_addr, filename, fn_name);
+
+   VGP_PUSHCC(VgpCacheGetBBCC);
+
+   /* Step 1: find (or create) the node for this file. */
+   filename_hash = hash(filename, N_FILE_ENTRIES);
+   curr_file_node = BBCC_table[filename_hash];
+   while (NULL != curr_file_node && 
+          VG_(strcmp)(filename, curr_file_node->filename) != 0) {
+      curr_file_node = curr_file_node->next;
+   }
+   if (NULL == curr_file_node) {
+      BBCC_table[filename_hash] = curr_file_node = 
+         new_file_node(filename, BBCC_table[filename_hash]);
+      distinct_files++;
+   }
+
+   /* Step 2: find (or create) the node for this function within the file. */
+   fnname_hash = hash(fn_name, N_FN_ENTRIES);
+   curr_fn_node = curr_file_node->fns[fnname_hash];
+   while (NULL != curr_fn_node && 
+          VG_(strcmp)(fn_name, curr_fn_node->fn_name) != 0) {
+      curr_fn_node = curr_fn_node->next;
+   }
+   if (NULL == curr_fn_node) {
+      curr_file_node->fns[fnname_hash] = curr_fn_node = 
+         new_fn_node(fn_name, curr_file_node->fns[fnname_hash]);
+      distinct_fns++;
+   }
+
+   /* Step 3: find (or create) the BBCC for this address within the function. */
+   BBCC_hash = bb_orig_addr % N_BBCC_ENTRIES;
+   curr_BBCC = curr_fn_node->BBCCs[BBCC_hash];
+   while (NULL != curr_BBCC && bb_orig_addr != curr_BBCC->orig_addr) {
+      curr_BBCC = curr_BBCC->next;
+   }
+   if (curr_BBCC == NULL) {
+      curr_fn_node->BBCCs[BBCC_hash] = curr_BBCC = 
+         new_BBCC(bb_orig_addr, cb, curr_fn_node->BBCCs[BBCC_hash]);
+      *BB_seen_before = False;
+
+   } else {
+      /* Sanity-check the existing entry before handing it back. */
+      vg_assert(bb_orig_addr == curr_BBCC->orig_addr);
+      vg_assert(curr_BBCC->array_size > 0 && curr_BBCC->array_size < 1000000);
+      if (VG_(clo_verbosity) > 1) {
+          VG_(message)(Vg_DebugMsg, "BB retranslation, retrieving from BBCC table");
+      }
+      *BB_seen_before = True;
+      BB_retranslations++;
+   }
+   VGP_POPCC;
+   return curr_BBCC;
+}
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation instrumentation phase               ---*/
+/*------------------------------------------------------------*/
+
+#define uInstr1   VG_(newUInstr1)        /* short local aliases for VG_() helpers used when emitting UCode */
+#define uInstr2   VG_(newUInstr2)
+#define uInstr3   VG_(newUInstr3)
+#define dis       VG_(disassemble)       /* not referenced in the code visible here */
+#define uLiteral  VG_(setLiteralField)
+#define newTemp   VG_(getNewTemp)
+
+/* Counting pass over a basic block's UCode:  returns the number of bytes
+ * needed for its cost-centre array.  Each x86 instruction (ended by an INCEIP
+ * or an unconditional JMP) contributes sizeof(idCC) if any LOAD, STORE, FPU_R
+ * or FPU_W was seen since the previous instruction end, else sizeof(iCC). */
+static Int compute_BBCC_array_size(UCodeBlock* cb)
+{
+   Int  i;
+   Int  BBCC_size = 0;
+   Bool is_LOAD  = False;
+   Bool is_STORE = False;
+   Bool is_FPU_R = False;
+   Bool is_FPU_W = False;
+
+   for (i = 0; i < cb->used; i++) {
+      UInstr* u_in         = &cb->instrs[i];
+      Bool    end_of_instr = False;
+      Int     CC_size;
+
+      switch (u_in->opcode) {
+
+         case INCEIP:
+            end_of_instr = True;
+            break;
+
+         case JMP:
+            /* Conditional jumps do not close an instruction here. */
+            if (u_in->cond == CondAlways)
+               end_of_instr = True;
+            break;
+
+         case LOAD:
+            /* More than one LOAD is possible for a single instruction */
+            vg_assert(!is_STORE && !is_FPU_R && !is_FPU_W);
+            is_LOAD = True;
+            break;
+
+         case STORE:
+            /* Multiple STOREs are possible for 'pushal' */
+            vg_assert(!is_FPU_R && !is_FPU_W);
+            is_STORE = True;
+            break;
+
+         case FPU_R:
+            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_FPU_R = True;
+            break;
+
+         case FPU_W:
+            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_FPU_W = True;
+            break;
+
+         default:
+            break;
+      }
+
+      if (!end_of_instr)
+         continue;
+
+      /* Close off this x86 instruction and start afresh for the next. */
+      if (is_LOAD || is_STORE || is_FPU_R || is_FPU_W)
+         CC_size = sizeof(idCC);
+      else
+         CC_size = sizeof(iCC);
+
+      BBCC_size += CC_size;
+      is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
+   }
+
+   return BBCC_size;
+}
+
+/* Use this rather than eg. -1 because it's stored as a UInt. */
+#define INVALID_DATA_SIZE   999999
+
+UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr)
+{  /* Cache-simulation instrumentation pass over one basic block.
+    *
+    * Builds and returns a new UCodeBlock holding copies of the UInstrs of
+    * cb_in plus, at the end of each x86 instruction, a CALLM to one of the
+    * two cachesim_log_* helpers.  The helper is passed the address of that
+    * instruction's cost centre and, for memory-referencing instructions, the
+    * data address.
+    *
+    *   cb_in      UCode of the block;  freed before returning.
+    *   orig_addr  address of the block;  keys the BBCC lookup and is the
+    *              address of the first instruction (later instruction
+    *              addresses are derived by adding instruction sizes).
+    *
+    * Cost centres are initialised here only the first time a block is seen
+    * (BB_seen_before False);  on a retranslation the existing counts are
+    * kept.  NOP, CALLM_S and CALLM_E UInstrs are not copied to the output. */
+   UCodeBlock* cb;
+   Int         i;
+   UInstr*     u_in;
+   BBCC*       BBCC_node;
+   Int         t_CC_addr, t_read_addr, t_write_addr, t_data_addr;
+   Int         CC_size = -1;    /* Shut gcc warnings up */
+   Addr        instr_addr = orig_addr;
+   UInt        instr_size, data_size = INVALID_DATA_SIZE;
+   Int         helper = -1;     /* Shut gcc warnings up */
+   UInt        stack_used;
+   Bool        BB_seen_before       = False;
+   Bool        prev_instr_was_Jcond = False;
+   Addr        BBCC_ptr0, BBCC_ptr; 
+
+   /* Get BBCC (creating if necessary -- requires a counting pass over the BB
+    * if it's the first time it's been seen), and point to start of the 
+    * BBCC array.  */
+   BBCC_node = get_BBCC(orig_addr, cb_in, &BB_seen_before);
+   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
+
+   cb = VG_(allocCodeBlock)();
+   cb->nextTemp = cb_in->nextTemp;
+
+   t_CC_addr = t_read_addr = t_write_addr = t_data_addr = INVALID_TEMPREG;
+
+   for (i = 0; i < cb_in->used; i++) {
+      u_in = &cb_in->instrs[i];
+
+      //VG_(ppUInstr)(0, u_in);
+
+      /* What this is all about:  we want to instrument each x86 instruction 
+       * translation.  The end of these are marked in three ways.  The three
+       * ways, and the way we instrument them, are as follows:
+       *
+       * 1. UCode, INCEIP         --> UCode, Instrumentation, INCEIP
+       * 2. UCode, Juncond        --> UCode, Instrumentation, Juncond
+       * 3. UCode, Jcond, Juncond --> UCode, Instrumentation, Jcond, Juncond
+       *
+       * We must put the instrumentation before the jumps so that it is always
+       * executed.  We don't have to put the instrumentation before the INCEIP
+       * (it could go after) but we do so for consistency.
+       *
+       * Junconds are always the last instruction in a basic block.  Jconds are
+       * always the 2nd last, and must be followed by a Juncond.  We check this
+       * with various assertions.
+       *
+       * Note that in VG_(disBB) we patched the `extra4b' field of the first
+       * occurring JMP in a block with the size of its x86 instruction.  This
+       * is used now.
+       *
+       * Note that we don't have to treat JIFZ specially;  unlike JMPs, JIFZ
+       * occurs in the middle of a BB and gets an INCEIP after it.
+       *
+       * The instrumentation is just a call to the appropriate helper function,
+       * passing it the address of the instruction's CC.
+       */
+      if (prev_instr_was_Jcond) vg_assert(u_in->opcode == JMP);
+
+      switch (u_in->opcode) {
+
+         case INCEIP:
+            instr_size = u_in->val1;   /* INCEIP's val1 is taken as the x86 instruction size */
+            goto case_for_end_of_x86_instr;
+
+         case JMP:
+            if (u_in->cond == CondAlways) {
+               vg_assert(i+1 == cb_in->used);   /* last instr in block */
+
+               /* Don't instrument if previous instr was a Jcond. */
+               if (prev_instr_was_Jcond) {
+                  vg_assert(0 == u_in->extra4b);
+                  VG_(copyUInstr)(cb, u_in);
+                  break;
+               }
+               prev_instr_was_Jcond = False;   /* already False on this path */
+
+            } else {
+               vg_assert(i+2 == cb_in->used);  /* 2nd last instr in block */
+               prev_instr_was_Jcond = True;
+            }
+
+            /* Ah, the first JMP... instrument, please. */
+            instr_size = u_in->extra4b;
+            goto case_for_end_of_x86_instr;
+
+            /* Shared code that is executed at the end of an x86 translation
+             * block, marked by either an INCEIP or the first JMP of the
+             * block (which may be conditional, see above). */
+            case_for_end_of_x86_instr:
+
+#define IS_(X)      (INVALID_TEMPREG != t_##X##_addr)
+             
+            /* Initialise the CC in the BBCC array appropriately if it hasn't
+             * been initialised before.
+             * Then call appropriate sim function, passing it the CC address.
+             * Note that CALLM_S/CALLM_E aren't required here;  by this point,
+             * the checking related to them has already happened. */
+            stack_used = 0;
+
+            vg_assert(instr_size >= 1 && instr_size <= MAX_x86_INSTR_SIZE);
+            vg_assert(0 != instr_addr);
+
+            /* Save the caller-save registers before we push our args */
+            uInstr1(cb, PUSH, 4, RealReg, R_EAX);
+            uInstr1(cb, PUSH, 4, RealReg, R_ECX);
+            uInstr1(cb, PUSH, 4, RealReg, R_EDX);
+
+            if (!IS_(read) && !IS_(write)) {
+               /* No memory reference seen in this instruction: plain iCC. */
+               iCC* CC_ptr = (iCC*)(BBCC_ptr);
+               vg_assert(INVALID_DATA_SIZE == data_size);
+               vg_assert(INVALID_TEMPREG == t_read_addr && 
+                         INVALID_TEMPREG == t_write_addr);
+               CC_size = sizeof(iCC);
+               if (!BB_seen_before)
+                   init_iCC(CC_ptr, instr_addr, instr_size);
+
+               helper = VGOFF_(cachesim_log_non_mem_instr);
+
+            } else { 
+               /* At least one memory reference seen: idCC, tagged by kind. */
+               CC_type X_CC;
+               idCC* CC_ptr = (idCC*)(BBCC_ptr);
+                
+               vg_assert(4 == data_size || 2  == data_size || 1 == data_size || 
+                         8 == data_size || 10 == data_size);
+               
+               CC_size = sizeof(idCC);
+               helper = VGOFF_(cachesim_log_mem_instr);
+
+               if (IS_(read) && !IS_(write)) {
+                  X_CC = READ_CC;
+                  vg_assert(INVALID_TEMPREG != t_read_addr && 
+                            INVALID_TEMPREG == t_write_addr);
+                  t_data_addr = t_read_addr;
+
+               } else if (!IS_(read) && IS_(write)) {
+                  X_CC = WRITE_CC;
+                  vg_assert(INVALID_TEMPREG == t_read_addr && 
+                            INVALID_TEMPREG != t_write_addr);
+                  t_data_addr = t_write_addr;
+
+               } else {
+                  vg_assert(IS_(read) && IS_(write));
+                  X_CC = MOD_CC;
+                  vg_assert(INVALID_TEMPREG != t_read_addr && 
+                            INVALID_TEMPREG != t_write_addr);
+                  t_data_addr = t_read_addr;   /* a modify is logged at its read address */
+               }
+
+               if (!BB_seen_before)
+                  init_idCC(X_CC, CC_ptr, instr_addr, instr_size, data_size);
+
+               /* 2nd arg: data addr */
+               uInstr1(cb, PUSH,  4, TempReg, t_data_addr);
+               stack_used += 4;
+            }
+#undef IS_
+
+            /* 1st arg: CC addr */
+            t_CC_addr = newTemp(cb);
+            uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
+            uLiteral(cb, BBCC_ptr);
+            uInstr1(cb, PUSH,  4, TempReg, t_CC_addr);
+            stack_used += 4;
+
+            /* Call function and return. */
+            uInstr1(cb, CALLM, 0, Lit16,   helper);
+            uInstr1(cb, CLEAR, 0, Lit16,   stack_used);
+
+            /* Restore the caller-save registers now the call is done */
+            uInstr1(cb, POP, 4, RealReg, R_EDX);
+            uInstr1(cb, POP, 4, RealReg, R_ECX);
+            uInstr1(cb, POP, 4, RealReg, R_EAX);
+
+            /* The INCEIP/JMP itself goes after the instrumentation. */
+            VG_(copyUInstr)(cb, u_in);
+
+            /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */
+            BBCC_ptr   += CC_size; 
+            instr_addr += instr_size;
+            t_CC_addr = t_read_addr = t_write_addr = 
+                                      t_data_addr  = INVALID_TEMPREG;
+            data_size = INVALID_DATA_SIZE;
+            break;
+
+
+         /* For memory-ref instrs, copy the data_addr into a temporary to be
+          * passed to the cachesim_log_function at the end of the instruction.
+          * If an x86 instruction contains several of these, each one
+          * overwrites the temp and data_size recorded by the previous one.
+          */
+         case LOAD: 
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val1,  TempReg, t_read_addr);   /* address taken from val1 */
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         case FPU_R:
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val2,  TempReg, t_read_addr);   /* address taken from val2 */
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         /* Note that we must set t_write_addr even for mod instructions;
+          * that's how the code above determines whether it does a write;
+          * without it, it would think a mod instruction is a read.
+          * As for the MOV, if it's a mod instruction it's redundant, but it's
+          * not expensive and mod instructions are rare anyway. */
+         case STORE:
+         case FPU_W:
+            t_write_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_write_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         case NOP:  case CALLM_E:  case CALLM_S:   /* dropped: not copied to cb */
+            break;
+
+         default:
+            VG_(copyUInstr)(cb, u_in);
+            break;
+      }
+   }
+
+   /* Just check everything looks ok:  the CCs laid out here must exactly fill
+    * the array sized by compute_BBCC_array_size(). */
+   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
+
+   VG_(freeCodeBlock)(cb_in);
+   return cb;
+}
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation stuff                               ---*/
+/*------------------------------------------------------------*/
+
+/* Total reads/writes/misses.  Calculated during CC traversal at the end.
+ * fprint_BBCC() adds every CC's I counts to Ir_total, the D counts of READ_CC
+ * and MOD_CC entries to Dr_total, and the D counts of WRITE_CC entries to
+ * Dw_total. */
+static CC Ir_total;
+static CC Dr_total;
+static CC Dw_total;
+
+/* Start-up for the cache simulation:  check that OUT_FILE can be opened for
+ * writing (creating it if needed, and calling file_err() if even that fails),
+ * zero the totals, initialise the three simulated caches and empty the BBCC
+ * table. */
+void VG_(init_cachesim)(void)
+{
+   Int out_fd;
+
+   /* Try the existing file first, then fall back to creating it. */
+   out_fd = VG_(open_write)(OUT_FILE);
+   if (out_fd == -1)
+      out_fd = VG_(create_and_write)(OUT_FILE);
+   if (out_fd == -1)
+      file_err();
+   VG_(close)(out_fd);
+
+   initCC(&Ir_total);
+   initCC(&Dr_total);
+   initCC(&Dw_total);
+
+   cachesim_I1_initcache();
+   cachesim_D1_initcache();
+   cachesim_L2_initcache();
+
+   init_BBCC_table();
+}
+
+/* Helper called from instrumented code for an instruction with no memory
+ * reference:  simulate the instruction fetch and count one instruction
+ * reference in cc. */
+void VG_(cachesim_log_non_mem_instr)(iCC* cc)
+{
+   CC* instr_cc = &cc->I;
+
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size,
+                     &instr_cc->m1, &instr_cc->m2);
+   instr_cc->a += 1;
+   VGP_POPCC;
+}
+
+/* Helper called from instrumented code for an instruction that references
+ * memory:  simulate the instruction fetch, then the data access at
+ * data_addr, counting one reference of each kind in cc. */
+void VG_(cachesim_log_mem_instr)(idCC* cc, Addr data_addr)
+{
+   CC* instr_cc = &cc->I;
+   CC* data_cc  = &cc->D;
+
+   VGP_PUSHCC(VgpCacheSimulate);
+
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size,
+                     &instr_cc->m1, &instr_cc->m2);
+   instr_cc->a += 1;
+
+   cachesim_D1_doref(data_addr, cc->data_size,
+                     &data_cc->m1, &data_cc->m2);
+   data_cc->a += 1;
+
+   VGP_POPCC;
+}
+
+/*------------------------------------------------------------*/
+/*--- Printing of output file and summary stats            ---*/
+/*------------------------------------------------------------*/
+
+/* Return the source line number recorded for instr_addr, or 0 if the
+ * lookup fails.  The file name found by the lookup is discarded. */
+int get_line_num(Addr instr_addr) 
+{
+   Char scratch_filename[FILENAME_LEN] = "???";
+   UInt line_num;
+
+   if (!VG_(what_line_is_this)(instr_addr, scratch_filename,
+                               FILENAME_LEN, &line_num)) {
+      return 0;
+   }
+   return line_num;
+}
+
+/* Write the cost centres of one basic block to fd and add their counts to
+ * the running totals (Ir_total, Dr_total, Dw_total).
+ *
+ *   fd              output file descriptor.
+ *   BBCC_node       the block whose cost-centre array is printed.
+ *   first_instr_fl  file name the block is filed under in the BBCC table.
+ *   first_instr_fn  function name the block is filed under.
+ *
+ * A blank line is written first, then one line per cost centre.  If an
+ * instruction's file name differs from the current one, an "fi=" line is
+ * written before it;  if the block ends under a different file name than it
+ * started with, a closing "fe=" line restores the original.
+ *
+ * All string comparisons use VG_(strcmp);  the function previously mixed it
+ * with libc's strcmp.
+ */
+static void fprint_BBCC(Int fd, BBCC* BBCC_node, Char *first_instr_fl, 
+                                                 Char *first_instr_fn)
+{
+   Addr BBCC_ptr0, BBCC_ptr;
+   Char buf[BUF_LEN], curr_file[BUF_LEN], fbuf[BUF_LEN+4];
+   UInt line_num;
+
+   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
+
+   VG_(write)(fd, (void*)"\n", 1);
+
+   VG_(strcpy)(curr_file, first_instr_fl);
+   
+   while (BBCC_ptr - BBCC_ptr0 < BBCC_node->array_size) {
+
+      /* We pretend the CC is an iCC for getting the tag.  This is ok
+       * because both CC types have tag as their first byte.  Once we know
+       * the type, we can cast and act appropriately. */
+
+      Char fl_buf[FILENAME_LEN];
+      Char fn_buf[FN_NAME_LEN];
+
+      /* Assumes instr_addr position is same for both CCs. */
+      Addr instr_addr = ((iCC*)BBCC_ptr)->instr_addr;
+      get_file_fn_names(instr_addr, fl_buf, fn_buf);
+
+      /* Allow for filename switching in the middle of a BB;  if this happens,
+       * must print the new filename with the function name. */
+      if (0 != VG_(strcmp)(fl_buf, curr_file)) {
+         VG_(strcpy)(curr_file, fl_buf);
+         VG_(sprintf)(fbuf, "fi=%s\n", curr_file);
+         VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
+      }
+
+      switch ( ((iCC*)BBCC_ptr)->tag ) {
+
+/* Add the three counts of field `cc' of the current entry to `total'. */
+#define ADD_CC_TO(CC_type, cc, total)           \
+   total.a  += ((CC_type*)BBCC_ptr)->cc.a;      \
+   total.m1 += ((CC_type*)BBCC_ptr)->cc.m1;     \
+   total.m2 += ((CC_type*)BBCC_ptr)->cc.m2;
+          
+         case INSTR_CC:
+            line_num = get_line_num(((iCC*)BBCC_ptr)->instr_addr);
+            sprint_iCC(buf, line_num, (iCC*)BBCC_ptr);
+            ADD_CC_TO(iCC, I, Ir_total);
+            BBCC_ptr += sizeof(iCC);
+            break;
+
+         /* Modifies are printed and totalled as reads. */
+         case READ_CC:
+         case  MOD_CC:
+            line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr);
+            sprint_read_or_mod_CC(buf, line_num, (idCC*)BBCC_ptr);
+            ADD_CC_TO(idCC, I, Ir_total);
+            ADD_CC_TO(idCC, D, Dr_total);
+            BBCC_ptr += sizeof(idCC);
+            break;
+
+         case WRITE_CC:
+            line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr);
+            sprint_write_CC(buf, line_num, (idCC*)BBCC_ptr);
+            ADD_CC_TO(idCC, I, Ir_total);
+            ADD_CC_TO(idCC, D, Dw_total);
+            BBCC_ptr += sizeof(idCC);
+            break;
+
+#undef ADD_CC_TO
+
+         default:
+            VG_(panic)("Unknown CC type in fprint_BBCC()\n");
+            break;
+      }
+      distinct_instrs++;
+      
+      /* If the function name for this instruction doesn't match that of the
+       * first instruction in the BB, print out a warning. */
+      if (VG_(clo_trace_symtab) && 0 != VG_(strcmp)(fn_buf, first_instr_fn)) {
+         VG_(printf)("Mismatched function names\n");
+         VG_(printf)("  filenames: BB:%s, instr:%s;  "
+                     "fn_names:  BB:%s, instr:%s;  "
+                     "line: %d\n", 
+                     first_instr_fl, fl_buf, 
+                     first_instr_fn, fn_buf, 
+                     line_num);
+      }
+
+      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   }
+   /* If we switched filenames in the middle of the BB without switching back,
+    * switch back now because the subsequent BB may be relying on falling under
+    * the original file name. */
+   if (0 != VG_(strcmp)(first_instr_fl, curr_file)) {
+      VG_(sprintf)(fbuf, "fe=%s\n", first_instr_fl);
+      VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
+   }
+
+   /* The walk must have consumed the array exactly. */
+   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
+}
+
+/* Write the whole output file (OUT_FILE):  the three cache "desc:" lines,
+ * the "cmd:" line, the "events:" line, every BBCC in the table grouped under
+ * "fl=" / "fn=" lines, and finally the "summary:" line.  The totals printed
+ * in the summary are accumulated by fprint_BBCC() during the traversal.
+ *
+ *   client_argc, client_argv   the client's command line, echoed on "cmd:".
+ *
+ * Each argument is written straight to the file instead of being formatted
+ * into the fixed-size buf first, so an argument longer than BUF_LEN can no
+ * longer overrun the buffer.
+ *
+ * NOTE(review): VGP_PUSHCC(VgpCacheDump) has no matching VGP_POPCC in this
+ * function -- confirm whether the caller is expected to pop it.
+ */
+static void fprint_BBCC_table_and_calc_totals(Int client_argc, 
+                                              Char** client_argv)
+{
+   Int        fd;
+   Char       buf[BUF_LEN];
+   file_node *curr_file_node;
+   fn_node   *curr_fn_node;
+   BBCC      *curr_BBCC;
+   Int        i,j,k;
+
+   VGP_PUSHCC(VgpCacheDump);
+   fd = VG_(open_write)(OUT_FILE);
+   if (-1 == fd) { file_err(); }
+
+   /* "desc:" lines (giving I1/D1/L2 cache configuration) */
+   VG_(write)(fd, (void*)I1_desc_line, VG_(strlen)(I1_desc_line));
+   VG_(write)(fd, (void*)D1_desc_line, VG_(strlen)(D1_desc_line));
+   VG_(write)(fd, (void*)L2_desc_line, VG_(strlen)(L2_desc_line));
+
+   /* "cmd:" line.  Arguments are of unbounded length, so they bypass buf. */
+   VG_(write)(fd, (void*)"cmd:", 4);
+   for (i = 0; i < client_argc; i++) {
+       VG_(write)(fd, (void*)" ", 1);
+       VG_(write)(fd, (void*)client_argv[i], VG_(strlen)(client_argv[i]));
+   }
+   /* "events:" line */
+   VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw\n");
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+   /* Six loops here:  three for the hash table arrays, and three for the
+    * chains hanging off the hash table arrays. */
+   for (i = 0; i < N_FILE_ENTRIES; i++) {
+      curr_file_node = BBCC_table[i];
+      while (curr_file_node != NULL) {
+         VG_(sprintf)(buf, "fl=%s\n", curr_file_node->filename);
+         VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+         for (j = 0; j < N_FN_ENTRIES; j++) {
+            curr_fn_node = curr_file_node->fns[j];
+            while (curr_fn_node != NULL) {
+               VG_(sprintf)(buf, "fn=%s\n", curr_fn_node->fn_name);
+               VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+               for (k = 0; k < N_BBCC_ENTRIES; k++) {
+                  curr_BBCC = curr_fn_node->BBCCs[k];
+                  while (curr_BBCC != NULL) {
+                     fprint_BBCC(fd, curr_BBCC, 
+                                 curr_file_node->filename,
+                                 curr_fn_node->fn_name);
+
+                     curr_BBCC = curr_BBCC->next;
+                  }
+               }
+               curr_fn_node = curr_fn_node->next;
+            }
+         }
+         curr_file_node = curr_file_node->next;
+      }
+   }
+
+   /* Summary stats must come after rest of table, since we calculate them
+    * during traversal.  */ 
+   VG_(sprintf)(buf, "summary: "
+                     "%llu %llu %llu "
+                     "%llu %llu %llu "
+                     "%llu %llu %llu\n", 
+                     Ir_total.a, Ir_total.m1, Ir_total.m2,
+                     Dr_total.a, Dr_total.m1, Dr_total.m2,
+                     Dw_total.a, Dw_total.m1, Dw_total.m2);
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   VG_(close)(fd);
+}
+
+/* Formats n in decimal with a comma between each group of three digits,
+ * right-justified in a field field_width characters wide, and leaves the
+ * NUL-terminated result in buf.
+ *
+ * If the commified number is wider than field_width, no padding is added
+ * and the number is not truncated.
+ *
+ * Returns the length of the commified number itself, excluding any padding.
+ * The highest index written is the larger of that length and field_width,
+ * so buf must have room for that many characters plus the terminator. */
+Int commify(ULong n, int field_width, char buf[COMMIFY_BUF_LEN])
+{
+   int len, n_commas, i, j, new_len, space;
+
+   /* n is a ULong, so use the %llu conversion. */
+   VG_(sprintf)(buf, "%llu", n);
+   len = VG_(strlen)(buf);
+   n_commas = (len - 1) / 3;
+   new_len = len + n_commas;
+   space = field_width - new_len;
+
+   /* Allow for printing a number in a field_width smaller than its size */
+   if (space < 0) space = 0;    
+
+   /* Shift the digits right, in place, working from the end of the string
+    * so that nothing is overwritten before it has been moved.
+    * Make j = -1 because we copy the '\0' before doing the numbers in groups
+    * of three. */
+   for (j = -1, i = len ; i >= 0; i--) {
+      buf[i + n_commas + space] = buf[i];
+
+      /* Never put a comma in front of the leading digit.  Without the i > 0
+       * test, a number whose digit count is a multiple of three would have
+       * a ',' stored at buf[space - 1], i.e. before the start of buf
+       * whenever space is 0. */
+      if (i > 0 && 3 == ++j) {
+         j = 0;
+         n_commas--;
+         buf[i + n_commas + space] = ',';
+      }
+   }
+   /* Right justify in field. */
+   for (i = 0; i < space; i++)  buf[i] = ' ';
+   return new_len;
+}
+
+/* Formats the fixed-point value n / pow as a percentage string
+ * "<int>.<frac>%", right-justified in a field field_width characters wide,
+ * and leaves the NUL-terminated result in buf.
+ *
+ * pow is the scale of n: pow == 10 gives one decimal place, pow == 100 gives
+ * two, and so on.  The fraction is zero-padded to that many places, so
+ * n == 105 with pow == 100 yields "1.05%" rather than "1.5%".  A pow below
+ * 10 still produces one fractional digit.  pow must be non-zero.
+ *
+ * If the text is wider than field_width, no padding is added and the text
+ * is not truncated; buf must be large enough for whichever is longer. */
+void percentify(Int n, Int pow, Int field_width, char buf[]) 
+{
+   int i, len, space, frac, scale;
+    
+   /* Integer part and the decimal point. */
+   VG_(sprintf)(buf, "%d.", n / pow);
+   len = VG_(strlen)(buf);
+
+   /* Fractional part, most significant digit first, always emitting at
+    * least one digit. */
+   frac = n % pow;
+   if (frac < 0) frac = -frac;
+   scale = pow / 10;
+   if (scale < 1) scale = 1;
+   for (     ; scale >= 1; scale /= 10)
+      buf[len++] = '0' + (frac / scale) % 10;
+
+   buf[len++] = '%';
+   buf[len]   = '\0';
+
+   /* A field narrower than the text must not shift the text leftwards past
+    * the start of buf. */
+   space = field_width - len;
+   if (space < 0) space = 0;
+
+   /* Right justify in field; copy from the terminator backwards so the
+    * overlapping move is safe. */
+   for (i = len; i >= 0;    i--)  buf[i + space] = buf[i];
+   for (i = 0;   i < space; i++)  buf[i] = ' ';
+}
+
+/* Dumps the cost-centre table to file via
+ * fprint_BBCC_table_and_calc_totals(), then prints the cache summary as
+ * user messages: I-cache refs/misses/miss rates, D-cache refs/misses/miss
+ * rates (total, reads, writes) and combined L2 misses/miss rate.  When
+ * VG_(clo_verbosity) > 1, hash-table and debug-info statistics are printed
+ * as debug messages too.
+ *
+ * client_argc/client_argv are passed through unchanged to the table dump.
+ *
+ * Every rate is computed against a denominator clamped to at least 1, so a
+ * run with (for example) no data writes or no BB lookups prints 0 instead
+ * of dividing by zero.
+ *
+ * The VGP_POPCC at the end pairs with the VGP_PUSHCC(VgpCacheDump) done
+ * inside fprint_BBCC_table_and_calc_totals(). */
+void VG_(show_cachesim_results)(Int client_argc, Char** client_argv)
+{
+   CC D_total;
+   ULong L2_total_m, L2_total_mr, L2_total_mw; 
+   ULong Ir_den, D_den, Dr_den, Dw_den, all_den, rd_den;
+   char buf1[RESULTS_BUF], 
+        buf2[RESULTS_BUF], 
+        buf3[RESULTS_BUF];
+   Int l1, l2, l3;
+   Int p;
+
+   fprint_BBCC_table_and_calc_totals(client_argc, client_argv);
+
+   /* I cache results.  Use the I_refs value to determine the first column
+    * width. */
+   l1 = commify(Ir_total.a, 0, buf1);
+   VG_(message)(Vg_UserMsg, "I   refs:      %s", buf1);
+
+   commify(Ir_total.m1, l1, buf1);
+   VG_(message)(Vg_UserMsg, "I1  misses:    %s", buf1);
+
+   commify(Ir_total.m2, l1, buf1);
+   VG_(message)(Vg_UserMsg, "L2  misses:    %s", buf1);
+
+   /* I-cache rates are shown with two decimal places. */
+   p = 100;
+   Ir_den = (0 == Ir_total.a) ? 1 : Ir_total.a;
+
+   percentify(Ir_total.m1 * 100 * p / Ir_den, p, l1+1, buf1);
+   VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);
+                
+   percentify(Ir_total.m2 * 100 * p / Ir_den, p, l1+1, buf1);
+   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
+   VG_(message)(Vg_UserMsg, "");
+
+   /* D cache results.  Use the D_refs.rd and D_refs.wr values to determine the
+    * width of columns 2 & 3. */
+   D_total.a  = Dr_total.a  + Dw_total.a;
+   D_total.m1 = Dr_total.m1 + Dw_total.m1;
+   D_total.m2 = Dr_total.m2 + Dw_total.m2;
+
+   D_den  = (0 == D_total.a)  ? 1 : D_total.a;
+   Dr_den = (0 == Dr_total.a) ? 1 : Dr_total.a;
+   Dw_den = (0 == Dw_total.a) ? 1 : Dw_total.a;
+       
+   commify(D_total.a, 0, buf1);
+   l2 = commify(Dr_total.a, 0, buf2);
+   l3 = commify(Dw_total.a, 0, buf3);
+   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)",
+                buf1,  buf2,  buf3);
+
+   commify( D_total.m1, l1, buf1);
+   commify(Dr_total.m1, l2, buf2);
+   commify(Dw_total.m1, l3, buf3);
+   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   commify( D_total.m2, l1, buf1);
+   commify(Dr_total.m2, l2, buf2);
+   commify(Dw_total.m2, l3, buf3);
+   VG_(message)(Vg_UserMsg, "L2  misses:    %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   /* D-cache and overall L2 rates are shown with one decimal place. */
+   p = 10;
+   
+   percentify( D_total.m1 * 100 * p / D_den,  p, l1+1, buf1);
+   percentify(Dr_total.m1 * 100 * p / Dr_den, p, l2+1, buf2);
+   percentify(Dw_total.m1 * 100 * p / Dw_den, p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
+
+   percentify( D_total.m2 * 100 * p / D_den,  p, l1+1, buf1);
+   percentify(Dr_total.m2 * 100 * p / Dr_den, p, l2+1, buf2);
+   percentify(Dw_total.m2 * 100 * p / Dw_den, p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
+   VG_(message)(Vg_UserMsg, "");
+
+   /* L2 overall results.  Instruction fetches count as reads. */
+   L2_total_m  = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
+   L2_total_mr = Dr_total.m2 + Ir_total.m2;
+   L2_total_mw = Dw_total.m2;
+
+   all_den = Ir_total.a + D_total.a;
+   if (0 == all_den) all_den = 1;
+   rd_den  = Ir_total.a + Dr_total.a;
+   if (0 == rd_den)  rd_den = 1;
+
+   commify(L2_total_m,  l1, buf1);
+   commify(L2_total_mr, l2, buf2);
+   commify(L2_total_mw, l3, buf3);
+   VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   percentify(L2_total_m  * 100 * p / all_den, p, l1+1, buf1);
+   percentify(L2_total_mr * 100 * p / rd_den,  p, l2+1, buf2);
+   percentify(L2_total_mw * 100 * p / Dw_den,  p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )", buf1, buf2,buf3);
+            
+
+   /* Hash table stats */
+   if (VG_(clo_verbosity) > 1) {
+       int BB_lookups = full_debug_BBs      + fn_name_debug_BBs +
+                        file_line_debug_BBs + no_debug_BBs;
+       /* Denominator for the percentages below; never zero. */
+       int BB_den     = (0 == BB_lookups) ? 1 : BB_lookups;
+      
+       VG_(message)(Vg_DebugMsg, "");
+       VG_(message)(Vg_DebugMsg, "Distinct files:   %d", distinct_files);
+       VG_(message)(Vg_DebugMsg, "Distinct fns:     %d", distinct_fns);
+       VG_(message)(Vg_DebugMsg, "BB lookups:       %d", BB_lookups);
+       VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)", 
+                    full_debug_BBs    * 100 / BB_den,
+                    full_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)", 
+                    file_line_debug_BBs * 100 / BB_den,
+                    file_line_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)", 
+                    fn_name_debug_BBs * 100 / BB_den,
+                    fn_name_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)", 
+                    no_debug_BBs      * 100 / BB_den,
+                    no_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations);
+       VG_(message)(Vg_DebugMsg, "Distinct instrs:  %d", distinct_instrs);
+   }
+   VGP_POPCC;
+}
+