Redo the way cachegrind generates instrumentation code, so that it can
deal with any IR that happens to show up.  This makes it work on ppc32
and should fix occasionally-reported bugs on x86/amd64 where it bombs
due to having to deal with multiple date references in a single
instruction.

The new scheme is based around the idea of a queue of memory events
which are outstanding, in the sense that no IR has yet been generated
to do the relevant helper calls.  The presence of the queue --
currently 16 entries deep -- gives cachegrind more scope for combining
multiple memory references into a single helper function call.  As a
result it runs 3%-5% faster than the previous version, on x86.

This commit also changes the type of the tool interface function
'tool_discard_basic_block_info' and clarifies its meaning.  See
comments in include/pub_tool_tooliface.h.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@4903 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index d98c7e3..f36d374 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -51,6 +51,9 @@
 /*--- Constants                                            ---*/
 /*------------------------------------------------------------*/
 
+/* Set to 1 for very verbose debugging */
+#define DEBUG_CG 0
+
 #define MIN_LINE_SIZE         16
 #define FILE_LEN              256
 #define FN_LEN                256
@@ -131,7 +134,6 @@
 struct _instr_info {
    Addr    instr_addr;
    UChar   instr_len;
-   UChar   data_size;
    lineCC* parent;       // parent line-CC
 };
 
@@ -292,8 +294,8 @@
 static VG_REGPARM(1)
 void log_1I_0D_cache_access(instr_info* n)
 {
-   //VG_(printf)("1I_0D : CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
-   //            n, n->instr_addr, n->instr_len);
+   //VG_(printf)("1I_0D :  CCaddr=0x%010lx,  iaddr=0x%010lx,  isize=%lu\n",
+   //             n, n->instr_addr, n->instr_len);
    VGP_PUSHCC(VgpCacheSimulate);
    cachesim_I1_doref(n->instr_addr, n->instr_len, 
                      &n->parent->Ir.m1, &n->parent->Ir.m2);
@@ -302,62 +304,214 @@
 }
 
 static VG_REGPARM(2)
-void log_1I_1Dr_cache_access(instr_info* n, Addr data_addr)
+void log_2I_0D_cache_access(instr_info* n, instr_info* n2)
 {
-   //VG_(printf)("1I_1Dr: CCaddr=%p, iaddr=%p, isize=%u, daddr=%p, dsize=%u\n",
-   //            n, n->instr_addr, n->instr_len, data_addr, n->data_size);
+   //VG_(printf)("2I_0D : CC1addr=0x%010lx, i1addr=0x%010lx, i1size=%lu\n"
+   //            "        CC2addr=0x%010lx, i2addr=0x%010lx, i2size=%lu\n",
+   //            n,  n->instr_addr,  n->instr_len,
+   //            n2, n2->instr_addr, n2->instr_len);
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(n->instr_addr, n->instr_len, 
+                     &n->parent->Ir.m1, &n->parent->Ir.m2);
+   n->parent->Ir.a++;
+   cachesim_I1_doref(n2->instr_addr, n2->instr_len, 
+                     &n2->parent->Ir.m1, &n2->parent->Ir.m2);
+   n2->parent->Ir.a++;
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+static VG_REGPARM(3)
+void log_3I_0D_cache_access(instr_info* n, instr_info* n2, instr_info* n3)
+{
+   //VG_(printf)("3I_0D : CC1addr=0x%010lx, i1addr=0x%010lx, i1size=%lu\n"
+   //            "        CC2addr=0x%010lx, i2addr=0x%010lx, i2size=%lu\n"
+   //            "        CC3addr=0x%010lx, i3addr=0x%010lx, i3size=%lu\n",
+   //            n,  n->instr_addr,  n->instr_len,
+   //            n2, n2->instr_addr, n2->instr_len,
+   //            n3, n3->instr_addr, n3->instr_len);
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(n->instr_addr, n->instr_len, 
+                     &n->parent->Ir.m1, &n->parent->Ir.m2);
+   n->parent->Ir.a++;
+   cachesim_I1_doref(n2->instr_addr, n2->instr_len, 
+                     &n2->parent->Ir.m1, &n2->parent->Ir.m2);
+   n2->parent->Ir.a++;
+   cachesim_I1_doref(n3->instr_addr, n3->instr_len, 
+                     &n3->parent->Ir.m1, &n3->parent->Ir.m2);
+   n3->parent->Ir.a++;
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+static VG_REGPARM(3)
+void log_1I_1Dr_cache_access(instr_info* n, Addr data_addr, Word data_size)
+{
+   //VG_(printf)("1I_1Dr:  CCaddr=0x%010lx,  iaddr=0x%010lx,  isize=%lu\n"
+   //            "                               daddr=0x%010lx,  dsize=%lu\n",
+   //            n, n->instr_addr, n->instr_len, data_addr, data_size);
    VGP_PUSHCC(VgpCacheSimulate);
    cachesim_I1_doref(n->instr_addr, n->instr_len, 
                      &n->parent->Ir.m1, &n->parent->Ir.m2);
    n->parent->Ir.a++;
 
-   cachesim_D1_doref(data_addr, n->data_size, 
+   cachesim_D1_doref(data_addr, data_size, 
                      &n->parent->Dr.m1, &n->parent->Dr.m2);
    n->parent->Dr.a++;
    VGP_POPCC(VgpCacheSimulate);
 }
 
-static VG_REGPARM(2)
-void log_1I_1Dw_cache_access(instr_info* n, Addr data_addr)
+static VG_REGPARM(3)
+void log_1I_1Dw_cache_access(instr_info* n, Addr data_addr, Word data_size)
 {
-   //VG_(printf)("1I_1Dw: CCaddr=%p, iaddr=%p, isize=%u, daddr=%p, dsize=%u\n",
-   //            n, n->instr_addr, n->instr_len, data_addr, n->data_size);
+   //VG_(printf)("1I_1Dw:  CCaddr=0x%010lx,  iaddr=0x%010lx,  isize=%lu\n"
+   //            "                               daddr=0x%010lx,  dsize=%lu\n",
+   //            n, n->instr_addr, n->instr_len, data_addr, data_size);
    VGP_PUSHCC(VgpCacheSimulate);
    cachesim_I1_doref(n->instr_addr, n->instr_len, 
                      &n->parent->Ir.m1, &n->parent->Ir.m2);
    n->parent->Ir.a++;
 
-   cachesim_D1_doref(data_addr, n->data_size, 
+   cachesim_D1_doref(data_addr, data_size, 
                      &n->parent->Dw.m1, &n->parent->Dw.m2);
    n->parent->Dw.a++;
    VGP_POPCC(VgpCacheSimulate);
 }
 
 static VG_REGPARM(3)
-void log_1I_2D_cache_access(instr_info* n, Addr data_addr1, Addr data_addr2)
+void log_0I_1Dr_cache_access(instr_info* n, Addr data_addr, Word data_size)
 {
-   //VG_(printf)("1I_2D: CCaddr=%p, iaddr=%p, isize=%u, daddr1=%p, daddr2=%p, dsize=%u\n",
-   //            n, n->instr_addr, n->instr_len, data_addr1, data_addr2, n->data_size);
+   //VG_(printf)("0I_1Dr:  CCaddr=0x%010lx,  daddr=0x%010lx,  dsize=%lu\n",
+   //            n, data_addr, data_size);
    VGP_PUSHCC(VgpCacheSimulate);
-   cachesim_I1_doref(n->instr_addr, n->instr_len,
-                     &n->parent->Ir.m1,  &n->parent->Ir.m2);
-   n->parent->Ir.a++;
-
-   cachesim_D1_doref(data_addr1, n->data_size,
+   cachesim_D1_doref(data_addr, data_size, 
                      &n->parent->Dr.m1, &n->parent->Dr.m2);
    n->parent->Dr.a++;
-   cachesim_D1_doref(data_addr2, n->data_size,
+   VGP_POPCC(VgpCacheSimulate);
+}
+
+static VG_REGPARM(3)
+void log_0I_1Dw_cache_access(instr_info* n, Addr data_addr, Word data_size)
+{
+   //VG_(printf)("0I_1Dw:  CCaddr=0x%010lx,  daddr=0x%010lx,  dsize=%lu\n",
+   //            n, data_addr, data_size);
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_D1_doref(data_addr, data_size, 
                      &n->parent->Dw.m1, &n->parent->Dw.m2);
    n->parent->Dw.a++;
    VGP_POPCC(VgpCacheSimulate);
 }
 
 /*------------------------------------------------------------*/
-/*--- Instrumentation                                      ---*/
+/*--- Instrumentation types and structures                 ---*/
+/*------------------------------------------------------------*/
+
+/* Maintain an ordered list of memory events which are outstanding, in
+   the sense that no IR has yet been generated to do the relevant
+   helper calls.  The BB is scanned top to bottom and memory events
+   are added to the end of the list, merging with the most recent
+   notified event where possible (Dw immediately following Dr and
+   having the same size and EA can be merged).
+
+   This merging is done so that for architectures which have
+   load-op-store instructions (x86, amd64), the insn is treated as if
+   it makes just one memory reference (a modify), rather than two (a
+   read followed by a write at the same address).
+
+   At various points the list will need to be flushed, that is, IR
+   generated from it.  That must happen before any possible exit from
+   the block (the end, or an IRStmt_Exit).  Flushing also takes place
+   when there is no space to add a new event.
+
+   If we require the simulation statistics to be up to date with
+   respect to possible memory exceptions, then the list would have to
+   be flushed before each memory reference.  That would however lose
+   performance by inhibiting event-merging during flushing.
+
+   Flushing the list consists of walking it start to end and emitting
+   instrumentation IR for each event, in the order in which they
+   appear.  It may be possible to emit a single call for two adjacent
+   events in order to reduce the number of helper function calls made.
+   For example, it could well be profitable to handle two adjacent Ir
+   events with a single helper call.  */
+
+typedef
+   IRExpr 
+   IRAtom;
+
+typedef 
+   enum { Event_Ir=0, Event_Dr=1, Event_Dw=2, Event_Dm=3 }
+   EventKind;
+
+typedef
+   struct {
+      EventKind ekind;
+      Int       size;   /* ALL */
+      Addr64    iaddr;  /* ALL.  For Dr/Dw/Dm is & of parent insn. */
+      IRAtom*   dataEA; /* Dr/Dw/Dm only */ /* IR ATOM ONLY */
+   }
+   Event;
+
+/* Up to this many unnotified events are allowed.  Number is
+   arbitrary.  Larger numbers allow more event merging to occur, but
+   potentially induce more spilling due to extending live ranges of
+   address temporaries. */
+#define N_EVENTS 16
+
+
+/* A struct which holds all the running state during instrumentation.
+   Mostly to avoid passing loads of parameters everywhere. */
+typedef
+   struct {
+      /* The current outstanding-memory-event list. */
+      Event events[N_EVENTS];
+      Int   events_used;
+
+      /* The array of instr_info bins for the BB. */
+      BB_info* bbInfo;
+
+      /* Number instr_info bins 'used' so far. */
+      Int bbInfo_i;
+
+      /* Not sure what this is for (jrs 20051009) */
+      Bool bbSeenBefore;
+
+      /* The output BB being constructed. */
+      IRBB* bbOut;
+   }
+   CgState;
+
+
+static Int index3 ( EventKind k1, EventKind k2, EventKind k3 )
+{
+  Int i1 = k1;
+  Int i2 = k2;
+  Int i3 = k3;
+  Int  r;
+  tl_assert(i1 >= 0 && i1 < 4);
+  tl_assert(i2 >= 0 && i2 < 4);
+  tl_assert(i3 >= 0 && i3 < 4);
+  r = 16*i1 + 4*i2 + i3;
+  tl_assert(r >= 0 && r < 64);
+  return r;
+}
+
+static void show3 ( Int idx )
+{
+  HChar* names = "IRWM";
+  Int i1 = (idx >> 4) & 3;
+  Int i2 = (idx >> 2) & 3;
+  Int i3 = idx & 3;
+  VG_(printf)("%c%c%c", names[i1], names[i2], names[i3]);
+}
+
+static Int trigrams[64];
+
+
+/*------------------------------------------------------------*/
+/*--- Instrumentation main                                 ---*/
 /*------------------------------------------------------------*/
 
 static
-BB_info* get_BB_info(IRBB* bbIn, Addr origAddr, Bool* bbSeenBefore)
+BB_info* get_BB_info(IRBB* bbIn, Addr origAddr, /*OUT*/Bool* bbSeenBefore)
 {
    Int      i, n_instrs;
    IRStmt*  st;
@@ -389,143 +543,14 @@
    return bbInfo;
 }
 
-static 
-Bool handleOneStatement(IRTypeEnv* tyenv, IRBB* bbOut, IRStmt* st, IRStmt* st2,
-                        Addr* instrAddr, UInt* instrLen,
-                        IRExpr** loadAddrExpr, IRExpr** storeAddrExpr,
-                        UInt* dataSize)
-{
-   tl_assert(isFlatIRStmt(st));
-
-   switch (st->tag) {
-   case Ist_NoOp:
-   case Ist_AbiHint:
-   case Ist_Put:
-   case Ist_PutI:
-   case Ist_MFence:
-      break;
-
-   case Ist_Exit: {
-      // This is a conditional jump.  Most of the time, we want to add the
-      // instrumentation before it, to ensure it gets executed.  Eg, (1) if
-      // this conditional jump is just before an IMark:
-      //
-      //   t108 = Not1(t107)
-      //   [add instrumentation here]
-      //   if (t108) goto {Boring} 0x3A96637D:I32
-      //   ------ IMark(0x3A966370, 7) ------
-      //
-      // or (2) if this conditional jump is the last thing before the
-      // block-ending unconditional jump:
-      //
-      //   t111 = Not1(t110)
-      //   [add instrumentation here]
-      //   if (t111) goto {Boring} 0x3A96637D:I32
-      //   goto {Boring} 0x3A966370:I32
-      //
-      // One case (3) where we want the instrumentation after the conditional
-      // jump is when the conditional jump is for an x86 REP instruction:
-      //
-      //   ------ IMark(0x3A967F13, 2) ------
-      //   t1 = GET:I32(4)
-      //   t6 = CmpEQ32(t1,0x0:I32) 
-      //   if (t6) goto {Boring} 0x3A967F15:I32    # ignore this cond jmp
-      //   t7 = Sub32(t1,0x1:I32)
-      //   PUT(4) = t7
-      //   ...
-      //   t56 = Not1(t55)
-      //   [add instrumentation here]
-      //   if (t56) goto {Boring} 0x3A967F15:I32
-      //
-      // Therefore, we return true if the next statement is an IMark, or if
-      // there is no next statement (which matches case (2), as the final
-      // unconditional jump is not represented in the IRStmt list).
-      //
-      // Note that this approach won't do in the long run for supporting
-      // PPC, but it's good enough for x86/AMD64 for the 3.0.X series.
-      if (NULL == st2 || Ist_IMark == st2->tag)
-         return True;
-      else
-         return False;
-   }
-
-   case Ist_IMark:
-      /* st->Ist.IMark.addr is a 64-bit int.  ULong_to_Ptr casts this
-         to the host's native pointer type; if that is 32 bits then it
-         discards the upper 32 bits.  If we are cachegrinding on a
-         32-bit host then we are also ensured that the guest word size
-         is 32 bits, due to the assertion in cg_instrument that the
-         host and guest word sizes must be the same.  Hence
-         st->Ist.IMark.addr will have been derived from a 32-bit guest
-         code address and truncation of it is safe.  I believe this
-         assignment should be correct for both 32- and 64-bit
-         machines. */
-      *instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
-      *instrLen =        st->Ist.IMark.len;
-      break;
-
-   case Ist_Tmp: {
-      IRExpr* data = st->Ist.Tmp.data;
-      if (data->tag == Iex_Load) {
-         IRExpr* aexpr = data->Iex.Load.addr;
-         tl_assert( isIRAtom(aexpr) );
-         // Note also, endianness info is ignored.  I guess that's not
-         // interesting.
-         // XXX: repe cmpsb does two loads... the first one is ignored here!
-         //tl_assert( NULL == *loadAddrExpr );          // XXX: ???
-         *loadAddrExpr = aexpr;
-         *dataSize = sizeofIRType(data->Iex.Load.ty);
-      }
-      break;
-   }
-      
-   case Ist_Store: {
-      IRExpr* data  = st->Ist.Store.data;
-      IRExpr* aexpr = st->Ist.Store.addr;
-      tl_assert( isIRAtom(aexpr) );
-      tl_assert( NULL == *storeAddrExpr );          // XXX: ???
-      *storeAddrExpr = aexpr;
-      *dataSize = sizeofIRType(typeOfIRExpr(tyenv, data));
-      break;
-   }
-   
-   case Ist_Dirty: {
-      IRDirty* d = st->Ist.Dirty.details;
-      if (d->mFx != Ifx_None) {
-         /* This dirty helper accesses memory.  Collect the
-            details. */
-         tl_assert(d->mAddr != NULL);
-         tl_assert(d->mSize != 0);
-         *dataSize = d->mSize;
-         if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
-            *loadAddrExpr = d->mAddr;
-         if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
-            *storeAddrExpr = d->mAddr;
-      } else {
-         tl_assert(d->mAddr == NULL);
-         tl_assert(d->mSize == 0);
-      }
-      break;
-   }
-
-   default:
-      VG_(printf)("\n");
-      ppIRStmt(st);
-      VG_(printf)("\n");
-      VG_(tool_panic)("Cachegrind: unhandled IRStmt");
-   }
-
-   return False;
-}
 
 static
-void do_details( instr_info* n, Bool bbSeenBefore,
-                 Addr instr_addr, Int instr_len, Int data_size )
+void init_instr_info( instr_info* n, Bool bbSeenBefore,
+                      Addr instr_addr, Int instr_len )
 {
    if (bbSeenBefore) {
       tl_assert( n->instr_addr == instr_addr );
       tl_assert( n->instr_len  == instr_len );
-      tl_assert( n->data_size  == data_size );
       // Don't check that (n->parent == parent)... it's conceivable that
       // the debug info might change;  the other asserts should be enough to
       // detect anything strange.
@@ -533,126 +558,318 @@
       lineCC* parent = get_lineCC(instr_addr);
       n->instr_addr = instr_addr;
       n->instr_len  = instr_len;
-      n->data_size  = data_size;
       n->parent     = parent;
    }
 }
 
-static Bool loadStoreAddrsMatch(IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+static void showEvent ( Event* ev )
 {
-  // I'm assuming that for 'modify' instructions, that Vex always makes
-  // the loadAddrExpr and storeAddrExpr be of the same type, ie. both Tmp
-  // expressions, or both Const expressions.
-  tl_assert(isIRAtom(loadAddrExpr));
-  tl_assert(isIRAtom(storeAddrExpr));
-  return eqIRAtom(loadAddrExpr, storeAddrExpr);
+   switch (ev->ekind) {
+      case Event_Ir: 
+         VG_(printf)("Ir %d 0x%llx\n", ev->size, ev->iaddr);
+         break;
+      case Event_Dr:
+         VG_(printf)("Dr %d 0x%llx EA=", ev->size, ev->iaddr);
+         ppIRExpr(ev->dataEA); 
+         VG_(printf)("\n");
+         break;
+      case Event_Dw:
+         VG_(printf)("Dw %d 0x%llx EA=", ev->size, ev->iaddr);
+         ppIRExpr(ev->dataEA); 
+         VG_(printf)("\n");
+         break;
+      case Event_Dm:
+         VG_(printf)("Dm %d 0x%llx EA=", ev->size, ev->iaddr);
+         ppIRExpr(ev->dataEA); 
+         VG_(printf)("\n");
+         break;
+      default: 
+         tl_assert(0);
+         break;
+   }
 }
 
-// Instrumentation for the end of each original instruction.
-static
-void instrumentInstr(IRBB* bbOut, instr_info* i_node, Bool bbSeenBefore,
-                     UInt instrAddr, UInt instrLen, UInt dataSize,
-                     IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+/* Reserve instr_info for the first mention of a new insn. */
+
+static instr_info* reserve_instr_info ( CgState* cgs )
 {
-   IRDirty* di;
-   IRExpr  *arg1, *arg2, *arg3, **argv;
-   Int      argc;
-   Char*    helperName;
-   void*    helperAddr;
-   IRType   wordTy;
+   instr_info* i_node;
+   tl_assert(cgs->bbInfo_i >= 0);
+   tl_assert(cgs->bbInfo_i < cgs->bbInfo->n_instrs);
+   i_node = &cgs->bbInfo->instrs[ cgs->bbInfo_i ];
+   cgs->bbInfo_i++;
+   return i_node;
+}
 
-   // Stay sane ...
-   tl_assert(sizeof(HWord) == sizeof(void*));
-   if (sizeof(HWord) == 4) {
-      wordTy = Ity_I32;
-   } else
-   if (sizeof(HWord) == 8) {
-      wordTy = Ity_I64;
-   } else {
-      VG_(tool_panic)("instrumentInstr: strange word size");
-   }
 
-   if (loadAddrExpr) 
-      tl_assert(wordTy == typeOfIRExpr(bbOut->tyenv, loadAddrExpr));
-   if (storeAddrExpr) 
-      tl_assert(wordTy == typeOfIRExpr(bbOut->tyenv, storeAddrExpr));
+/* Find the most recently allocated instr_info. */
 
-   // Large (eg. 28B, 108B, 512B on x86) data-sized instructions will be
-   // done inaccurately, but they're very rare and this avoids errors from
-   // hitting more than two cache lines in the simulation.
-   if (dataSize > MIN_LINE_SIZE) dataSize = MIN_LINE_SIZE;
+static instr_info* find_most_recent_instr_info ( CgState* cgs )
+{
+   tl_assert(cgs->bbInfo_i >= 0);
+   tl_assert(cgs->bbInfo_i <= cgs->bbInfo->n_instrs);
+   if (cgs->bbInfo_i == 0)
+      return NULL;
+   else
+      return &cgs->bbInfo->instrs[ cgs->bbInfo_i - 1 ];
+}
 
-   // Setup 1st arg: instr_info node's address
-   // Believed to be 64-bit clean
-   do_details(i_node, bbSeenBefore, instrAddr, instrLen, dataSize );
-   arg1 = mkIRExpr_HWord( (HWord)i_node );
 
-   if (!loadAddrExpr && !storeAddrExpr) {
-      // no load/store
-      tl_assert(0 == dataSize);
-      helperName = "log_1I_0D_cache_access";
-      helperAddr = &log_1I_0D_cache_access;
-      argc = 1;
-      argv = mkIRExprVec_1(arg1);
+/* Generate code for all outstanding memory events, and mark the queue
+   empty.  Code is generated into cgs->bbOut, and this activity
+   'consumes' slots in cgs->bbInfo. */
 
-   } else if (loadAddrExpr && !storeAddrExpr) {
-      // load
-      tl_assert( isIRAtom(loadAddrExpr) );
-      helperName = "log_1I_1Dr_cache_access";
-      helperAddr = &log_1I_1Dr_cache_access;
-      argc = 2;
-      arg2 = loadAddrExpr;
-      argv = mkIRExprVec_2(arg1, arg2);
+static void flushEvents ( CgState* cgs )
+{
+   Int         i, regparms;
+   Char*       helperName;
+   void*       helperAddr;
+   IRExpr**    argv;
+   IRExpr*     i_node_expr;
+   IRExpr*     i_node2_expr;
+   IRExpr*     i_node3_expr;
+   IRDirty*    di;
+   instr_info* i_node;
+   instr_info* i_node2;
+   instr_info* i_node3;
 
-   } else if (!loadAddrExpr && storeAddrExpr) {
-      // store
-      tl_assert( isIRAtom(storeAddrExpr) );
-      helperName = "log_1I_1Dw_cache_access";
-      helperAddr = &log_1I_1Dw_cache_access;
-      argc = 2;
-      arg2 = storeAddrExpr;
-      argv = mkIRExprVec_2(arg1, arg2);
- 
-   } else {
-      tl_assert( loadAddrExpr && storeAddrExpr );
-      tl_assert( isIRAtom(loadAddrExpr) );
-      tl_assert( isIRAtom(storeAddrExpr) );
+   for (i = 0; i < cgs->events_used-2; i++)
+     trigrams [ index3( cgs->events[i].ekind, cgs->events[i+1].ekind,cgs->events[i+2].ekind ) ]++;
 
-      if ( loadStoreAddrsMatch(loadAddrExpr, storeAddrExpr) ) {
-         // modify
-         helperName = "log_1I_1Dr_cache_access";
-         helperAddr = &log_1I_1Dr_cache_access;
-         argc = 2;
-         arg2 = loadAddrExpr;
-         argv = mkIRExprVec_2(arg1, arg2);
+   i = 0;
+   while (i < cgs->events_used) {
 
-      } else {
-         // load/store
-         helperName = "log_1I_2D_cache_access";
-         helperAddr = &log_1I_2D_cache_access;
-         argc = 3;
-         arg2 = loadAddrExpr;
-         arg3 = storeAddrExpr;
-         argv = mkIRExprVec_3(arg1, arg2, arg3);
+      helperName = NULL;
+      helperAddr = NULL;
+      argv       = NULL;
+      regparms   = 0;
+
+      /* generate IR to notify event i and possibly the ones
+         immediately following it. */
+      tl_assert(i >= 0 && i < cgs->events_used);
+      if (DEBUG_CG) {
+         VG_(printf)("   flush "); 
+         showEvent( &cgs->events[i] );
       }
+
+      /* For any event we find the relevant instr_info.  The following
+         assumes that Event_Ir is the first event to refer to any
+         specific insn, and so a new entry in the cgs->bbInfo->instrs
+         is allocated.  All other events (Dr,Dw,Dm) must refer to the
+         most recently encountered IMark and so we use the
+         most-recently allocated instrs[] entry, which must exist. */
+
+      if (cgs->events[i].ekind == Event_Ir) {
+         /* allocate an instr_info and fill in its addr/size. */
+         i_node = reserve_instr_info( cgs );
+         tl_assert(i_node);
+         init_instr_info( i_node, cgs->bbSeenBefore,
+                          (Addr)cgs->events[i].iaddr, /* i addr */
+                          cgs->events[i].size  /* i size */);
+      } else {
+         /* use the most-recently allocated i_node but don't mess with
+            its internals */
+         i_node = find_most_recent_instr_info( cgs );
+         /* it must actually exist */
+         tl_assert(i_node);
+         /* it must match the declared parent instruction of this
+            event. */
+         tl_assert(i_node->instr_addr == cgs->events[i].iaddr);
+      }
+
+      i_node_expr = mkIRExpr_HWord( (HWord)i_node );
+
+      /* Decide on helper fn to call and args to pass it, and advance
+         i appropriately. */
+      switch (cgs->events[i].ekind) {
+         case Event_Ir:
+            /* Merge with a following Dr/Dm if it is from this insn. */
+            if (i < cgs->events_used-1 
+                && cgs->events[i+1].iaddr == cgs->events[i].iaddr
+                && (cgs->events[i+1].ekind == Event_Dr
+                    || cgs->events[i+1].ekind == Event_Dm)) {
+               helperName = "log_1I_1Dr_cache_access";
+               helperAddr = &log_1I_1Dr_cache_access;
+               argv = mkIRExprVec_3( i_node_expr,
+                                     cgs->events[i+1].dataEA,
+                                     mkIRExpr_HWord( cgs->events[i+1].size ) );
+               regparms = 3;
+               i += 2;
+            }
+            /* Merge with a following Dw if it is from this insn. */
+            else
+            if (i < cgs->events_used-1 
+                && cgs->events[i+1].iaddr == cgs->events[i].iaddr
+                && cgs->events[i+1].ekind == Event_Dw) {
+               helperName = "log_1I_1Dw_cache_access";
+               helperAddr = &log_1I_1Dw_cache_access;
+               argv = mkIRExprVec_3( i_node_expr,
+                                     cgs->events[i+1].dataEA,
+                                     mkIRExpr_HWord( cgs->events[i+1].size ) );
+               regparms = 3;
+               i += 2;
+            }
+            /* Merge with two following Irs if possible. */
+            else
+            if (i < cgs->events_used-2 
+                && cgs->events[i+1].ekind == Event_Ir
+                && cgs->events[i+2].ekind == Event_Ir) {
+               helperName = "log_3I_0D_cache_access";
+               helperAddr = &log_3I_0D_cache_access;
+
+               i_node2 = reserve_instr_info( cgs );
+               tl_assert(i_node2);
+               init_instr_info( i_node2, cgs->bbSeenBefore,
+                                (Addr)cgs->events[i+1].iaddr, /* i addr */
+                                cgs->events[i+1].size  /* i size */);
+               i_node2_expr = mkIRExpr_HWord( (HWord)i_node2 );
+
+               i_node3 = reserve_instr_info( cgs );
+               tl_assert(i_node3);
+               init_instr_info( i_node3, cgs->bbSeenBefore,
+                                (Addr)cgs->events[i+2].iaddr, /* i addr */
+                                cgs->events[i+2].size  /* i size */);
+               i_node3_expr = mkIRExpr_HWord( (HWord)i_node3 );
+
+               argv = mkIRExprVec_3( i_node_expr, i_node2_expr, i_node3_expr );
+               regparms = 3;
+               i += 3;
+            }
+            /* Merge with a following Ir if possible. */
+            else
+            if (i < cgs->events_used-1 
+                && cgs->events[i+1].ekind == Event_Ir) {
+               helperName = "log_2I_0D_cache_access";
+               helperAddr = &log_2I_0D_cache_access;
+               i_node2 = reserve_instr_info( cgs );
+               tl_assert(i_node2);
+               init_instr_info( i_node2, cgs->bbSeenBefore,
+                                (Addr)cgs->events[i+1].iaddr, /* i addr */
+                                cgs->events[i+1].size  /* i size */);
+               i_node2_expr = mkIRExpr_HWord( (HWord)i_node2 );
+               argv = mkIRExprVec_2( i_node_expr, i_node2_expr );
+               regparms = 2;
+               i += 2;
+            }
+            /* No merging possible; emit as-is. */
+            else {
+               helperName = "log_1I_0D_cache_access";
+               helperAddr = &log_1I_0D_cache_access;
+               argv = mkIRExprVec_1( i_node_expr );
+               regparms = 1;
+               i++;
+            }
+            break;
+         case Event_Dr:
+         case Event_Dm:
+            helperName = "log_0I_1Dr_cache_access";
+            helperAddr = &log_0I_1Dr_cache_access;
+            argv = mkIRExprVec_3( i_node_expr, 
+                                  cgs->events[i].dataEA, 
+                                  mkIRExpr_HWord( cgs->events[i].size ) );
+            regparms = 3;
+            i++;
+            break;
+         case Event_Dw:
+            helperName = "log_0I_1Dw_cache_access";
+            helperAddr = &log_0I_1Dw_cache_access;
+            argv = mkIRExprVec_3( i_node_expr,
+                                  cgs->events[i].dataEA, 
+                                  mkIRExpr_HWord( cgs->events[i].size ) );
+            regparms = 3;
+            i++;
+            break;
+         default:
+            tl_assert(0);
+      }
+
+      /* Add the helper. */
+      tl_assert(helperName);
+      tl_assert(helperAddr);
+      tl_assert(argv);
+      di = unsafeIRDirty_0_N( regparms, helperName, helperAddr, argv);
+      addStmtToIRBB( cgs->bbOut, IRStmt_Dirty(di) );
    }
 
-   // Add call to the instrumentation function
-   di = unsafeIRDirty_0_N( argc, helperName, helperAddr, argv);
-   addStmtToIRBB( bbOut, IRStmt_Dirty(di) );
+   cgs->events_used = 0;
 }
 
+
+static void addEvent_Ir ( CgState* cgs, Int size, Addr64 iaddr )
+{
+   Event* evt;
+   tl_assert(size >= 0 && size <= MIN_LINE_SIZE);
+   if (cgs->events_used == N_EVENTS)
+      flushEvents(cgs);
+   tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
+   /* If vex fails to decode an insn, the size will be zero, but that
+      can't really be true -- the cpu couldn't have determined the
+      insn was undecodable without looking at it.  Hence: */
+   if (size == 0)
+      size = 1;
+   evt = &cgs->events[cgs->events_used];
+   evt->ekind  = Event_Ir;
+   evt->size   = size;
+   evt->iaddr  = iaddr;
+   evt->dataEA = NULL; /*paranoia*/
+   cgs->events_used++;
+}
+
+static void addEvent_Dr ( CgState* cgs, Int size, Addr64 iaddr, IRAtom* ea )
+{
+   Event* evt;
+   tl_assert(isIRAtom(ea));
+   tl_assert(size >= 1 && size <= MIN_LINE_SIZE);
+   if (cgs->events_used == N_EVENTS)
+      flushEvents(cgs);
+   tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
+   evt = &cgs->events[cgs->events_used];
+   evt->ekind  = Event_Dr;
+   evt->size   = size;
+   evt->iaddr  = iaddr;
+   evt->dataEA = ea;
+   cgs->events_used++;
+}
+
+static void addEvent_Dw ( CgState* cgs, Int size, Addr64 iaddr, IRAtom* ea )
+{
+   tl_assert(isIRAtom(ea));
+   tl_assert(size >= 1 && size <= MIN_LINE_SIZE);
+
+   /* Is it possible to merge this write into an immediately preceding
+      read? */
+   if (cgs->events_used > 0
+       && cgs->events[cgs->events_used-1].ekind == Event_Dr
+       && cgs->events[cgs->events_used-1].size  == size
+       && cgs->events[cgs->events_used-1].iaddr == iaddr
+       && eqIRAtom(cgs->events[cgs->events_used-1].dataEA, ea)) {
+      cgs->events[cgs->events_used-1].ekind = Event_Dm;
+      return;
+   }
+
+   /* No.  Add as normal. */
+   if (cgs->events_used == N_EVENTS)
+      flushEvents(cgs);
+   tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
+   cgs->events[cgs->events_used].ekind  = Event_Dw;
+   cgs->events[cgs->events_used].size   = size;
+   cgs->events[cgs->events_used].iaddr  = iaddr;
+   cgs->events[cgs->events_used].dataEA = ea;
+   cgs->events_used++;
+}
+
+////////////////////////////////////////////////////////////
+
+
 static IRBB* cg_instrument ( IRBB* bbIn, VexGuestLayout* layout, 
                              IRType gWordTy, IRType hWordTy )
 {
-   Int      i, dataSize = 0, bbInfo_i;
-   IRBB*    bbOut;
-   IRStmt*  st;
-   BB_info* bbInfo;
-   Bool     bbSeenBefore = False, addedInstrumentation, addInstNow;
-   Addr     instrAddr, origAddr;
-   UInt     instrLen;
-   IRExpr  *loadAddrExpr, *storeAddrExpr;
+   Int        i;
+   IRStmt*    st;
+   Addr64     cia; /* address of current insn */
+   CgState    cgs;
+   IRTypeEnv* tyenv = bbIn->tyenv;
+
 
    if (gWordTy != hWordTy) {
       /* We don't currently support this case. */
@@ -660,75 +877,132 @@
    }
 
    /* Set up BB */
-   bbOut           = emptyIRBB();
-   bbOut->tyenv    = dopyIRTypeEnv(bbIn->tyenv);
-   bbOut->next     = dopyIRExpr(bbIn->next);
-   bbOut->jumpkind = bbIn->jumpkind;
+   cgs.bbOut        = emptyIRBB();
+   cgs.bbOut->tyenv = dopyIRTypeEnv(tyenv);
 
-   // Get the first statement, and origAddr from it
+   // Get the first statement, and initial cia from it
    i = 0;
    tl_assert(bbIn->stmts_used > 0);
    st = bbIn->stmts[0];
    tl_assert(Ist_IMark == st->tag);
-   origAddr = (Addr)st->Ist.IMark.addr;
-   tl_assert(origAddr == st->Ist.IMark.addr);  // XXX: check no overflow
+   cia = st->Ist.IMark.addr;
 
-   // Get block info
-   bbInfo = get_BB_info(bbIn, origAddr, &bbSeenBefore);
-   bbInfo_i = 0;
+   // Set up running state and get block info
+   cgs.events_used = 0;
+   cgs.bbInfo      = get_BB_info(bbIn, (Addr)cia, &cgs.bbSeenBefore);
+   cgs.bbInfo_i    = 0;
 
-   do {
-      // We should be at an IMark statement
-      tl_assert(Ist_IMark == st->tag);
+   if (DEBUG_CG)
+      VG_(printf)("\n\n---------- cg_instrument ----------\n");
 
-      // Reset stuff for this original instruction
-      loadAddrExpr = storeAddrExpr = NULL;
-      dataSize = 0;
-      addedInstrumentation = False;
+   // Traverse the block, adding events and flushing as necessary.
+   for (i = 0; i < bbIn->stmts_used; i++) {
 
-      // Process all the statements for this original instruction (ie. until
-      // the next IMark statement, or the end of the block)
-      do {
-         IRStmt* st2 = ( i+1 < bbIn->stmts_used ? bbIn->stmts[i+1] : NULL );
+      st = bbIn->stmts[i];
+      tl_assert(isFlatIRStmt(st));
 
-         addInstNow = handleOneStatement(bbIn->tyenv, bbOut, st, st2,
-                                         &instrAddr, &instrLen, &loadAddrExpr,
-                                         &storeAddrExpr, &dataSize);
-         if (addInstNow) {
-            tl_assert(!addedInstrumentation);
-            addedInstrumentation = True;
-            
-            // Nb: instrLen will be zero if Vex failed to decode it.
-            // Also Client requests can appear to be very large (eg. 18
-            // bytes on x86) because they are really multiple instructions.
-            tl_assert( 0 == instrLen ||
-                       bbIn->jumpkind == Ijk_ClientReq ||
-                       (instrLen >= VG_MIN_INSTR_SZB && 
-                        instrLen <= VG_MAX_INSTR_SZB) );
+      switch (st->tag) {
+         case Ist_NoOp:
+         case Ist_AbiHint:
+         case Ist_Put:
+         case Ist_PutI:
+         case Ist_MFence:
+            break;
 
-            // Add instrumentation before this statement.
-            instrumentInstr(bbOut, &bbInfo->instrs[ bbInfo_i ], bbSeenBefore,
-                      instrAddr, instrLen, dataSize, loadAddrExpr, storeAddrExpr);
+         case Ist_IMark:
+            cia = st->Ist.IMark.addr;
+            addEvent_Ir( &cgs, st->Ist.IMark.len, cia );
+            break;
+
+         case Ist_Tmp: {
+            IRExpr* data = st->Ist.Tmp.data;
+            if (data->tag == Iex_Load) {
+               IRExpr* aexpr = data->Iex.Load.addr;
+               tl_assert( isIRAtom(aexpr) );
+               // Note also, endianness info is ignored.  I guess
+               // that's not interesting.
+               addEvent_Dr( &cgs, sizeofIRType(data->Iex.Load.ty), 
+                                  cia, aexpr );
+            }
+            break;
          }
 
-         addStmtToIRBB( bbOut, st );
+         case Ist_Store: {
+            IRExpr* data  = st->Ist.Store.data;
+            IRExpr* aexpr = st->Ist.Store.addr;
+            tl_assert( isIRAtom(aexpr) );
+            addEvent_Dw( &cgs, 
+                         sizeofIRType(typeOfIRExpr(tyenv, data)), 
+                         cia, aexpr );
+            break;
+         }
 
-         i++;
-         st = st2;
-      } 
-      while (st && Ist_IMark != st->tag);
+         case Ist_Dirty: {
+            Int      dataSize;
+            IRDirty* d = st->Ist.Dirty.details;
+            if (d->mFx != Ifx_None) {
+               /* This dirty helper accesses memory.  Collect the
+                  details. */
+               tl_assert(d->mAddr != NULL);
+               tl_assert(d->mSize != 0);
+               dataSize = d->mSize;
+               // Large (eg. 28B, 108B, 512B on x86) data-sized
+               // instructions will be done inaccurately, but they're
+               // very rare and this avoids errors from hitting more
+               // than two cache lines in the simulation.
+               if (dataSize > MIN_LINE_SIZE)
+                  dataSize = MIN_LINE_SIZE;
+               if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
+                  addEvent_Dr( &cgs, dataSize, cia, d->mAddr );
+               if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
+                  addEvent_Dw( &cgs, dataSize, cia, d->mAddr );
+            } else {
+               tl_assert(d->mAddr == NULL);
+               tl_assert(d->mSize == 0);
+            }
+            break;
+         }
 
-      if (!addedInstrumentation) {
-         // Add instrumentation now, after all the instruction's statements.
-         instrumentInstr(bbOut, &bbInfo->instrs[ bbInfo_i ], bbSeenBefore,
-                         instrAddr, instrLen, dataSize, loadAddrExpr, storeAddrExpr);
+         case Ist_Exit:
+            /* We may never reach the next statement, so need to flush
+               all outstanding transactions now. */
+            flushEvents( &cgs );
+            break;
+
+         default:
+            tl_assert(0);
+            break;
       }
 
-      bbInfo_i++;
-   }
-   while (st);
+      /* Copy the original statement */
+      addStmtToIRBB( cgs.bbOut, st );
 
-   return bbOut;
+      if (DEBUG_CG) {
+         ppIRStmt(st);
+         VG_(printf)("\n");
+      }
+   }
+
+   /* At the end of the bb.  Flush outstandings. */
+   tl_assert(isIRAtom(bbIn->next));
+   flushEvents( &cgs );
+
+   /* copy where-next stuff. */
+   cgs.bbOut->next     = dopyIRExpr(bbIn->next);
+   cgs.bbOut->jumpkind = bbIn->jumpkind;
+
+   /* done.  stay sane ... */
+   tl_assert(cgs.bbInfo_i == cgs.bbInfo->n_instrs);
+
+   if (DEBUG_CG) {
+      VG_(printf)( "goto {");
+      ppIRJumpKind(bbIn->jumpkind);
+      VG_(printf)( "} ");
+      ppIRExpr( bbIn->next );
+      VG_(printf)( "}\n");
+   }
+
+   return cgs.bbOut;
 }
 
 /*------------------------------------------------------------*/
@@ -1077,6 +1351,12 @@
        VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations);
    }
    VGP_POPCC(VgpCacheResults);
+
+   if (0) { Int i;
+   for (i = 0; i < 64; i++) {
+     show3(i); VG_(printf)(" %5d\n", trigrams[i] );
+   }
+   }
 }
 
 /*--------------------------------------------------------------------*/
@@ -1084,15 +1364,20 @@
 /*--------------------------------------------------------------------*/
 
 // Called when a translation is invalidated due to code unloading.
-static void cg_discard_basic_block_info ( Addr a, SizeT size )
+static void cg_discard_basic_block_info ( VexGuestExtents vge )
 {
-   VgHashNode*  bbInfo;
+   VgHashNode* bbInfo;
 
-   if (0) VG_(printf)( "discard_basic_block_info: %p, %llu\n", a, (ULong)size);
+   tl_assert(vge.n_used > 0);
+
+   if (DEBUG_CG)
+      VG_(printf)( "discard_basic_block_info: %p, %llu\n", 
+                   (void*)(Addr)vge.base[0], (ULong)vge.len[0]);
 
    // Get BB info, remove from table, free BB info.  Simple!
-   bbInfo = VG_(HT_remove)(instr_info_table, a);
+   bbInfo = VG_(HT_remove)(instr_info_table, (UWord)vge.base[0]);
    tl_assert(NULL != bbInfo);
+
    VG_(free)(bbInfo);
 }
 
@@ -1192,7 +1477,7 @@
    VG_(details_copyright_author)(
       "Copyright (C) 2002-2005, and GNU GPL'd, by Nicholas Nethercote et al.");
    VG_(details_bug_reports_to)  (VG_BUGS_TO);
-   VG_(details_avg_translation_sizeB) ( 155 );
+   VG_(details_avg_translation_sizeB) ( 245 );
 
    VG_(basic_tool_funcs)          (cg_post_clo_init,
                                    cg_instrument,