Callgrind: fix instrumentation for arbitrary events per guest instruction

(should fix bug 169505)

This uses the same event queue scheme as cachegrind and lackey, and
same kind of helpers (1/2/3 Ir events, Ir+Dr, Dr, Ir+Dw, Dw).
Note that in contrast to Cachegrind, Callgrind interpretes a modify event
as Dw (otherwise the cache model generating write back events would not work).

Callgrind uses per-(guest)instruction event sets for cost counters.
An per-instruction eventset is incrementally extended as events for the
same guest instruction are flushed. Event sets always start with Ir counters,
but depending on Dr/Dw order afterwards, there exist IrDr(Dw) and IrDw(Dr).
Per-instruction event sets now are consistently named according to event ordering.
Event set "sim" is a subset of "full", was never used and was removed.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@10321 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/callgrind/sim.c b/callgrind/sim.c
index 9edbecc..9e53f89 100644
--- a/callgrind/sim.c
+++ b/callgrind/sim.c
@@ -113,22 +113,21 @@
  * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
  */
 
-/* Offset to events in event set, used in log_* functions */
-static Int off_D0_Ir;
-static Int off_D1r_Ir;
-static Int off_D1r_Dr;
-static Int off_D1w_Ir;
-static Int off_D1w_Dw;
-static Int off_D2_Ir;
-static Int off_D2_Dr;
-static Int off_D2_Dw;
+/* Offset to events in event set, used in log_* functions
+ * <off_EventSet_BasicEventSet>: offset where basic set is found
+ */
+static Int off_UIr_Ir;
+static Int off_UIrDr_Ir,   off_UIrDr_Dr;
+static Int off_UIrDrDw_Ir, off_UIrDrDw_Dr, off_UIrDrDw_Dw;
+static Int off_UIrDw_Ir,   off_UIrDw_Dw;
+static Int off_UIrDwDr_Ir, off_UIrDwDr_Dr, off_UIrDwDr_Dw;
 
 static Addr   bb_base;
 static ULong* cost_base;
 static InstrInfo* current_ii;
 
 /* Cache use offsets */
-/* FIXME: The offsets are only correct because all eventsets get
+/* The offsets are only correct because all per-instruction event sets get
  * the "Use" set added first !
  */
 static Int off_I1_AcCost  = 0;
@@ -984,13 +983,13 @@
 void cacheuse_finish(void)
 {
   int i;
-  InstrInfo ii = { 0,0,0,0,0 };
+  InstrInfo ii = { 0,0,0,0 };
 
   if (!CLG_(current_state).collect) return;
 
   bb_base = 0;
   current_ii = &ii;
-  cost_base = 0;  
+  cost_base = 0;
 
   /* update usage counters */
   if (I1.use)
@@ -1043,6 +1042,19 @@
     }
 }
 
+static
+Char* cacheRes(CacheModelResult r)
+{
+    switch(r) {
+    case L1_Hit:    return "L1 Hit ";
+    case L2_Hit:    return "L2 Hit ";
+    case MemAccess: return "L2 Miss";
+    case WriteBackMemAccess: return "L2 Miss (dirty)";
+    default:
+	tl_assert(0);
+    }
+    return "??";
+}
 
 VG_REGPARM(1)
 static void log_1I0D(InstrInfo* ii)
@@ -1052,37 +1064,101 @@
     current_ii = ii;
     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
 
-    CLG_DEBUG(6, "log_1I0D:  Ir=%#lx/%u => Ir %d\n",
-	      bb_base + ii->instr_offset, ii->instr_size, IrRes);
+    CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
+	      bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
 
     if (CLG_(current_state).collect) {
 	ULong* cost_Ir;
-	
+
 	if (CLG_(current_state).nonskipped)
 	    cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
 	else
-	    cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
+	    cost_Ir = cost_base + ii->cost_offset + off_UIr_Ir;
 
 	inc_costs(IrRes, cost_Ir, 
 		  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
     }
 }
 
+VG_REGPARM(2)
+static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
+{
+    CacheModelResult Ir1Res, Ir2Res;
+    ULong *global_cost_Ir;
+
+    current_ii = ii1;
+    Ir1Res = (*simulator.I1_Read)(bb_base + ii1->instr_offset, ii1->instr_size);
+    current_ii = ii2;
+    Ir2Res = (*simulator.I1_Read)(bb_base + ii2->instr_offset, ii2->instr_size);
+
+    CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
+	      bb_base + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
+	      bb_base + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
+
+    if (!CLG_(current_state).collect) return;
+
+    global_cost_Ir = CLG_(current_state).cost + CLG_(sets).off_full_Ir;
+    if (CLG_(current_state).nonskipped) {
+	ULong* skipped_cost_Ir = CLG_(current_state).nonskipped->skipped +
+				 CLG_(sets).off_full_Ir;
+	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
+	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
+	return;
+    }
+
+    inc_costs(Ir1Res, global_cost_Ir, cost_base + ii1->cost_offset + off_UIr_Ir);
+    inc_costs(Ir2Res, global_cost_Ir, cost_base + ii2->cost_offset + off_UIr_Ir);
+}
+
+VG_REGPARM(3)
+static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
+{
+    CacheModelResult Ir1Res, Ir2Res, Ir3Res;
+    ULong *global_cost_Ir;
+
+    current_ii = ii1;
+    Ir1Res = (*simulator.I1_Read)(bb_base + ii1->instr_offset, ii1->instr_size);
+    current_ii = ii2;
+    Ir2Res = (*simulator.I1_Read)(bb_base + ii2->instr_offset, ii2->instr_size);
+    current_ii = ii3;
+    Ir3Res = (*simulator.I1_Read)(bb_base + ii3->instr_offset, ii3->instr_size);
+
+    CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
+	      bb_base + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
+	      bb_base + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
+	      bb_base + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
+
+    if (!CLG_(current_state).collect) return;
+
+    global_cost_Ir = CLG_(current_state).cost + CLG_(sets).off_full_Ir;
+    if (CLG_(current_state).nonskipped) {
+	ULong* skipped_cost_Ir = CLG_(current_state).nonskipped->skipped +
+				 CLG_(sets).off_full_Ir;
+	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
+	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
+	inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
+	return;
+    }
+
+    inc_costs(Ir1Res, global_cost_Ir, cost_base + ii1->cost_offset + off_UIr_Ir);
+    inc_costs(Ir2Res, global_cost_Ir, cost_base + ii2->cost_offset + off_UIr_Ir);
+    inc_costs(Ir3Res, global_cost_Ir, cost_base + ii3->cost_offset + off_UIr_Ir);
+}
 
 /* Instruction doing a read access */
 
-VG_REGPARM(2)
-static void log_1I1Dr(InstrInfo* ii, Addr data)
+VG_REGPARM(3)
+static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
 {
     CacheModelResult IrRes, DrRes;
 
     current_ii = ii;
     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
-    DrRes = (*simulator.D1_Read)(data, ii->data_size);
+    DrRes = (*simulator.D1_Read)(data_addr, data_size);
 
-    CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
-	      bb_base + ii->instr_offset, ii->instr_size,
-	      data, ii->data_size, IrRes, DrRes);
+    CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
+	      bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
+	      data_addr, data_size, cacheRes(DrRes));
 
     if (CLG_(current_state).collect) {
 	ULong *cost_Ir, *cost_Dr;
@@ -1092,8 +1168,11 @@
 	    cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
 	}
 	else {
-	    cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
-	    cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
+	    // event set must be UIrDr or extension
+	    CLG_ASSERT((ii->eventset == CLG_(sets).UIrDr) ||
+		       (ii->eventset == CLG_(sets).UIrDrDw));
+	    cost_Ir = cost_base + ii->cost_offset + off_UIrDr_Ir;
+	    cost_Dr = cost_base + ii->cost_offset + off_UIrDr_Dr;
 	}
        
 	inc_costs(IrRes, cost_Ir, 
@@ -1104,16 +1183,16 @@
 }
 
 
-VG_REGPARM(2)
-static void log_0I1Dr(InstrInfo* ii, Addr data)
+VG_REGPARM(3)
+static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
 {
     CacheModelResult DrRes;
 
     current_ii = ii;
-    DrRes = (*simulator.D1_Read)(data, ii->data_size);
+    DrRes = (*simulator.D1_Read)(data_addr, data_size);
 
-    CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
-	      data, ii->data_size, DrRes);
+    CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
+	      data_addr, data_size, cacheRes(DrRes));
 
     if (CLG_(current_state).collect) {
 	ULong *cost_Dr;
@@ -1122,9 +1201,15 @@
 	    cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
 	}
 	else {
-	    cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
+	    Int off_Dr;
+	    if      (ii->eventset == CLG_(sets).UIrDr)   off_Dr = off_UIrDr_Dr;
+	    else if (ii->eventset == CLG_(sets).UIrDrDw) off_Dr = off_UIrDrDw_Dr;
+	    else if (ii->eventset == CLG_(sets).UIrDwDr) off_Dr = off_UIrDwDr_Dr;
+	    else CLG_ASSERT(0);
+
+	    cost_Dr = cost_base + ii->cost_offset + off_Dr;
 	}
-       
+
 	inc_costs(DrRes, cost_Dr,
 		  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
     }
@@ -1133,29 +1218,33 @@
 
 /* Instruction doing a write access */
 
-VG_REGPARM(2)
-static void log_1I1Dw(InstrInfo* ii, Addr data)
+VG_REGPARM(3)
+static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
 {
     CacheModelResult IrRes, DwRes;
 
     current_ii = ii;
     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
-    DwRes = (*simulator.D1_Write)(data, ii->data_size);
+    DwRes = (*simulator.D1_Write)(data_addr, data_size);
 
-    CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
-	      bb_base + ii->instr_offset, ii->instr_size,
-	      data, ii->data_size, IrRes, DwRes);
+    CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
+	      bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
+	      data_addr, data_size, cacheRes(DwRes));
 
     if (CLG_(current_state).collect) {
 	ULong *cost_Ir, *cost_Dw;
 	
 	if (CLG_(current_state).nonskipped) {
-	    cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
-	    cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
+	    cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
+	    cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
 	}
 	else {
-	    cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
-	    cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
+	    // This helper is called when a Dr event follows Ir;
+	    // Event set must be UIrDw or extension
+	    CLG_ASSERT((ii->eventset == CLG_(sets).UIrDw) ||
+		       (ii->eventset == CLG_(sets).UIrDwDr));
+	    cost_Ir = cost_base + ii->cost_offset + off_UIrDw_Ir;
+	    cost_Dw = cost_base + ii->cost_offset + off_UIrDw_Dw;
 	}
        
 	inc_costs(IrRes, cost_Ir,
@@ -1165,16 +1254,16 @@
     }
 }
 
-VG_REGPARM(2)
-static void log_0I1Dw(InstrInfo* ii, Addr data)
+VG_REGPARM(3)
+static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
 {
     CacheModelResult DwRes;
 
     current_ii = ii;
-    DwRes = (*simulator.D1_Write)(data, ii->data_size);
+    DwRes = (*simulator.D1_Write)(data_addr, data_size);
 
-    CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
-	      data, ii->data_size, DwRes);
+    CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
+	      data_addr, data_size, cacheRes(DwRes));
 
     if (CLG_(current_state).collect) {
 	ULong *cost_Dw;
@@ -1183,7 +1272,13 @@
 	    cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
 	}
 	else {
-	    cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
+	    Int off_Dw;
+	    if      (ii->eventset == CLG_(sets).UIrDw)   off_Dw = off_UIrDw_Dw;
+	    else if (ii->eventset == CLG_(sets).UIrDwDr) off_Dw = off_UIrDwDr_Dw;
+	    else if (ii->eventset == CLG_(sets).UIrDrDw) off_Dw = off_UIrDrDw_Dw;
+	    else CLG_ASSERT(0);
+
+	    cost_Dw = cost_base + ii->cost_offset + off_Dw;
 	}
        
 	inc_costs(DwRes, cost_Dw,
@@ -1191,77 +1286,6 @@
     }
 }
 
-/* Instruction doing a read and a write access */
-
-VG_REGPARM(3)
-static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
-{
-    CacheModelResult IrRes, DrRes, DwRes;
-
-    current_ii = ii;
-    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
-    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
-    DwRes = (*simulator.D1_Write)(data2, ii->data_size);
-
-    CLG_DEBUG(6,
-	      "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
-	      bb_base + ii->instr_offset, ii->instr_size,
-	      data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);
-
-    if (CLG_(current_state).collect) {
-	ULong *cost_Ir, *cost_Dr, *cost_Dw;
-
-	if (CLG_(current_state).nonskipped) {
-	    cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
-	    cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
-	    cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
-	}
-	else {
-	    cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
-	    cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
-	    cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
-	}
-	
-	inc_costs(IrRes, cost_Ir, 
-		  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
-	inc_costs(DrRes, cost_Dr, 
-		  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
-	inc_costs(DwRes, cost_Dw, 
-		  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
-    }
-}
-
-VG_REGPARM(3)
-static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
-{
-    CacheModelResult DrRes, DwRes;
-
-    current_ii = ii;
-    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
-    DwRes = (*simulator.D1_Write)(data2, ii->data_size);
-
-    CLG_DEBUG(6,
-	      "log_0D2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
-	      data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);
-
-    if (CLG_(current_state).collect) {
-	ULong *cost_Dr, *cost_Dw;
-
-	if (CLG_(current_state).nonskipped) {
-	    cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
-	    cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
-	}
-	else {
-	    cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
-	    cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
-	}
-	
-	inc_costs(DrRes, cost_Dr, 
-		  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
-	inc_costs(DwRes, cost_Dw, 
-		  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
-    }
-}
 
 
 /*------------------------------------------------------------*/
@@ -1369,20 +1393,20 @@
   if (!CLG_(clo).simulate_cache) {
     CLG_(cachesim).log_1I0D  = 0;
     CLG_(cachesim).log_1I0D_name = "(no function)";
+    CLG_(cachesim).log_2I0D  = 0;
+    CLG_(cachesim).log_2I0D_name = "(no function)";
+    CLG_(cachesim).log_3I0D  = 0;
+    CLG_(cachesim).log_3I0D_name = "(no function)";
 
     CLG_(cachesim).log_1I1Dr = 0;
-    CLG_(cachesim).log_1I1Dw = 0;
-    CLG_(cachesim).log_1I2D  = 0;
     CLG_(cachesim).log_1I1Dr_name = "(no function)";
+    CLG_(cachesim).log_1I1Dw = 0;
     CLG_(cachesim).log_1I1Dw_name = "(no function)";
-    CLG_(cachesim).log_1I2D_name = "(no function)";
 
     CLG_(cachesim).log_0I1Dr = 0;
-    CLG_(cachesim).log_0I1Dw = 0;
-    CLG_(cachesim).log_0I2D  = 0;
     CLG_(cachesim).log_0I1Dr_name = "(no function)";
+    CLG_(cachesim).log_0I1Dw = 0;
     CLG_(cachesim).log_0I1Dw_name = "(no function)";
-    CLG_(cachesim).log_0I2D_name = "(no function)";
     return;
   }
 
@@ -1402,20 +1426,20 @@
 
   CLG_(cachesim).log_1I0D  = log_1I0D;
   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
+  CLG_(cachesim).log_2I0D  = log_2I0D;
+  CLG_(cachesim).log_2I0D_name  = "log_2I0D";
+  CLG_(cachesim).log_3I0D  = log_3I0D;
+  CLG_(cachesim).log_3I0D_name  = "log_3I0D";
 
   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
-  CLG_(cachesim).log_1I2D  = log_1I2D;
   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
-  CLG_(cachesim).log_1I2D_name  = "log_1I2D";
 
   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
-  CLG_(cachesim).log_0I2D  = log_0I2D;
   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
-  CLG_(cachesim).log_0I2D_name  = "log_0I2D";
 
   if (clo_collect_cacheuse) {
 
@@ -1763,26 +1787,29 @@
 void CLG_(init_eventsets)(Int max_user)
 {
   EventType * e1, *e2, *e3, *e4;
-  EventSet *Ir, *Dr, *Dw;
-  EventSet *D0, *D1r, *D1w, *D2;
-  EventSet *sim, *full;
-  EventSet *use;
+  // Basic event sets from which others are composed
+  EventSet *Use, *Ir, *Dr, *Dw;
+  // Compositions of basic sets used for per-instruction counters
+  EventSet *UIr, *UIrDr, *UIrDrDw, *UIrDw, *UIrDwDr;
+  // Composition used for global counters and aggregation
+  EventSet *full;
   int sizeOfUseIr;
 
-  use = CLG_(get_eventset)("Use", 4);
+  // the "Use" events types only are used with "cacheuse" simulation
+  Use = CLG_(get_eventset)("Use", 4);
   if (clo_collect_cacheuse) {
     /* if TUse is 0, there was never a load, and no loss, too */
     e1 = CLG_(register_eventtype)("AcCost1");
-    CLG_(add_eventtype)(use, e1);
+    CLG_(add_eventtype)(Use, e1);
     e1 = CLG_(register_eventtype)("SpLoss1");
-    CLG_(add_eventtype)(use, e1);
+    CLG_(add_eventtype)(Use, e1);
     e1 = CLG_(register_eventtype)("AcCost2");
-    CLG_(add_eventtype)(use, e1);
+    CLG_(add_eventtype)(Use, e1);
     e1 = CLG_(register_eventtype)("SpLoss2");
-    CLG_(add_eventtype)(use, e1);
+    CLG_(add_eventtype)(Use, e1);
   }
 
-  Ir = CLG_(get_eventset)("Ir", 4);    
+  Ir = CLG_(get_eventset)("Ir", 4);
   Dr = CLG_(get_eventset)("Dr", 4);
   Dw = CLG_(get_eventset)("Dw", 4);
   if (CLG_(clo).simulate_cache) {
@@ -1822,74 +1849,76 @@
     CLG_(add_eventtype)(Ir, e1);
   }
 
-  sizeOfUseIr =  use->size + Ir->size;
-  D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
-  CLG_(add_eventset)(D0, use);
-  off_D0_Ir  = CLG_(add_eventset)(D0, Ir);
+  // Self cost event sets per guest instruction (U used only for cacheUse).
+  // Each basic event set only appears once, as eg. multiple different Dr's
+  // in one guest instruction are counted in the same counter.
 
-  D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
-  CLG_(add_eventset)(D1r, use);
-  off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
-  off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
+  sizeOfUseIr =  Use->size + Ir->size;
+  UIr = CLG_(get_eventset)("UIr", sizeOfUseIr);
+  CLG_(add_eventset)(UIr, Use);
+  off_UIr_Ir  = CLG_(add_eventset)(UIr, Ir);
 
-  D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
-  CLG_(add_eventset)(D1w, use);
-  off_D1w_Ir   = CLG_(add_eventset)(D1w, Ir);
-  off_D1w_Dw   = CLG_(add_eventset)(D1w, Dw);
+  UIrDr = CLG_(get_eventset)("UIrDr", sizeOfUseIr + Dr->size);
+  CLG_(add_eventset)(UIrDr, Use);
+  off_UIrDr_Ir = CLG_(add_eventset)(UIrDr, Ir);
+  off_UIrDr_Dr = CLG_(add_eventset)(UIrDr, Dr);
 
-  D2  = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
-  CLG_(add_eventset)(D2, use);
-  off_D2_Ir    = CLG_(add_eventset)(D2, Ir);
-  off_D2_Dr    = CLG_(add_eventset)(D2, Dr);
-  off_D2_Dw    = CLG_(add_eventset)(D2, Dw);
+  UIrDrDw  = CLG_(get_eventset)("IrDrDw", sizeOfUseIr + Dr->size + Dw->size);
+  CLG_(add_eventset)(UIrDrDw, Use);
+  off_UIrDrDw_Ir    = CLG_(add_eventset)(UIrDrDw, Ir);
+  off_UIrDrDw_Dr    = CLG_(add_eventset)(UIrDrDw, Dr);
+  off_UIrDrDw_Dw    = CLG_(add_eventset)(UIrDrDw, Dw);
 
-  sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
-  CLG_(add_eventset)(sim, use);
-  CLG_(sets).off_sim_Ir   = CLG_(add_eventset)(sim, Ir);
-  CLG_(sets).off_sim_Dr   = CLG_(add_eventset)(sim, Dr);
-  CLG_(sets).off_sim_Dw   = CLG_(add_eventset)(sim, Dw);
+  UIrDw = CLG_(get_eventset)("UIrDw", sizeOfUseIr + Dw->size);
+  CLG_(add_eventset)(UIrDw, Use);
+  off_UIrDw_Ir   = CLG_(add_eventset)(UIrDw, Ir);
+  off_UIrDw_Dw   = CLG_(add_eventset)(UIrDw, Dw);
 
+  UIrDwDr  = CLG_(get_eventset)("IrDwDr", sizeOfUseIr + Dw->size + Dr->size);
+  CLG_(add_eventset)(UIrDwDr, Use);
+  off_UIrDwDr_Ir    = CLG_(add_eventset)(UIrDrDw, Ir);
+  off_UIrDwDr_Dw    = CLG_(add_eventset)(UIrDrDw, Dw);
+  off_UIrDwDr_Dr    = CLG_(add_eventset)(UIrDrDw, Dr);
+
+
+  // the "full" event set is used as global counter and for aggregation
   if (CLG_(clo).collect_alloc)   max_user += 2;
   if (CLG_(clo).collect_systime) max_user += 2;
+  full = CLG_(get_eventset)("full",
+			    sizeOfUseIr + Dr->size + Dw->size + max_user);
+  CLG_(add_eventset)(full, Use);
+  CLG_(sets).off_full_Ir   = CLG_(add_eventset)(full, Ir);
+  CLG_(sets).off_full_Dr   = CLG_(add_eventset)(full, Dr);
+  CLG_(sets).off_full_Dw   = CLG_(add_eventset)(full, Dw);
+  if (CLG_(clo).collect_alloc) {
+      e1 = CLG_(register_eventtype)("allocCount");
+      e2 = CLG_(register_eventtype)("allocSize");
+      CLG_(sets).off_full_alloc =  CLG_(add_dep_event2)(full, e1,e2);
+  }
+  if (CLG_(clo).collect_systime) {
+      e1 = CLG_(register_eventtype)("sysCount");
+      e2 = CLG_(register_eventtype)("sysTime");
+      CLG_(sets).off_full_systime =  CLG_(add_dep_event2)(full, e1,e2);
+  }
 
-  full = CLG_(get_eventset)("full", sim->size + max_user);
-  CLG_(add_eventset)(full, sim);
-  CLG_(sets).off_full_Ir   = CLG_(sets).off_sim_Ir;
-  CLG_(sets).off_full_Dr   = CLG_(sets).off_sim_Dr;
-  CLG_(sets).off_full_Dw   = CLG_(sets).off_sim_Dw;
-
-  CLG_(sets).use = use;
+  CLG_(sets).Use = Use;
   CLG_(sets).Ir  = Ir;
   CLG_(sets).Dr  = Dr;
   CLG_(sets).Dw  = Dw;
-
-  CLG_(sets).D0  = D0;
-  CLG_(sets).D1r = D1r;
-  CLG_(sets).D1w = D1w;
-  CLG_(sets).D2  = D2;
-
-  CLG_(sets).sim  = sim;
+  CLG_(sets).UIr  = UIr;
+  CLG_(sets).UIrDr = UIrDr;
+  CLG_(sets).UIrDrDw  = UIrDrDw;
+  CLG_(sets).UIrDw = UIrDw;
+  CLG_(sets).UIrDwDr  = UIrDwDr;
   CLG_(sets).full = full;
 
-  if (CLG_(clo).collect_alloc) {
-    e1 = CLG_(register_eventtype)("allocCount");
-    e2 = CLG_(register_eventtype)("allocSize");
-    CLG_(sets).off_full_user =  CLG_(add_dep_event2)(full, e1,e2);
-  }
-
-  if (CLG_(clo).collect_systime) {
-    e1 = CLG_(register_eventtype)("sysCount");
-    e2 = CLG_(register_eventtype)("sysTime");
-    CLG_(sets).off_full_systime =  CLG_(add_dep_event2)(full, e1,e2);
-  }
 
   CLG_DEBUGIF(1) {
     CLG_DEBUG(1, "EventSets:\n");
-    CLG_(print_eventset)(-2, use);
+    CLG_(print_eventset)(-2, Use);
     CLG_(print_eventset)(-2, Ir);
     CLG_(print_eventset)(-2, Dr);
     CLG_(print_eventset)(-2, Dw);
-    CLG_(print_eventset)(-2, sim);
     CLG_(print_eventset)(-2, full);
   }
 
@@ -1924,34 +1953,41 @@
 void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
 {
   /* if eventset use is defined, it is always first (hardcoded!) */
-  CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);  
+  CLG_(add_and_zero_cost)( CLG_(sets).Use, dst, cost);
 
-  /* FIXME: This is hardcoded... */
-  if (es == CLG_(sets).D0) {
-    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
-			    cost + off_D0_Ir);
+  if (es == CLG_(sets).UIr) {
+    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+			    cost + off_UIr_Ir);
   }
-  else if (es == CLG_(sets).D1r) {
-    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
-			    cost + off_D1r_Ir);
-    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
-			    cost + off_D1r_Dr);
+  else if (es == CLG_(sets).UIrDr) {
+    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+			    cost + off_UIrDr_Ir);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
+			    cost + off_UIrDr_Dr);
   }
-  else if (es == CLG_(sets).D1w) {
-    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
-			    cost + off_D1w_Ir);
-    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
-			    cost + off_D1w_Dw);
+  else if (es == CLG_(sets).UIrDrDw) {
+    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+			    cost + off_UIrDrDw_Ir);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
+			    cost + off_UIrDrDw_Dr);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
+			    cost + off_UIrDrDw_Dw);
   }
-  else {
-    CLG_ASSERT(es == CLG_(sets).D2);
-    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
-			    cost + off_D2_Ir);
-    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
-			    cost + off_D2_Dr);
-    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
-			    cost + off_D2_Dw);
+  else if (es == CLG_(sets).UIrDw) {
+      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+			       cost + off_UIrDw_Ir);
+      CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
+			       cost + off_UIrDw_Dw);
   }
+  else if (es == CLG_(sets).UIrDwDr) {
+    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+			    cost + off_UIrDwDr_Ir);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
+			    cost + off_UIrDwDr_Dw);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
+			     cost + off_UIrDwDr_Dr);
+  }
+  else CLG_ASSERT(0);
 }
 
 /* this is called at dump time for every instruction executed */
@@ -1959,7 +1995,7 @@
 			       InstrInfo* ii, ULong exe_count)
 {
   if (!CLG_(clo).simulate_cache)
-      cost[CLG_(sets).off_sim_Ir] += exe_count;
+      cost[CLG_(sets).off_full_Ir] += exe_count;
   else {
 
 #if 0
@@ -2019,24 +2055,24 @@
 
   /* these will be set by cachesim_post_clo_init */
   .log_1I0D        = 0,
+  .log_2I0D        = 0,
+  .log_3I0D        = 0,
 
   .log_1I1Dr       = 0,
   .log_1I1Dw       = 0,
-  .log_1I2D        = 0,
 
   .log_0I1Dr       = 0,
   .log_0I1Dw       = 0,
-  .log_0I2D        = 0,
 
   .log_1I0D_name = "(no function)",
+  .log_2I0D_name = "(no function)",
+  .log_3I0D_name = "(no function)",
 
   .log_1I1Dr_name = "(no function)",
   .log_1I1Dw_name = "(no function)",
-  .log_1I2D_name = "(no function)",
 
   .log_0I1Dr_name = "(no function)",
   .log_0I1Dw_name = "(no function)",
-  .log_0I2D_name = "(no function)"
 };