Callgrind: add branch prediction from Cachegrind
Callgrind now uses Cachegrind's command line option to switch
on simulation: "--branch-sim=yes/no" for branch prediction,
and "--cache-sim=yes/no" for cache simulation (for more
consistency and to avoid confusion). However, the previously
used "--simulate-cache=yes/no" is still supported but deprecated.
Includes corresponding documentation and tests.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11207 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/callgrind/main.c b/callgrind/main.c
index c0290b4..e36ba8a 100644
--- a/callgrind/main.c
+++ b/callgrind/main.c
@@ -37,6 +37,8 @@
#include <pub_tool_threadstate.h>
+#include "cg_branchpred.c"
+
/*------------------------------------------------------------*/
/*--- Global variables ---*/
/*------------------------------------------------------------*/
@@ -103,11 +105,13 @@
{
ULong* cost_Bus;
- CLG_DEBUG(0, "log_global_event: Ir %#lx/%u\n",
+ CLG_DEBUG(6, "log_global_event: Ir %#lx/%u\n",
CLG_(bb_base) + ii->instr_offset, ii->instr_size);
if (!CLG_(current_state).collect) return;
+ CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );
+
CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;
if (CLG_(current_state).nonskipped)
@@ -118,6 +122,71 @@
}
+/* For branches, we consult two different predictors, one which
+ predicts taken/untaken for conditional branches, and the other
+ which predicts the branch target address for indirect branches
+ (jump-to-register style ones). */
+
+static VG_REGPARM(2)
+void log_cond_branch(InstrInfo* ii, Word taken)
+{
+ Bool miss;
+ Int fullOffset_Bc;
+ ULong* cost_Bc;
+
+ CLG_DEBUG(6, "log_cond_branch: Ir %#lx, taken %lu\n",
+ CLG_(bb_base) + ii->instr_offset, taken);
+
+ miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);
+
+ if (!CLG_(current_state).collect) return;
+
+ CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );
+
+ if (CLG_(current_state).nonskipped)
+ cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
+ else
+ cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
+
+ fullOffset_Bc = fullOffset(EG_BC);
+ CLG_(current_state).cost[ fullOffset_Bc ]++;
+ cost_Bc[0]++;
+ if (miss) {
+ CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
+ cost_Bc[1]++;
+ }
+}
+
+static VG_REGPARM(2)
+void log_ind_branch(InstrInfo* ii, UWord actual_dst)
+{
+ Bool miss;
+ Int fullOffset_Bi;
+ ULong* cost_Bi;
+
+ CLG_DEBUG(6, "log_ind_branch: Ir %#lx, dst %#lx\n",
+ CLG_(bb_base) + ii->instr_offset, actual_dst);
+
+ miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);
+
+ if (!CLG_(current_state).collect) return;
+
+ CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );
+
+ if (CLG_(current_state).nonskipped)
+ cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
+ else
+ cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];
+
+ fullOffset_Bi = fullOffset(EG_BI);
+ CLG_(current_state).cost[ fullOffset_Bi ]++;
+ cost_Bi[0]++;
+ if (miss) {
+ CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
+ cost_Bi[1]++;
+ }
+}
+
/*------------------------------------------------------------*/
/*--- Instrumentation structures and event queue handling ---*/
/*------------------------------------------------------------*/
@@ -161,6 +230,8 @@
Ev_Dr, // Data read
Ev_Dw, // Data write
Ev_Dm, // Data modify (read then write)
+ Ev_Bc, // branch conditional
+ Ev_Bi, // branch indirect (to unknown destination)
Ev_G // Global bus event
}
EventTag;
@@ -184,6 +255,12 @@
IRAtom* ea;
Int szB;
} Dm;
+ struct {
+ IRAtom* taken; /* :: Ity_I1 */
+ } Bc;
+ struct {
+ IRAtom* dst;
+ } Bi;
struct {
} G;
} Ev;
@@ -269,6 +346,16 @@
ppIRExpr(ev->Ev.Dm.ea);
VG_(printf)("\n");
break;
+ case Ev_Bc:
+ VG_(printf)("Bc %p GA=", ev->inode);
+ ppIRExpr(ev->Ev.Bc.taken);
+ VG_(printf)("\n");
+ break;
+ case Ev_Bi:
+ VG_(printf)("Bi %p DST=", ev->inode);
+ ppIRExpr(ev->Ev.Bi.dst);
+ VG_(printf)("\n");
+ break;
case Ev_G:
VG_(printf)("G %p\n", ev->inode);
break;
@@ -306,18 +393,28 @@
ev->inode->eventset = CLG_(sets).base;
break;
case Ev_Dr:
- // extend event set by Dr counter
+ // extend event set by Dr counters
ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
EG_DR);
break;
case Ev_Dw:
case Ev_Dm:
- // extend event set by Dw counter
+ // extend event set by Dw counters
ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
EG_DW);
break;
+ case Ev_Bc:
+ // extend event set by Bc counters
+ ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+ EG_BC);
+ break;
+ case Ev_Bi:
+ // extend event set by Bi counters
+ ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+ EG_BI);
+ break;
case Ev_G:
- // extend event set by Bus counter
+ // extend event set by Bus counter
ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
EG_BUS);
break;
@@ -436,6 +533,22 @@
regparms = 3;
inew = i+1;
break;
+ case Ev_Bc:
+ /* Conditional branch */
+ helperName = "log_cond_branch";
+ helperAddr = &log_cond_branch;
+ argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
+ regparms = 2;
+ inew = i+1;
+ break;
+ case Ev_Bi:
+ /* Branch to an unknown destination */
+ helperName = "log_ind_branch";
+ helperAddr = &log_ind_branch;
+ argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
+ regparms = 2;
+ inew = i+1;
+ break;
case Ev_G:
/* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
helperName = "log_global_event";
@@ -549,10 +662,51 @@
}
static
+void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
+{
+ Event* evt;
+ tl_assert(isIRAtom(guard));
+ tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
+ == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+ if (!CLG_(clo).simulate_branch) return;
+
+ if (clgs->events_used == N_EVENTS)
+ flushEvents(clgs);
+ tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+ evt = &clgs->events[clgs->events_used];
+ init_Event(evt);
+ evt->tag = Ev_Bc;
+ evt->inode = inode;
+ evt->Ev.Bc.taken = guard;
+ clgs->events_used++;
+}
+
+static
+void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
+{
+ Event* evt;
+ tl_assert(isIRAtom(whereTo));
+ tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
+ == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+ if (!CLG_(clo).simulate_branch) return;
+
+ if (clgs->events_used == N_EVENTS)
+ flushEvents(clgs);
+ tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+ evt = &clgs->events[clgs->events_used];
+ init_Event(evt);
+ evt->tag = Ev_Bi;
+ evt->inode = inode;
+ evt->Ev.Bi.dst = whereTo;
+ clgs->events_used++;
+}
+
+static
void addEvent_G ( ClgState* clgs, InstrInfo* inode )
{
Event* evt;
if (!CLG_(clo).collect_bus) return;
+
if (clgs->events_used == N_EVENTS)
flushEvents(clgs);
tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
@@ -753,6 +907,7 @@
Int i, isize;
IRStmt* st;
Addr origAddr;
+ Addr64 cia; /* address of current insn */
InstrInfo* curr_inode = NULL;
ClgState clgs;
UInt cJumps = 0;
@@ -789,6 +944,8 @@
CLG_ASSERT(Ist_IMark == st->tag);
origAddr = (Addr)st->Ist.IMark.addr;
+ cia = st->Ist.IMark.addr;
+ isize = st->Ist.IMark.len;
CLG_ASSERT(origAddr == st->Ist.IMark.addr); // XXX: check no overflow
/* Get BB struct (creating if necessary).
@@ -819,8 +976,9 @@
break;
case Ist_IMark: {
- CLG_ASSERT(clgs.instr_offset == (Addr)st->Ist.IMark.addr - origAddr);
- isize = st->Ist.IMark.len;
+ cia = st->Ist.IMark.addr;
+ isize = st->Ist.IMark.len;
+ CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
// If Vex fails to decode an instruction, the size will be zero.
// Pretend otherwise.
if (isize == 0) isize = VG_MIN_INSTR_SZB;
@@ -925,7 +1083,63 @@
}
case Ist_Exit: {
- UInt jmps_passed;
+ Bool guest_exit, inverted;
+
+ /* VEX code generation sometimes inverts conditional branches.
+ * As Callgrind counts (conditional) jumps, it has to correct
+ * inversions. The heuristic is the following:
+ * (1) Callgrind switches off SB chasing and unrolling, and
+ * therefore it assumes that a candidate for inversion only is
+ * the last conditional branch in an SB.
+ * (2) inversion is assumed if the branch jumps to the address of
+ * the next guest instruction in memory.
+ * This heuristic is precalculated in CLG_(collectBlockInfo)().
+ *
+ * Branching behavior is also used for branch prediction. Note that
+ * above heuristic is different from what Cachegrind does.
+ * Cachegrind uses (2) for all branches.
+ */
+ if (cJumps+1 == clgs.bb->cjmp_count)
+ inverted = clgs.bb->cjmp_inverted;
+ else
+ inverted = False;
+
+ // call branch predictor only if this is a branch in guest code
+ guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
+ (st->Ist.Exit.jk == Ijk_Call) ||
+ (st->Ist.Exit.jk == Ijk_Ret);
+
+ if (guest_exit) {
+ /* Stuff to widen the guard expression to a host word, so
+ we can pass it to the branch predictor simulation
+ functions easily. */
+ IRType tyW = hWordTy;
+ IROp widen = tyW==Ity_I32 ? Iop_1Uto32 : Iop_1Uto64;
+ IROp opXOR = tyW==Ity_I32 ? Iop_Xor32 : Iop_Xor64;
+ IRTemp guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
+ IRTemp guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
+ IRTemp guard = newIRTemp(clgs.sbOut->tyenv, tyW);
+ IRExpr* one = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
+ : IRExpr_Const(IRConst_U64(1));
+
+ /* Widen the guard expression. */
+ addStmtToIRSB( clgs.sbOut,
+ IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
+ addStmtToIRSB( clgs.sbOut,
+ IRStmt_WrTmp( guardW,
+ IRExpr_Unop(widen,
+ IRExpr_RdTmp(guard1))) );
+ /* If the exit is inverted, invert the sense of the guard. */
+ addStmtToIRSB(
+ clgs.sbOut,
+ IRStmt_WrTmp(
+ guard,
+ inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
+ : IRExpr_RdTmp(guardW)
+ ));
+ /* And post the event. */
+ addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
+ }
/* We may never reach the next statement, so need to flush
all outstanding transactions now. */
@@ -940,12 +1154,9 @@
/* Update global variable jmps_passed before the jump
* A correction is needed if VEX inverted the last jump condition
*/
- jmps_passed = cJumps;
- if ((cJumps+1 == clgs.bb->cjmp_count) && clgs.bb->cjmp_inverted)
- jmps_passed++;
addConstMemStoreStmt( clgs.sbOut,
(UWord) &CLG_(current_state).jmps_passed,
- jmps_passed, hWordTy);
+ inverted ? cJumps+1 : cJumps, hWordTy);
cJumps++;
break;
@@ -966,6 +1177,26 @@
}
}
+ /* Deal with branches to unknown destinations. Except ignore ones
+ which are function returns as we assume the return stack
+ predictor never mispredicts. */
+ if (sbIn->jumpkind == Ijk_Boring) {
+ if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
+ switch (sbIn->next->tag) {
+ case Iex_Const:
+ break; /* boring - branch to known address */
+ case Iex_RdTmp:
+ /* looks like an indirect branch (branch to unknown) */
+ addEvent_Bi( &clgs, curr_inode, sbIn->next );
+ break;
+ default:
+ /* shouldn't happen - if the incoming IR is properly
+ flattened, should only have tmp and const cases to
+ consider. */
+ tl_assert(0);
+ }
+ }
+
/* At the end of the bb. Flush outstandings. */
flushEvents( &clgs );
@@ -1236,10 +1467,61 @@
}
}
+static UInt ULong_width(ULong n)
+{
+ UInt w = 0;
+ while (n > 0) {
+ n = n / 10;
+ w++;
+ }
+ if (w == 0) w = 1;
+ return w + (w-1)/3; // add space for commas
+}
+
+static
+void branchsim_printstat(int l1, int l2, int l3)
+{
+ static Char buf1[128], buf2[128], buf3[128], fmt[128];
+ FullCost total;
+ ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
+ ULong B_total_b, B_total_mp;
+
+ total = CLG_(total_cost);
+ Bc_total_b = total[ fullOffset(EG_BC) ];
+ Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
+ Bi_total_b = total[ fullOffset(EG_BI) ];
+ Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
+
+ /* Make format string, getting width right for numbers */
+ VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)\n",
+ l1, l2, l3);
+
+ if (0 == Bc_total_b) Bc_total_b = 1;
+ if (0 == Bi_total_b) Bi_total_b = 1;
+ B_total_b = Bc_total_b + Bi_total_b;
+ B_total_mp = Bc_total_mp + Bi_total_mp;
+
+ VG_(umsg)("\n");
+ VG_(umsg)(fmt, "Branches: ",
+ B_total_b, Bc_total_b, Bi_total_b);
+
+ VG_(umsg)(fmt, "Mispredicts: ",
+ B_total_mp, Bc_total_mp, Bi_total_mp);
+
+ VG_(percentify)(B_total_mp, B_total_b, 1, l1+1, buf1);
+ VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
+ VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);
+
+ VG_(umsg)("Mispred rate: %s (%s + %s )\n", buf1, buf2,buf3);
+}
+
+
static
void finish(void)
{
- char buf[RESULTS_BUF_LEN];
+ Char buf[RESULTS_BUF_LEN], fmt[128];
+ Int l1, l2, l3;
+ FullCost total;
CLG_DEBUG(0, "finish()\n");
@@ -1334,8 +1616,33 @@
VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
VG_(message)(Vg_UserMsg, "\n");
- // if (CLG_(clo).simulate_cache)
- (*CLG_(cachesim).printstat)();
+ /* determine value widths for statistics */
+ total = CLG_(total_cost);
+ l1 = ULong_width( total[fullOffset(EG_IR)] );
+ l2 = l3 = 0;
+ if (CLG_(clo).simulate_cache) {
+ l2 = ULong_width( total[fullOffset(EG_DR)] );
+ l3 = ULong_width( total[fullOffset(EG_DW)] );
+ }
+ if (CLG_(clo).simulate_branch) {
+ int l2b = ULong_width( total[fullOffset(EG_BC)] );
+ int l3b = ULong_width( total[fullOffset(EG_BI)] );
+ if (l2b > l2) l2 = l2b;
+ if (l3b > l3) l3 = l3b;
+ }
+
+ /* Make format string, getting width right for numbers */
+ VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
+
+ /* Always print this */
+ VG_(umsg)(fmt, "I refs: ", total[fullOffset(EG_IR)] );
+
+ if (CLG_(clo).simulate_cache)
+ (*CLG_(cachesim).printstat)(l1, l2, l3);
+
+ if (CLG_(clo).simulate_branch)
+ branchsim_printstat(l1, l2, l3);
+
}