Callgrind: add branch prediction from Cachegrind

Callgrind now uses Cachegrind's command line options to switch
on simulation: "--branch-sim=yes/no" for branch prediction,
and "--cache-sim=yes/no" for cache simulation (for more
consistency and to avoid confusion). However, the previously
used "--simulate-cache=yes/no" option is still supported, but deprecated.

Included: corresponding documentation and tests.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11207 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/callgrind/main.c b/callgrind/main.c
index c0290b4..e36ba8a 100644
--- a/callgrind/main.c
+++ b/callgrind/main.c
@@ -37,6 +37,8 @@
 
 #include <pub_tool_threadstate.h>
 
+#include "cg_branchpred.c"
+
 /*------------------------------------------------------------*/
 /*--- Global variables                                     ---*/
 /*------------------------------------------------------------*/
@@ -103,11 +105,13 @@
 {
     ULong* cost_Bus;
 
-    CLG_DEBUG(0, "log_global_event:  Ir  %#lx/%u\n",
+    CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
               CLG_(bb_base) + ii->instr_offset, ii->instr_size);
 
     if (!CLG_(current_state).collect) return;
 
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );
+
     CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;
 
     if (CLG_(current_state).nonskipped)
@@ -118,6 +122,71 @@
 }
 
 
+/* For branches, we consult two different predictors, one which
+   predicts taken/untaken for conditional branches, and the other
+   which predicts the branch target address for indirect branches
+   (jump-to-register style ones). */
+
+static VG_REGPARM(2)
+void log_cond_branch(InstrInfo* ii, Word taken)
+{
+    Bool miss;
+    Int fullOffset_Bc;
+    ULong* cost_Bc;
+
+    CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %lu\n",
+              CLG_(bb_base) + ii->instr_offset, taken);
+
+    miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);
+
+    if (!CLG_(current_state).collect) return;
+
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );
+
+    if (CLG_(current_state).nonskipped)
+        cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
+    else
+        cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
+
+    fullOffset_Bc = fullOffset(EG_BC);
+    CLG_(current_state).cost[ fullOffset_Bc ]++;
+    cost_Bc[0]++;
+    if (miss) {
+        CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
+        cost_Bc[1]++;
+    }
+}
+
+static VG_REGPARM(2)
+void log_ind_branch(InstrInfo* ii, UWord actual_dst)
+{
+    Bool miss;
+    Int fullOffset_Bi;
+    ULong* cost_Bi;
+
+    CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
+              CLG_(bb_base) + ii->instr_offset, actual_dst);
+
+    miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);
+
+    if (!CLG_(current_state).collect) return;
+
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );
+
+    if (CLG_(current_state).nonskipped)
+        cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
+    else
+        cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];
+
+    fullOffset_Bi = fullOffset(EG_BI);
+    CLG_(current_state).cost[ fullOffset_Bi ]++;
+    cost_Bi[0]++;
+    if (miss) {
+        CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
+        cost_Bi[1]++;
+    }
+}
+
 /*------------------------------------------------------------*/
 /*--- Instrumentation structures and event queue handling  ---*/
 /*------------------------------------------------------------*/
@@ -161,6 +230,8 @@
       Ev_Dr,  // Data read
       Ev_Dw,  // Data write
       Ev_Dm,  // Data modify (read then write)
+      Ev_Bc,  // branch conditional
+      Ev_Bi,  // branch indirect (to unknown destination)
       Ev_G    // Global bus event
    }
    EventTag;
@@ -184,6 +255,12 @@
 	    IRAtom* ea;
 	    Int     szB;
 	 } Dm;
+         struct {
+            IRAtom* taken; /* :: Ity_I1 */
+         } Bc;
+         struct {
+            IRAtom* dst;
+         } Bi;
 	 struct {
 	 } G;
       } Ev;
@@ -269,6 +346,16 @@
 	 ppIRExpr(ev->Ev.Dm.ea);
 	 VG_(printf)("\n");
 	 break;
+      case Ev_Bc:
+         VG_(printf)("Bc %p   GA=", ev->inode);
+         ppIRExpr(ev->Ev.Bc.taken);
+         VG_(printf)("\n");
+         break;
+      case Ev_Bi:
+         VG_(printf)("Bi %p  DST=", ev->inode);
+         ppIRExpr(ev->Ev.Bi.dst);
+         VG_(printf)("\n");
+         break;
       case Ev_G:
          VG_(printf)("G  %p\n", ev->inode);
          break;
@@ -306,18 +393,28 @@
 	       ev->inode->eventset = CLG_(sets).base;
 	       break;
 	   case Ev_Dr:
-	       // extend event set by Dr counter
+               // extend event set by Dr counters
 	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
 							   EG_DR);
 	       break;
 	   case Ev_Dw:
 	   case Ev_Dm:
-	       // extend event set by Dw counter
+               // extend event set by Dw counters
 	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
 							   EG_DW);
 	       break;
+           case Ev_Bc:
+               // extend event set by Bc counters
+               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+                                                           EG_BC);
+               break;
+           case Ev_Bi:
+               // extend event set by Bi counters
+               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+                                                           EG_BI);
+               break;
 	   case Ev_G:
-	       // extend event set by Bus counter
+               // extend event set by Bus counter
 	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
 							   EG_BUS);
 	       break;
@@ -436,6 +533,22 @@
 	    regparms = 3;
 	    inew = i+1;
 	    break;
+         case Ev_Bc:
+            /* Conditional branch */
+            helperName = "log_cond_branch";
+            helperAddr = &log_cond_branch;
+            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
+            regparms = 2;
+            inew = i+1;
+            break;
+         case Ev_Bi:
+            /* Branch to an unknown destination */
+            helperName = "log_ind_branch";
+            helperAddr = &log_ind_branch;
+            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
+            regparms = 2;
+            inew = i+1;
+            break;
          case Ev_G:
             /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
             helperName = "log_global_event";
@@ -549,10 +662,51 @@
 }
 
 static
+void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
+{
+   Event* evt;
+   tl_assert(isIRAtom(guard));
+   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
+             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+   if (!CLG_(clo).simulate_branch) return;
+
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag         = Ev_Bc;
+   evt->inode       = inode;
+   evt->Ev.Bc.taken = guard;
+   clgs->events_used++;
+}
+
+static
+void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
+{
+   Event* evt;
+   tl_assert(isIRAtom(whereTo));
+   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
+             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+   if (!CLG_(clo).simulate_branch) return;
+
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag       = Ev_Bi;
+   evt->inode     = inode;
+   evt->Ev.Bi.dst = whereTo;
+   clgs->events_used++;
+}
+
+static
 void addEvent_G ( ClgState* clgs, InstrInfo* inode )
 {
    Event* evt;
    if (!CLG_(clo).collect_bus) return;
+
    if (clgs->events_used == N_EVENTS)
       flushEvents(clgs);
    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
@@ -753,6 +907,7 @@
    Int      i, isize;
    IRStmt*  st;
    Addr     origAddr;
+   Addr64   cia; /* address of current insn */
    InstrInfo* curr_inode = NULL;
    ClgState clgs;
    UInt     cJumps = 0;
@@ -789,6 +944,8 @@
    CLG_ASSERT(Ist_IMark == st->tag);
 
    origAddr = (Addr)st->Ist.IMark.addr;
+   cia   = st->Ist.IMark.addr;
+   isize = st->Ist.IMark.len;
    CLG_ASSERT(origAddr == st->Ist.IMark.addr);  // XXX: check no overflow
 
    /* Get BB struct (creating if necessary).
@@ -819,8 +976,9 @@
 	    break;
 
 	 case Ist_IMark: {
-	    CLG_ASSERT(clgs.instr_offset == (Addr)st->Ist.IMark.addr - origAddr);
-	    isize = st->Ist.IMark.len;
+            cia   = st->Ist.IMark.addr;
+            isize = st->Ist.IMark.len;
+            CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
 	    // If Vex fails to decode an instruction, the size will be zero.
 	    // Pretend otherwise.
 	    if (isize == 0) isize = VG_MIN_INSTR_SZB;
@@ -925,7 +1083,63 @@
          }
 
  	 case Ist_Exit: {
-	    UInt jmps_passed;
+            Bool guest_exit, inverted;
+
+            /* VEX code generation sometimes inverts conditional branches.
+             * As Callgrind counts (conditional) jumps, it has to correct
+             * inversions. The heuristic is the following:
+             * (1) Callgrind switches off SB chasing and unrolling, and
+             *     therefore it assumes that a candidate for inversion only is
+             *     the last conditional branch in an SB.
+             * (2) inversion is assumed if the branch jumps to the address of
+             *     the next guest instruction in memory.
+             * This heuristic is precalculated in CLG_(collectBlockInfo)().
+             *
+             * Branching behavior is also used for branch prediction. Note that
+             * above heuristic is different from what Cachegrind does.
+             * Cachegrind uses (2) for all branches.
+             */
+            if (cJumps+1 == clgs.bb->cjmp_count)
+                inverted = clgs.bb->cjmp_inverted;
+            else
+                inverted = False;
+
+            // call branch predictor only if this is a branch in guest code
+            guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
+                         (st->Ist.Exit.jk == Ijk_Call) ||
+                         (st->Ist.Exit.jk == Ijk_Ret);
+
+            if (guest_exit) {
+                /* Stuff to widen the guard expression to a host word, so
+                   we can pass it to the branch predictor simulation
+                   functions easily. */
+                IRType   tyW    = hWordTy;
+                IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
+                IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
+                IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
+                IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
+                IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
+                IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
+                                               : IRExpr_Const(IRConst_U64(1));
+
+                /* Widen the guard expression. */
+                addStmtToIRSB( clgs.sbOut,
+                               IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
+                addStmtToIRSB( clgs.sbOut,
+                               IRStmt_WrTmp( guardW,
+                                             IRExpr_Unop(widen,
+                                                         IRExpr_RdTmp(guard1))) );
+                /* If the exit is inverted, invert the sense of the guard. */
+                addStmtToIRSB(
+                        clgs.sbOut,
+                        IRStmt_WrTmp(
+                                guard,
+                                inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
+                                    : IRExpr_RdTmp(guardW)
+                                    ));
+                /* And post the event. */
+                addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
+            }
 
 	    /* We may never reach the next statement, so need to flush
 	       all outstanding transactions now. */
@@ -940,12 +1154,9 @@
 	    /* Update global variable jmps_passed before the jump
 	     * A correction is needed if VEX inverted the last jump condition
 	    */
-	    jmps_passed = cJumps;
-	    if ((cJumps+1 == clgs.bb->cjmp_count) && clgs.bb->cjmp_inverted)
-		jmps_passed++;
 	    addConstMemStoreStmt( clgs.sbOut,
 				  (UWord) &CLG_(current_state).jmps_passed,
-				  jmps_passed, hWordTy);
+                                  inverted ? cJumps+1 : cJumps, hWordTy);
 	    cJumps++;
 
 	    break;
@@ -966,6 +1177,26 @@
       }
    }
 
+   /* Deal with branches to unknown destinations.  Except ignore ones
+      which are function returns as we assume the return stack
+      predictor never mispredicts. */
+   if (sbIn->jumpkind == Ijk_Boring) {
+      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
+      switch (sbIn->next->tag) {
+         case Iex_Const:
+            break; /* boring - branch to known address */
+         case Iex_RdTmp:
+            /* looks like an indirect branch (branch to unknown) */
+            addEvent_Bi( &clgs, curr_inode, sbIn->next );
+            break;
+         default:
+            /* shouldn't happen - if the incoming IR is properly
+               flattened, should only have tmp and const cases to
+               consider. */
+            tl_assert(0);
+      }
+   }
+
    /* At the end of the bb.  Flush outstandings. */
    flushEvents( &clgs );
 
@@ -1236,10 +1467,61 @@
   }
 }
 
+static UInt ULong_width(ULong n)
+{
+   UInt w = 0;
+   while (n > 0) {
+      n = n / 10;
+      w++;
+   }
+   if (w == 0) w = 1;
+   return w + (w-1)/3;   // add space for commas
+}
+
+static
+void branchsim_printstat(int l1, int l2, int l3)
+{
+    static Char buf1[128], buf2[128], buf3[128], fmt[128];
+    FullCost total;
+    ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
+    ULong B_total_b, B_total_mp;
+
+    total = CLG_(total_cost);
+    Bc_total_b  = total[ fullOffset(EG_BC)   ];
+    Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
+    Bi_total_b  = total[ fullOffset(EG_BI)   ];
+    Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
+
+    /* Make format string, getting width right for numbers */
+    VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
+                 l1, l2, l3);
+
+    if (0 == Bc_total_b)  Bc_total_b = 1;
+    if (0 == Bi_total_b)  Bi_total_b = 1;
+    B_total_b  = Bc_total_b  + Bi_total_b;
+    B_total_mp = Bc_total_mp + Bi_total_mp;
+
+    VG_(umsg)("\n");
+    VG_(umsg)(fmt, "Branches:     ",
+              B_total_b, Bc_total_b, Bi_total_b);
+
+    VG_(umsg)(fmt, "Mispredicts:  ",
+              B_total_mp, Bc_total_mp, Bi_total_mp);
+
+    VG_(percentify)(B_total_mp,  B_total_b,  1, l1+1, buf1);
+    VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
+    VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);
+
+    VG_(umsg)("Mispred rate:  %s (%s     + %s   )\n", buf1, buf2,buf3);
+}
+
+
 static
 void finish(void)
 {
-  char buf[RESULTS_BUF_LEN];
+  Char buf[RESULTS_BUF_LEN], fmt[128];
+  Int l1, l2, l3;
+  FullCost total;
 
   CLG_DEBUG(0, "finish()\n");
 
@@ -1334,8 +1616,33 @@
   VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
   VG_(message)(Vg_UserMsg, "\n");
 
-  //  if (CLG_(clo).simulate_cache)
-  (*CLG_(cachesim).printstat)();
+  /* determine value widths for statistics */
+  total = CLG_(total_cost);
+  l1 = ULong_width( total[fullOffset(EG_IR)] );
+  l2 = l3 = 0;
+  if (CLG_(clo).simulate_cache) {
+      l2 = ULong_width( total[fullOffset(EG_DR)] );
+      l3 = ULong_width( total[fullOffset(EG_DW)] );
+  }
+  if (CLG_(clo).simulate_branch) {
+      int l2b = ULong_width( total[fullOffset(EG_BC)] );
+      int l3b = ULong_width( total[fullOffset(EG_BI)] );
+      if (l2b > l2) l2 = l2b;
+      if (l3b > l3) l3 = l3b;
+  }
+
+  /* Make format string, getting width right for numbers */
+  VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
+
+  /* Always print this */
+  VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );
+
+  if (CLG_(clo).simulate_cache)
+      (*CLG_(cachesim).printstat)(l1, l2, l3);
+
+  if (CLG_(clo).simulate_branch)
+      branchsim_printstat(l1, l2, l3);
+
 }