Callgrind: add branch prediction from Cachegrind

Callgrind now uses Cachegrind's command line options to switch
on simulation: "--branch-sim=yes/no" for branch prediction and
"--cache-sim=yes/no" for cache simulation (for consistency and to
avoid confusion). The previously used "--simulate-cache=yes/no" is
still supported but deprecated.

Included: corresponding documentation and tests.
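
For example, both simulations can be enabled in one run with

  valgrind --tool=callgrind --cache-sim=yes --branch-sim=yes ./myprog

where "./myprog" stands for the program under test.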

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11207 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/NEWS b/NEWS
index ff3e5df..25cceb3 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,10 @@
   harder than the heap-level output, but this option is useful if you want
   to account for every byte of memory used by a program.
 
+- Callgrind can now do branch prediction simulation, similar to Cachegrind.
+  In addition, it can optionally count the number of executed global bus
+  events. Both can be used for a better approximation of a "Cycle Estimation"
+  as a derived event (you need to update the event formula in KCachegrind yourself).
 
 
 Release 3.5.0 (19 August 2009)
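
As a sketch only (the weights are illustrative, not mandated by this
change), a cycle estimation formula in KCachegrind could look like

  CEst = Ir + 10 Bm + 10 L1m + 100 L2m

with derived events Bm = Bcm + Bim, L1m = I1mr + D1mr + D1mw and
L2m = I2mr + D2mr + D2mw.
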
diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c
index 8620c6f..24862a8 100644
--- a/callgrind/bbcc.c
+++ b/callgrind/bbcc.c
@@ -580,6 +580,7 @@
 
   if (last_bb) {
       passed = CLG_(current_state).jmps_passed;
+      CLG_ASSERT(passed <= last_bb->cjmp_count);
       if (passed == last_bb->cjmp_count) {
 	  jmpkind = last_bb->jmpkind;
 
@@ -599,9 +600,9 @@
 	  last_bbcc->ecounter_sum++;
 	  last_bbcc->jmp[passed].ecounter++;
 	  if (!CLG_(clo).simulate_cache) {
-	      /* update Ir cost */
-	      int instr_count = last_bb->jmp[passed].instr+1;
-	      CLG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count;
+              /* update Ir cost */
+              UInt instr_count = last_bb->jmp[passed].instr+1;
+              CLG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count;
 	  }
       }
 
diff --git a/callgrind/clo.c b/callgrind/clo.c
index e4da421..fc99b0d 100644
--- a/callgrind/clo.c
+++ b/callgrind/clo.c
@@ -415,8 +415,6 @@
    /* compatibility alias, deprecated option */
    else if VG_BOOL_CLO(arg, "--trace-jump",    CLG_(clo).collect_jumps) {}
 
-   else if VG_BOOL_CLO(arg, "--collect-bus", CLG_(clo).collect_bus) {}
-
    else if VG_BOOL_CLO(arg, "--combine-dumps", CLG_(clo).combine_dumps) {}
 
    else if VG_BOOL_CLO(arg, "--collect-atstart", CLG_(clo).collect_atstart) {}
@@ -527,8 +525,13 @@
 
    else if VG_BOOL_CLO(arg, "--collect-alloc",   CLG_(clo).collect_alloc) {}
    else if VG_BOOL_CLO(arg, "--collect-systime", CLG_(clo).collect_systime) {}
+   else if VG_BOOL_CLO(arg, "--collect-bus",     CLG_(clo).collect_bus) {}
+   /* for option compatibility with cachegrind */
+   else if VG_BOOL_CLO(arg, "--cache-sim",       CLG_(clo).simulate_cache) {}
+   /* compatibility alias, deprecated option */
    else if VG_BOOL_CLO(arg, "--simulate-cache",  CLG_(clo).simulate_cache) {}
-
+   /* for option compatibility with cachegrind */
+   else if VG_BOOL_CLO(arg, "--branch-sim",      CLG_(clo).simulate_branch) {}
    else {
        Bool isCachesimOption = (*CLG_(cachesim).parse_opt)(arg);
 
@@ -592,6 +595,9 @@
 #if CLG_EXPERIMENTAL
 "    --fn-group<no>=<func>     Put function into separation group <no>\n"
 #endif
+"\n   simulation options:\n"
+"    --branch-sim=no|yes       Do branch prediction simulation [no]\n"
+"    --cache-sim=no|yes        Do cache simulation [no]\n"
     );
 
    (*CLG_(cachesim).print_opts)();
@@ -642,6 +648,7 @@
   CLG_(clo).collect_jumps    = False;
   CLG_(clo).collect_alloc    = False;
   CLG_(clo).collect_systime  = False;
+  CLG_(clo).collect_bus      = False;
 
   CLG_(clo).skip_plt         = True;
   CLG_(clo).separate_callers = 0;
@@ -651,6 +658,7 @@
   /* Instrumentation */
   CLG_(clo).instrument_atstart = True;
   CLG_(clo).simulate_cache = False;
+  CLG_(clo).simulate_branch = False;
 
   /* Call graph */
   CLG_(clo).pop_on_jump = False;
diff --git a/callgrind/docs/cl-manual.xml b/callgrind/docs/cl-manual.xml
index a133984..e2289ff 100644
--- a/callgrind/docs/cl-manual.xml
+++ b/callgrind/docs/cl-manual.xml
@@ -4,7 +4,7 @@
 [ <!ENTITY % vg-entities SYSTEM "../../docs/xml/vg-entities.xml"> %vg-entities; ]>
 
 <chapter id="cl-manual" xreflabel="Callgrind Manual">
-<title>Callgrind: a call-graph generating cache profiler</title>
+<title>Callgrind: a call-graph generating cache and branch prediction profiler</title>
 
 
 <para>To use this tool, you must specify
@@ -14,14 +14,14 @@
 <sect1 id="cl-manual.use" xreflabel="Overview">
 <title>Overview</title>
 
-<para>Callgrind is a profiling tool that can
-construct a call graph for a program's run.
+<para>Callgrind is a profiling tool that records the call history among
+functions in a program's run as a call-graph.
 By default, the collected data consists of
 the number of instructions executed, their relationship
 to source lines, the caller/callee relationship between functions,
 and the numbers of such calls.
-Optionally, a cache simulator (similar to Cachegrind) can produce
-further information about the memory access behavior of the application.
+Optionally, cache simulation and/or branch prediction (similar to Cachegrind)
+can produce further information about the runtime behavior of an application.
 </para>
 
 <para>The profile data is written out to a file at program
@@ -175,10 +175,10 @@
   results in this case.</para>
 
   <para>If you are additionally interested in measuring the 
-  cache behavior of your 
-  program, use Callgrind with the option
-  <option><xref linkend="opt.simulate-cache"/>=yes.</option>
-  However, expect a  further slow down approximately by a factor of 2.</para>
+  cache behavior of your program, use Callgrind with the option
+  <option><xref linkend="clopt.cache-sim"/>=yes</option>. For
+  branch prediction simulation, use <option><xref linkend="clopt.branch-sim"/>=yes</option>.
+  Expect a further slowdown by approximately a factor of 2.</para>
 
   <para>If the program section you want to profile is somewhere in the
   middle of the run, it is beneficial to 
@@ -371,7 +371,7 @@
   "global bus events" is used.</para>
 
   <para>The short name of the event type used for global bus events is "Ge".
-  To count global bus events, use <option><xref linkend="opt.collect-bus"/></option>.
+  To count global bus events, use <option><xref linkend="clopt.collect-bus"/>=yes</option>.
   </para>
   </sect2>
 
@@ -779,7 +779,7 @@
     </listitem>
   </varlistentry>
 
-  <varlistentry id="opt.collect-bus" xreflabel="--collect-bus">
+  <varlistentry id="clopt.collect-bus" xreflabel="--collect-bus">
     <term>
       <option><![CDATA[--collect-bus=<no|yes> [default: no] ]]></option>
     </term>
@@ -917,22 +917,54 @@
 <!-- end of xi:include in the manpage -->
 </sect2>
 
+
 <sect2 id="cl-manual.options.simulation"
+       xreflabel="Simulation options">
+<title>Simulation options</title>
+
+<!-- start of xi:include in the manpage -->
+<variablelist id="cl.opts.list.simulation">
+
+  <varlistentry id="clopt.cache-sim" xreflabel="--cache-sim">
+    <term>
+      <option><![CDATA[--cache-sim=<yes|no> [default: no] ]]></option>
+    </term>
+    <listitem>
+      <para>Specify if you want to do full cache simulation.  By default,
+      only instruction read accesses will be counted ("Ir").
+      With cache simulation, further event counters are enabled:
+      Cache misses on instruction reads ("I1mr"/"I2mr"),
+      data read accesses ("Dr") and related cache misses ("D1mr"/"D2mr"),
+      data write accesses ("Dw") and related cache misses ("D1mw"/"D2mw").
+      For more information, see <xref linkend="cg-manual"/>.
+      </para>
+    </listitem>
+  </varlistentry>
+
+  <varlistentry id="clopt.branch-sim" xreflabel="--branch-sim">
+    <term>
+      <option><![CDATA[--branch-sim=<yes|no> [default: no] ]]></option>
+    </term>
+    <listitem>
+      <para>Specify if you want to do branch prediction simulation.
+      Further event counters are enabled: Number of executed conditional
+      branches and related predictor misses ("Bc"/"Bcm"), executed indirect
+      jumps and related misses of the jump address predictor ("Bi"/"Bim").
+      </para>
+    </listitem>
+  </varlistentry>
+
+</variablelist>
+<!-- end of xi:include in the manpage -->
+</sect2>
+
+
+<sect2 id="cl-manual.options.cachesimulation"
        xreflabel="Cache simulation options">
 <title>Cache simulation options</title>
 
 <!-- start of xi:include in the manpage -->
-<variablelist id="cl.opts.list.simulation">
-  
-  <varlistentry id="opt.simulate-cache" xreflabel="--simulate-cache">
-    <term>
-      <option><![CDATA[--simulate-cache=<yes|no> [default: no] ]]></option>
-    </term>
-    <listitem>
-      <para>Specify if you want to do full cache simulation.  By default,
-      only instruction read accesses will be profiled.</para>
-    </listitem>
-  </varlistentry>
+<variablelist id="cl.opts.list.cachesimulation">
 
   <varlistentry id="opt.simulate-wb" xreflabel="--simulate-wb">
     <term>
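
With "--cache-sim=yes --branch-sim=yes", the summary of collected event
counters lists, for example:

  Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw Bc Bcm Bi Bim
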
diff --git a/callgrind/global.h b/callgrind/global.h
index db694a8..706edec 100644
--- a/callgrind/global.h
+++ b/callgrind/global.h
@@ -92,6 +92,7 @@
   /* Instrument options */
   Bool instrument_atstart;  /* Instrument at start? */
   Bool simulate_cache;      /* Call into cache simulator ? */
+  Bool simulate_branch;     /* Call into branch prediction simulator ? */
 
   /* Call graph generation */
   Bool pop_on_jump;       /* Handle a jump between functions as ret+call */
@@ -652,7 +653,7 @@
     void (*post_clo_init)(void);
     void (*clear)(void);
     void (*getdesc)(Char* buf);
-    void (*printstat)(void);  
+    void (*printstat)(Int,Int,Int);
     void (*add_icost)(SimCost, BBCC*, InstrInfo*, ULong);
     void (*finish)(void);
     
@@ -681,9 +682,11 @@
 #define EG_IR    1
 #define EG_DR    2
 #define EG_DW    3
-#define EG_BUS   4
-#define EG_ALLOC 5
-#define EG_SYS   6
+#define EG_BC    4
+#define EG_BI    5
+#define EG_BUS   6
+#define EG_ALLOC 7
+#define EG_SYS   8
 
 struct event_sets {
     EventSet *base, *full;
diff --git a/callgrind/main.c b/callgrind/main.c
index c0290b4..e36ba8a 100644
--- a/callgrind/main.c
+++ b/callgrind/main.c
@@ -37,6 +37,8 @@
 
 #include <pub_tool_threadstate.h>
 
+#include "cg_branchpred.c"
+
 /*------------------------------------------------------------*/
 /*--- Global variables                                     ---*/
 /*------------------------------------------------------------*/
@@ -103,11 +105,13 @@
 {
     ULong* cost_Bus;
 
-    CLG_DEBUG(0, "log_global_event:  Ir  %#lx/%u\n",
+    CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
               CLG_(bb_base) + ii->instr_offset, ii->instr_size);
 
     if (!CLG_(current_state).collect) return;
 
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );
+
     CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;
 
     if (CLG_(current_state).nonskipped)
@@ -118,6 +122,71 @@
 }
 
 
+/* For branches, we consult two different predictors, one which
+   predicts taken/untaken for conditional branches, and the other
+   which predicts the branch target address for indirect branches
+   (jump-to-register style ones). */
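+
+/* Both predictors come from cg_branchpred.c (the #include above),
+   shared with Cachegrind; judging by the "1 &" masking below, the
+   low bit of their return value flags a mispredict. */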
+
+static VG_REGPARM(2)
+void log_cond_branch(InstrInfo* ii, Word taken)
+{
+    Bool miss;
+    Int fullOffset_Bc;
+    ULong* cost_Bc;
+
+    CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %lu\n",
+              CLG_(bb_base) + ii->instr_offset, taken);
+
+    miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);
+
+    if (!CLG_(current_state).collect) return;
+
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );
+
+    if (CLG_(current_state).nonskipped)
+        cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
+    else
+        cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
+
+    fullOffset_Bc = fullOffset(EG_BC);
+    CLG_(current_state).cost[ fullOffset_Bc ]++;
+    cost_Bc[0]++;
+    if (miss) {
+        CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
+        cost_Bc[1]++;
+    }
+}
+
+static VG_REGPARM(2)
+void log_ind_branch(InstrInfo* ii, UWord actual_dst)
+{
+    Bool miss;
+    Int fullOffset_Bi;
+    ULong* cost_Bi;
+
+    CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
+              CLG_(bb_base) + ii->instr_offset, actual_dst);
+
+    miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);
+
+    if (!CLG_(current_state).collect) return;
+
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );
+
+    if (CLG_(current_state).nonskipped)
+        cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
+    else
+        cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];
+
+    fullOffset_Bi = fullOffset(EG_BI);
+    CLG_(current_state).cost[ fullOffset_Bi ]++;
+    cost_Bi[0]++;
+    if (miss) {
+        CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
+        cost_Bi[1]++;
+    }
+}
+
 /*------------------------------------------------------------*/
 /*--- Instrumentation structures and event queue handling  ---*/
 /*------------------------------------------------------------*/
@@ -161,6 +230,8 @@
       Ev_Dr,  // Data read
       Ev_Dw,  // Data write
       Ev_Dm,  // Data modify (read then write)
+      Ev_Bc,  // branch conditional
+      Ev_Bi,  // branch indirect (to unknown destination)
       Ev_G    // Global bus event
    }
    EventTag;
@@ -184,6 +255,12 @@
 	    IRAtom* ea;
 	    Int     szB;
 	 } Dm;
+         struct {
+            IRAtom* taken; /* :: Ity_I1 */
+         } Bc;
+         struct {
+            IRAtom* dst;
+         } Bi;
 	 struct {
 	 } G;
       } Ev;
@@ -269,6 +346,16 @@
 	 ppIRExpr(ev->Ev.Dm.ea);
 	 VG_(printf)("\n");
 	 break;
+      case Ev_Bc:
+         VG_(printf)("Bc %p   GA=", ev->inode);
+         ppIRExpr(ev->Ev.Bc.taken);
+         VG_(printf)("\n");
+         break;
+      case Ev_Bi:
+         VG_(printf)("Bi %p  DST=", ev->inode);
+         ppIRExpr(ev->Ev.Bi.dst);
+         VG_(printf)("\n");
+         break;
       case Ev_G:
          VG_(printf)("G  %p\n", ev->inode);
          break;
@@ -306,18 +393,28 @@
 	       ev->inode->eventset = CLG_(sets).base;
 	       break;
 	   case Ev_Dr:
-	       // extend event set by Dr counter
+               // extend event set by Dr counters
 	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
 							   EG_DR);
 	       break;
 	   case Ev_Dw:
 	   case Ev_Dm:
-	       // extend event set by Dw counter
+               // extend event set by Dw counters
 	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
 							   EG_DW);
 	       break;
+           case Ev_Bc:
+               // extend event set by Bc counters
+               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+                                                           EG_BC);
+               break;
+           case Ev_Bi:
+               // extend event set by Bi counters
+               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+                                                           EG_BI);
+               break;
 	   case Ev_G:
-	       // extend event set by Bus counter
+               // extend event set by Bus counter
 	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
 							   EG_BUS);
 	       break;
@@ -436,6 +533,22 @@
 	    regparms = 3;
 	    inew = i+1;
 	    break;
+         case Ev_Bc:
+            /* Conditional branch */
+            helperName = "log_cond_branch";
+            helperAddr = &log_cond_branch;
+            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
+            regparms = 2;
+            inew = i+1;
+            break;
+         case Ev_Bi:
+            /* Branch to an unknown destination */
+            helperName = "log_ind_branch";
+            helperAddr = &log_ind_branch;
+            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
+            regparms = 2;
+            inew = i+1;
+            break;
          case Ev_G:
             /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
             helperName = "log_global_event";
@@ -549,10 +662,51 @@
 }
 
 static
+void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
+{
+   Event* evt;
+   tl_assert(isIRAtom(guard));
+   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
+             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+   if (!CLG_(clo).simulate_branch) return;
+
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag         = Ev_Bc;
+   evt->inode       = inode;
+   evt->Ev.Bc.taken = guard;
+   clgs->events_used++;
+}
+
+static
+void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
+{
+   Event* evt;
+   tl_assert(isIRAtom(whereTo));
+   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
+             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+   if (!CLG_(clo).simulate_branch) return;
+
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag       = Ev_Bi;
+   evt->inode     = inode;
+   evt->Ev.Bi.dst = whereTo;
+   clgs->events_used++;
+}
+
+static
 void addEvent_G ( ClgState* clgs, InstrInfo* inode )
 {
    Event* evt;
    if (!CLG_(clo).collect_bus) return;
+
    if (clgs->events_used == N_EVENTS)
       flushEvents(clgs);
    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
@@ -753,6 +907,7 @@
    Int      i, isize;
    IRStmt*  st;
    Addr     origAddr;
+   Addr64   cia; /* address of current insn */
    InstrInfo* curr_inode = NULL;
    ClgState clgs;
    UInt     cJumps = 0;
@@ -789,6 +944,8 @@
    CLG_ASSERT(Ist_IMark == st->tag);
 
    origAddr = (Addr)st->Ist.IMark.addr;
+   cia   = st->Ist.IMark.addr;
+   isize = st->Ist.IMark.len;
    CLG_ASSERT(origAddr == st->Ist.IMark.addr);  // XXX: check no overflow
 
    /* Get BB struct (creating if necessary).
@@ -819,8 +976,9 @@
 	    break;
 
 	 case Ist_IMark: {
-	    CLG_ASSERT(clgs.instr_offset == (Addr)st->Ist.IMark.addr - origAddr);
-	    isize = st->Ist.IMark.len;
+            cia   = st->Ist.IMark.addr;
+            isize = st->Ist.IMark.len;
+            CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
 	    // If Vex fails to decode an instruction, the size will be zero.
 	    // Pretend otherwise.
 	    if (isize == 0) isize = VG_MIN_INSTR_SZB;
@@ -925,7 +1083,63 @@
          }
 
  	 case Ist_Exit: {
-	    UInt jmps_passed;
+            Bool guest_exit, inverted;
+
+            /* VEX code generation sometimes inverts conditional branches.
+             * As Callgrind counts (conditional) jumps, it has to correct
+             * inversions. The heuristic is the following:
+             * (1) Callgrind switches off SB chasing and unrolling, and
+             *     therefore it assumes that a candidate for inversion only is
+             *     the last conditional branch in an SB.
+             * (2) inversion is assumed if the branch jumps to the address of
+             *     the next guest instruction in memory.
+             * This heuristic is precalculated in CLG_(collectBlockInfo)().
+             *
+             * Branching behavior is also used for branch prediction. Note that
+             * the above heuristic is different from what Cachegrind does.
+             * Cachegrind uses (2) for all branches.
+             */
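+            /* Assumed shape of an inversion: for guest code
+             * "if (c) goto T", VEX may emit an exit guarded by !c that
+             * targets the next guest instruction, followed by an
+             * unconditional jump to T; the XOR further below restores
+             * the original sense of c before it reaches the predictor. */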
+            if (cJumps+1 == clgs.bb->cjmp_count)
+                inverted = clgs.bb->cjmp_inverted;
+            else
+                inverted = False;
+
+            // call branch predictor only if this is a branch in guest code
+            guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
+                         (st->Ist.Exit.jk == Ijk_Call) ||
+                         (st->Ist.Exit.jk == Ijk_Ret);
+
+            if (guest_exit) {
+                /* Stuff to widen the guard expression to a host word, so
+                   we can pass it to the branch predictor simulation
+                   functions easily. */
+                IRType   tyW    = hWordTy;
+                IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
+                IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
+                IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
+                IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
+                IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
+                IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
+                                               : IRExpr_Const(IRConst_U64(1));
+
+                /* Widen the guard expression. */
+                addStmtToIRSB( clgs.sbOut,
+                               IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
+                addStmtToIRSB( clgs.sbOut,
+                               IRStmt_WrTmp( guardW,
+                                             IRExpr_Unop(widen,
+                                                         IRExpr_RdTmp(guard1))) );
+                /* If the exit is inverted, invert the sense of the guard. */
+                addStmtToIRSB(
+                        clgs.sbOut,
+                        IRStmt_WrTmp(
+                                guard,
+                                inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
+                                    : IRExpr_RdTmp(guardW)
+                                    ));
+                /* And post the event. */
+                addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
+            }
 
 	    /* We may never reach the next statement, so need to flush
 	       all outstanding transactions now. */
@@ -940,12 +1154,9 @@
 	    /* Update global variable jmps_passed before the jump
 	     * A correction is needed if VEX inverted the last jump condition
 	    */
-	    jmps_passed = cJumps;
-	    if ((cJumps+1 == clgs.bb->cjmp_count) && clgs.bb->cjmp_inverted)
-		jmps_passed++;
 	    addConstMemStoreStmt( clgs.sbOut,
 				  (UWord) &CLG_(current_state).jmps_passed,
-				  jmps_passed, hWordTy);
+                                  inverted ? cJumps+1 : cJumps, hWordTy);
 	    cJumps++;
 
 	    break;
@@ -966,6 +1177,26 @@
       }
    }
 
+   /* Deal with branches to unknown destinations, but ignore ones which
+      are function returns, as we assume the return stack predictor
+      never mispredicts. */
+   if (sbIn->jumpkind == Ijk_Boring) {
+      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
+      switch (sbIn->next->tag) {
+         case Iex_Const:
+            break; /* boring - branch to known address */
+         case Iex_RdTmp:
+            /* looks like an indirect branch (branch to unknown) */
+            addEvent_Bi( &clgs, curr_inode, sbIn->next );
+            break;
+         default:
+            /* shouldn't happen - if the incoming IR is properly
+               flattened, should only have tmp and const cases to
+               consider. */
+            tl_assert(0);
+      }
+   }
+
    /* At the end of the bb.  Flush outstandings. */
    flushEvents( &clgs );
 
@@ -1236,10 +1467,61 @@
   }
 }
 
+static UInt ULong_width(ULong n)
+{
+   UInt w = 0;
+   while (n > 0) {
+      n = n / 10;
+      w++;
+   }
+   if (w == 0) w = 1;
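+   /* e.g. n=1234567: 7 digits plus (7-1)/3 = 2 commas -> width 9 */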
+   return w + (w-1)/3;   // add space for commas
+}
+
+static
+void branchsim_printstat(int l1, int l2, int l3)
+{
+    static Char buf1[128], buf2[128], buf3[128], fmt[128];
+    FullCost total;
+    ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
+    ULong B_total_b, B_total_mp;
+
+    total = CLG_(total_cost);
+    Bc_total_b  = total[ fullOffset(EG_BC)   ];
+    Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
+    Bi_total_b  = total[ fullOffset(EG_BI)   ];
+    Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
+
+    /* Make format string, getting width right for numbers */
+    VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
+                 l1, l2, l3);
+
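+    /* avoid division by zero in the percentify calls below */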
+    if (0 == Bc_total_b)  Bc_total_b = 1;
+    if (0 == Bi_total_b)  Bi_total_b = 1;
+    B_total_b  = Bc_total_b  + Bi_total_b;
+    B_total_mp = Bc_total_mp + Bi_total_mp;
+
+    VG_(umsg)("\n");
+    VG_(umsg)(fmt, "Branches:     ",
+              B_total_b, Bc_total_b, Bi_total_b);
+
+    VG_(umsg)(fmt, "Mispredicts:  ",
+              B_total_mp, Bc_total_mp, Bi_total_mp);
+
+    VG_(percentify)(B_total_mp,  B_total_b,  1, l1+1, buf1);
+    VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
+    VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);
+
+    VG_(umsg)("Mispred rate:  %s (%s     + %s   )\n", buf1, buf2,buf3);
+}
+
+
 static
 void finish(void)
 {
-  char buf[RESULTS_BUF_LEN];
+  Char buf[RESULTS_BUF_LEN], fmt[128];
+  Int l1, l2, l3;
+  FullCost total;
 
   CLG_DEBUG(0, "finish()\n");
 
@@ -1334,8 +1616,33 @@
   VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
   VG_(message)(Vg_UserMsg, "\n");
 
-  //  if (CLG_(clo).simulate_cache)
-  (*CLG_(cachesim).printstat)();
+  /* determine value widths for statistics */
+  total = CLG_(total_cost);
+  l1 = ULong_width( total[fullOffset(EG_IR)] );
+  l2 = l3 = 0;
+  if (CLG_(clo).simulate_cache) {
+      l2 = ULong_width( total[fullOffset(EG_DR)] );
+      l3 = ULong_width( total[fullOffset(EG_DW)] );
+  }
+  if (CLG_(clo).simulate_branch) {
+      int l2b = ULong_width( total[fullOffset(EG_BC)] );
+      int l3b = ULong_width( total[fullOffset(EG_BI)] );
+      if (l2b > l2) l2 = l2b;
+      if (l3b > l3) l3 = l3b;
+  }
+
+  /* Make format string, getting width right for numbers */
+  VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
+
+  /* Always print this */
+  VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );
+
+  if (CLG_(clo).simulate_cache)
+      (*CLG_(cachesim).printstat)(l1, l2, l3);
+
+  if (CLG_(clo).simulate_branch)
+      branchsim_printstat(l1, l2, l3);
+
 }
 
 
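For illustration, the summary printed by branchsim_printstat() has this
shape (numbers invented):

  Branches:      1,234,567  (1,100,000 cond + 134,567 ind)
  Mispredicts:      23,456  (   20,000 cond +   3,456 ind)
  Mispred rate:        1.9% (      1.8%     +     2.6%   )
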
diff --git a/callgrind/sim.c b/callgrind/sim.c
index 01fd5bc..0841d2c 100644
--- a/callgrind/sim.c
+++ b/callgrind/sim.c
@@ -1490,8 +1490,7 @@
 void cachesim_print_opts(void)
 {
   VG_(printf)(
-"\n   cache simulator options:\n"
-"    --simulate-cache=no|yes   Do cache simulation [no]\n"
+"\n   cache simulator options (does cache simulation if used):\n"
 "    --simulate-wb=no|yes      Count write-back events [no]\n"
 "    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
 #if CLG_EXPERIMENTAL
@@ -1614,7 +1613,7 @@
 }
 
 static
-void cachesim_printstat(void)
+void cachesim_printstat(Int l1, Int l2, Int l3)
 {
   FullCost total = CLG_(total_cost), D_total = 0;
   ULong L2_total_m, L2_total_mr, L2_total_mw,
@@ -1622,7 +1621,6 @@
   char buf1[RESULTS_BUF_LEN], 
     buf2[RESULTS_BUF_LEN], 
     buf3[RESULTS_BUF_LEN];
-  Int l1, l2, l3;
   Int p;
 
   if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
@@ -1633,13 +1631,6 @@
     VG_(message)(Vg_DebugMsg, "\n");
   }
 
-  /* I cache results.  Use the I_refs value to determine the first column
-   * width. */
-  l1 = commify(total[fullOffset(EG_IR)], 0, buf1);
-  VG_(message)(Vg_UserMsg, "I   refs:      %s\n", buf1);
-
-  if (!CLG_(clo).simulate_cache) return;
-
   commify(total[fullOffset(EG_IR) +1], l1, buf1);
   VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);
 
@@ -1671,8 +1662,8 @@
   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );
 
   commify( D_total[0], l1, buf1);
-  l2 = commify(total[fullOffset(EG_DR)], 0,  buf2);
-  l3 = commify(total[fullOffset(EG_DW)], 0,  buf3);
+  commify(total[fullOffset(EG_DR)], l2,  buf2);
+  commify(total[fullOffset(EG_DW)], l3,  buf3);
   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
 	       buf1,  buf2,  buf3);
 
@@ -1782,6 +1773,11 @@
         CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "D2dmw");
     }
 
+    if (CLG_(clo).simulate_branch) {
+        CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
+        CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
+    }
+
     if (CLG_(clo).collect_bus)
 	CLG_(register_event_group)(EG_BUS, "Ge");
 
@@ -1796,6 +1792,7 @@
 
     // event set comprising all event groups, used for inclusive cost
     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
+    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
     CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
 
@@ -1819,6 +1816,10 @@
     CLG_(append_event)(CLG_(dumpmap), "I2dmr");
     CLG_(append_event)(CLG_(dumpmap), "D2dmr");
     CLG_(append_event)(CLG_(dumpmap), "D2dmw");
+    CLG_(append_event)(CLG_(dumpmap), "Bc");
+    CLG_(append_event)(CLG_(dumpmap), "Bcm");
+    CLG_(append_event)(CLG_(dumpmap), "Bi");
+    CLG_(append_event)(CLG_(dumpmap), "Bim");
     CLG_(append_event)(CLG_(dumpmap), "AcCost1");
     CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
     CLG_(append_event)(CLG_(dumpmap), "AcCost2");
diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr
index 1a58540..d2d7544 100755
--- a/callgrind/tests/filter_stderr
+++ b/callgrind/tests/filter_stderr
@@ -19,6 +19,9 @@
 # Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
 perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
 
+# Remove numbers from "Branches:", "Mispredicts:", and "Mispred rate:" lines
+perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' |
+
 # Remove CPUID warnings lines for P4s and other machines
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
 sed "/Simulating a 16 KB I-cache with 32 B lines/d"   |
diff --git a/callgrind/tests/simwork-both.stderr.exp b/callgrind/tests/simwork-both.stderr.exp
new file mode 100644
index 0000000..b742c21
--- /dev/null
+++ b/callgrind/tests/simwork-both.stderr.exp
@@ -0,0 +1,24 @@
+
+
+Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw Bc Bcm Bi Bim
+Collected :
+
+I   refs:
+I1  misses:
+L2i misses:
+I1  miss rate:
+L2i miss rate:
+
+D   refs:
+D1  misses:
+L2d misses:
+D1  miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
+
+Branches:
+Mispredicts:
+Mispred rate:
diff --git a/callgrind/tests/simwork-both.stdout.exp b/callgrind/tests/simwork-both.stdout.exp
new file mode 100644
index 0000000..d4c867c
--- /dev/null
+++ b/callgrind/tests/simwork-both.stdout.exp
@@ -0,0 +1 @@
+Sum: 1000000
diff --git a/callgrind/tests/simwork-both.vgtest b/callgrind/tests/simwork-both.vgtest
new file mode 100644
index 0000000..19c3ff8
--- /dev/null
+++ b/callgrind/tests/simwork-both.vgtest
@@ -0,0 +1,3 @@
+prog: simwork
+vgopts: --cache-sim=yes --branch-sim=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/simwork-branch.stderr.exp b/callgrind/tests/simwork-branch.stderr.exp
new file mode 100644
index 0000000..7cda62e
--- /dev/null
+++ b/callgrind/tests/simwork-branch.stderr.exp
@@ -0,0 +1,10 @@
+
+
+Events    : Ir Bc Bcm Bi Bim
+Collected :
+
+I   refs:
+
+Branches:
+Mispredicts:
+Mispred rate:
diff --git a/callgrind/tests/simwork-branch.stdout.exp b/callgrind/tests/simwork-branch.stdout.exp
new file mode 100644
index 0000000..d4c867c
--- /dev/null
+++ b/callgrind/tests/simwork-branch.stdout.exp
@@ -0,0 +1 @@
+Sum: 1000000
diff --git a/callgrind/tests/simwork-branch.vgtest b/callgrind/tests/simwork-branch.vgtest
new file mode 100644
index 0000000..a866e1e
--- /dev/null
+++ b/callgrind/tests/simwork-branch.vgtest
@@ -0,0 +1,3 @@
+prog: simwork
+vgopts: --branch-sim=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/simwork-cache.stderr.exp b/callgrind/tests/simwork-cache.stderr.exp
new file mode 100644
index 0000000..0705c1c
--- /dev/null
+++ b/callgrind/tests/simwork-cache.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Collected :
+
+I   refs:
+I1  misses:
+L2i misses:
+I1  miss rate:
+L2i miss rate:
+
+D   refs:
+D1  misses:
+L2d misses:
+D1  miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/simwork-cache.stdout.exp b/callgrind/tests/simwork-cache.stdout.exp
new file mode 100644
index 0000000..d4c867c
--- /dev/null
+++ b/callgrind/tests/simwork-cache.stdout.exp
@@ -0,0 +1 @@
+Sum: 1000000
diff --git a/callgrind/tests/simwork-cache.vgtest b/callgrind/tests/simwork-cache.vgtest
new file mode 100644
index 0000000..ce222c0
--- /dev/null
+++ b/callgrind/tests/simwork-cache.vgtest
@@ -0,0 +1,3 @@
+prog: simwork
+vgopts: --cache-sim=yes
+cleanup: rm callgrind.out.*