Callgrind new feature: count global bus lock events "Ge"

To count global bus lock events, use "--collect-bus=yes".
For x86, this will count the number of executed instructions
with a lock prefix; for architectures with LL/SC, this will
count the number of executed SC instructions.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11167 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/callgrind/clo.c b/callgrind/clo.c
index 6d7df37..e4da421 100644
--- a/callgrind/clo.c
+++ b/callgrind/clo.c
@@ -415,6 +415,8 @@
    /* compatibility alias, deprecated option */
    else if VG_BOOL_CLO(arg, "--trace-jump",    CLG_(clo).collect_jumps) {}
 
+   else if VG_BOOL_CLO(arg, "--collect-bus", CLG_(clo).collect_bus) {}
+
    else if VG_BOOL_CLO(arg, "--combine-dumps", CLG_(clo).combine_dumps) {}
 
    else if VG_BOOL_CLO(arg, "--collect-atstart", CLG_(clo).collect_atstart) {}
@@ -572,6 +574,7 @@
 "    --collect-atstart=no|yes  Collect at process/thread start [yes]\n"
 "    --toggle-collect=<func>   Toggle collection on enter/leave function\n"
 "    --collect-jumps=no|yes    Collect jumps? [no]\n"
+"    --collect-bus=no|yes      Collect global bus events? [no]\n"
 #if CLG_EXPERIMENTAL
 "    --collect-alloc=no|yes    Collect memory allocation info? [no]\n"
 #endif
diff --git a/callgrind/docs/cl-manual.xml b/callgrind/docs/cl-manual.xml
index 7e43bfa..a133984 100644
--- a/callgrind/docs/cl-manual.xml
+++ b/callgrind/docs/cl-manual.xml
@@ -353,10 +353,27 @@
   start event collection a few million instructions after you have enabled
   instrumentation.</para>
 
-
   </sect2>
 
+  <sect2 id="cl-manual.busevents" xreflabel="Counting global bus events">
+  <title>Counting global bus events</title>
 
+  <para>For access to shared data among threads in a multithreaded
+  code, synchronization is required to avoid race conditions.
+  Synchronization primitives are usually implemented via atomic instructions.
+  However, excessive use of such instructions can lead to performance
+  issues.</para>
+
+  <para>To enable analysis of this problem, Callgrind optionally can count
+  the number of atomic instructions executed. More precisely, for x86/x86_64,
+  these are instructions using a lock prefix. For architectures supporting
+  LL/SC, these are the SC instructions executed. For both, the term
+  "global bus events" is used.</para>
+
+  <para>The short name of the event type used for global bus events is "Ge".
+  To count global bus events, use <option><xref linkend="opt.collect-bus"/></option>.
+  </para>
+  </sect2>
 
   <sect2 id="cl-manual.cycles" xreflabel="Avoiding cycles">
   <title>Avoiding cycles</title>
@@ -762,6 +779,16 @@
     </listitem>
   </varlistentry>
 
+  <varlistentry id="opt.collect-bus" xreflabel="--collect-bus">
+    <term>
+      <option><![CDATA[--collect-bus=<no|yes> [default: no] ]]></option>
+    </term>
+    <listitem>
+      <para>This specifies whether the number of global bus events executed
+      should be collected. The event type "Ge" is used for these events.</para>
+    </listitem>
+  </varlistentry>
+
 </variablelist>
 <!-- end of xi:include in the manpage -->
 </sect2>
diff --git a/callgrind/global.h b/callgrind/global.h
index b285715..db694a8 100644
--- a/callgrind/global.h
+++ b/callgrind/global.h
@@ -87,6 +87,8 @@
   Bool collect_alloc;    /* Collect size of allocated memory */
   Bool collect_systime;  /* Collect time for system calls */
 
+  Bool collect_bus;      /* Collect global bus events */
+
   /* Instrument options */
   Bool instrument_atstart;  /* Instrument at start? */
   Bool simulate_cache;      /* Call into cache simulator ? */
@@ -679,8 +681,9 @@
 #define EG_IR    1
 #define EG_DR    2
 #define EG_DW    3
-#define EG_ALLOC 4
-#define EG_SYS   5
+#define EG_BUS   4
+#define EG_ALLOC 5
+#define EG_SYS   6
 
 struct event_sets {
     EventSet *base, *full;
diff --git a/callgrind/main.c b/callgrind/main.c
index 4aa3ce7..c0290b4 100644
--- a/callgrind/main.c
+++ b/callgrind/main.c
@@ -95,6 +95,30 @@
 
 
 /*------------------------------------------------------------*/
+/*--- Simple callbacks (not cache simulator)               ---*/
+/*------------------------------------------------------------*/
+
+VG_REGPARM(1)
+static void log_global_event(InstrInfo* ii)
+{
+    ULong* cost_Bus;
+
+    CLG_DEBUG(0, "log_global_event:  Ir  %#lx/%u\n",
+              CLG_(bb_base) + ii->instr_offset, ii->instr_size);
+
+    if (!CLG_(current_state).collect) return;
+
+    CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;
+
+    if (CLG_(current_state).nonskipped)
+        cost_Bus = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
+    else
+        cost_Bus = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
+    cost_Bus[0]++;
+}
+
+
+/*------------------------------------------------------------*/
 /*--- Instrumentation structures and event queue handling  ---*/
 /*------------------------------------------------------------*/
 
@@ -137,6 +161,7 @@
       Ev_Dr,  // Data read
       Ev_Dw,  // Data write
       Ev_Dm,  // Data modify (read then write)
+      Ev_G    // Global bus event
    }
    EventTag;
 
@@ -159,6 +184,8 @@
 	    IRAtom* ea;
 	    Int     szB;
 	 } Dm;
+	 struct {
+	 } G;
       } Ev;
    }
    Event;
@@ -242,6 +269,9 @@
 	 ppIRExpr(ev->Ev.Dm.ea);
 	 VG_(printf)("\n");
 	 break;
+      case Ev_G:
+         VG_(printf)("G  %p\n", ev->inode);
+         break;
       default:
 	 tl_assert(0);
 	 break;
@@ -286,6 +316,11 @@
 	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
 							   EG_DW);
 	       break;
+	   case Ev_G:
+	       // extend event set by Bus counter
+	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+							   EG_BUS);
+	       break;
 	   default:
 	       tl_assert(0);
 	   }
@@ -401,6 +436,14 @@
 	    regparms = 3;
 	    inew = i+1;
 	    break;
+         case Ev_G:
+            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
+            helperName = "log_global_event";
+            helperAddr = &log_global_event;
+            argv = mkIRExprVec_1( i_node_expr );
+            regparms = 1;
+            inew = i+1;
+            break;
 	 default:
 	    tl_assert(0);
       }
@@ -505,6 +548,21 @@
    clgs->events_used++;
 }
 
+static
+void addEvent_G ( ClgState* clgs, InstrInfo* inode )
+{
+   Event* evt;
+   if (!CLG_(clo).collect_bus) return;
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag       = Ev_G;
+   evt->inode     = inode;
+   clgs->events_used++;
+}
+
 /* Initialise or check (if already seen before) an InstrInfo for next insn.
    We only can set instr_offset/instr_size here. The required event set and
    resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest
@@ -840,6 +898,7 @@
                dataSize *= 2; /* since this is a doubleword-cas */
             addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
             addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
+            addEvent_G(  &clgs, curr_inode );
             break;
          }
 
@@ -855,6 +914,12 @@
                dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata);
                addEvent_Dw( &clgs, curr_inode,
                             sizeofIRType(dataTy), st->Ist.LLSC.addr );
+               /* I don't know whether the global-bus-lock cost should
+                  be attributed to the LL or the SC, but it doesn't
+                  really matter since they always have to be used in
+                  pairs anyway.  Hence put it (quite arbitrarily) on
+                  the SC. */
+               addEvent_G(  &clgs, curr_inode );
             }
             break;
          }
diff --git a/callgrind/sim.c b/callgrind/sim.c
index 61377d1..4282456 100644
--- a/callgrind/sim.c
+++ b/callgrind/sim.c
@@ -1782,6 +1782,9 @@
 	CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "I2dmw");
     }
 
+    if (CLG_(clo).collect_bus)
+	CLG_(register_event_group)(EG_BUS, "Ge");
+
     if (CLG_(clo).collect_alloc)
 	CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");
 
@@ -1793,6 +1796,7 @@
 
     // event set comprising all event groups, used for inclusive cost
     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
+    CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
 
     CLG_DEBUGIF(1) {
@@ -1819,6 +1823,7 @@
     CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
     CLG_(append_event)(CLG_(dumpmap), "AcCost2");
     CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
+    CLG_(append_event)(CLG_(dumpmap), "Ge");
     CLG_(append_event)(CLG_(dumpmap), "allocCount");
     CLG_(append_event)(CLG_(dumpmap), "allocSize");
     CLG_(append_event)(CLG_(dumpmap), "sysCount");
@@ -1832,7 +1837,8 @@
 {
     if (!CLG_(clo).simulate_cache)
 	cost[ fullOffset(EG_IR) ] += exe_count;
-    else
+
+    if (ii->eventset)
 	CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
 				  ii->eventset, bbcc->cost + ii->cost_offset);
 }