Callgrind merge: code


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5780 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/callgrind/main.c b/callgrind/main.c
new file mode 100644
index 0000000..dd19b3b
--- /dev/null
+++ b/callgrind/main.c
@@ -0,0 +1,1086 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Callgrind                                                    ---*/
+/*---                                                       main.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Callgrind, a Valgrind tool for call graph
+   profiling programs.
+
+   Copyright (C) 2002-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+   This skin is derived from and contains code from Cachegrind
+   Copyright (C) 2002-2005 Nicholas Nethercote (njn25@cam.ac.uk)
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "config.h"
+#include "callgrind.h"
+#include "global.h"
+
+#include <pub_tool_threadstate.h>
+
+/*------------------------------------------------------------*/
+/*--- Global variables                                     ---*/
+/*------------------------------------------------------------*/
+
+/* for all threads */
+/* Option values; read throughout this file as CLG_(clo).<field> */
+CommandLineOptions CLG_(clo);
+/* Counters reset by CLG_(init_statistics) and reported by finish() */
+Statistics CLG_(stat);
+/* Starts as True; CLG_(post_clo_init) overwrites it with
+ * CLG_(clo).instrument_atstart and CLG_(set_instrument_state) changes it
+ * at run time.  While False, CLG_(instrument) returns blocks unmodified. */
+Bool CLG_(instrument_state) = True; /* Instrumentation on ? */
+
+/* thread and signal handler specific */
+/* Re-initialised with CLG_(init_exec_state) when instrumentation is
+ * switched off (see CLG_(set_instrument_state)). */
+exec_state CLG_(current_state);
+
+
+/*------------------------------------------------------------*/
+/*--- Statistics                                           ---*/
+/*------------------------------------------------------------*/
+
+/* Reset every counter in *s to zero.
+ *
+ * Called from CLG_(post_clo_init) on the global CLG_(stat); the values
+ * are reported by finish() when running verbosely.
+ */
+static void CLG_(init_statistics)(Statistics* s)
+{
+  /* execution counters */
+  s->call_counter        = 0;
+  s->jcnd_counter        = 0;
+  s->jump_counter        = 0;
+  s->rec_call_counter    = 0;
+  s->ret_counter         = 0;
+  s->bb_executions       = 0;
+
+  s->context_counter     = 0;
+  s->bb_retranslations   = 0;
+
+  /* numbers of distinct items seen */
+  s->distinct_objs       = 0;
+  s->distinct_files      = 0;
+  s->distinct_fns        = 0;
+  s->distinct_contexts   = 0;
+  s->distinct_bbs        = 0;
+  s->distinct_bbccs      = 0;
+  /* distinct_jccs is reported by finish() like its siblings, so it has
+   * to be reset here as well */
+  s->distinct_jccs       = 0;
+  s->distinct_instrs     = 0;
+  s->distinct_skips      = 0;
+
+  /* resize counters of internal tables and stacks */
+  s->bb_hash_resizes     = 0;
+  s->bbcc_hash_resizes   = 0;
+  s->jcc_hash_resizes    = 0;
+  s->cxt_hash_resizes    = 0;
+  s->fn_array_resizes    = 0;
+  s->call_stack_resizes  = 0;
+  s->fn_stack_resizes    = 0;
+
+  /* BB counts per kind of available debug info */
+  s->full_debug_BBs      = 0;
+  s->file_line_debug_BBs = 0;
+  s->fn_name_debug_BBs   = 0;
+  s->no_debug_BBs        = 0;
+
+  s->bbcc_lru_misses     = 0;
+  s->jcc_lru_misses      = 0;
+  s->cxt_lru_misses      = 0;
+  s->bbcc_clones         = 0;
+}
+
+
+    
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation instrumentation phase               ---*/
+/*------------------------------------------------------------*/
+
+
+/* Tell whether the load and the store address expression of one
+ * instruction are the same atom, as decided by eqIRAtom().
+ * Both expressions have to be IR atoms (asserted).
+ */
+static Bool loadStoreAddrsMatch(IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+{
+  Bool sameAddr;
+
+  /* Assumption: for 'modify' instructions Vex always produces both
+   * address expressions with the same kind, i.e. both Tmp or both Const. */
+  CLG_ASSERT(isIRAtom(loadAddrExpr));
+  CLG_ASSERT(isIRAtom(storeAddrExpr));
+
+  sameAddr = eqIRAtom(loadAddrExpr, storeAddrExpr);
+  return sameAddr;
+}
+
+/* Append to bbOut a call to the cache simulator log helper fitting the
+ * memory access pattern of one guest instruction.
+ *
+ * ii            - per-instruction info, passed to the helper as 1st argument
+ * dataSize      - only checked here: has to be 0 without memory access
+ * instrIssued   - True selects the "0I" helper variants, False the "1I" ones
+ * loadAddrExpr  - address atom of the load, or NULL
+ * storeAddrExpr - address atom of the store, or NULL
+ *
+ * Returns the event set belonging to the chosen helper, or 0 if the
+ * helper pointer for this case is unset; nothing is added to bbOut then.
+ */
+static
+EventSet* insert_simcall(IRBB* bbOut, InstrInfo* ii, UInt dataSize,
+                         Bool instrIssued,
+                         IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+{
+    enum { SimAcc_None, SimAcc_Read, SimAcc_Write, SimAcc_ReadWrite } access;
+
+    HChar*    fnName;
+    void*     fnAddr;
+    Int       nArgs;
+    EventSet* evSet;
+    IRExpr   *iiArg, *addrArg1 = 0, *addrArg2 = 0, **args;
+    IRDirty*  dirty;
+
+    /* Step 1: classify the memory access of the original instruction */
+    if (!loadAddrExpr) {
+        if (!storeAddrExpr) {
+            CLG_ASSERT(0 == dataSize);
+            access = SimAcc_None;
+        }
+        else {
+            CLG_ASSERT( isIRAtom(storeAddrExpr) );
+            access = SimAcc_Write;
+        }
+    }
+    else if (!storeAddrExpr) {
+        CLG_ASSERT( isIRAtom(loadAddrExpr) );
+        access = SimAcc_Read;
+    }
+    else {
+        CLG_ASSERT( isIRAtom(loadAddrExpr) );
+        CLG_ASSERT( isIRAtom(storeAddrExpr) );
+
+        /* modify (same address read and written): handled as a write
+         * access, as this is more resource consuming (as in callgrind
+         * for VG2).  Cachegrind does a read here (!)
+         * DISCUSS: Best way depends on simulation model?
+         */
+        if ( loadStoreAddrsMatch(loadAddrExpr, storeAddrExpr) )
+            access = SimAcc_Write;
+        else
+            access = SimAcc_ReadWrite;
+    }
+
+    /* Step 2: pick helper, argument count and event set for that class */
+    switch (access) {
+    case SimAcc_None:
+        if (instrIssued) {
+            /* nothing left to log */
+            fnName = 0;
+            fnAddr = 0;
+        }
+        else {
+            fnName = CLG_(cachesim).log_1I0D_name;
+            fnAddr = CLG_(cachesim).log_1I0D;
+        }
+        nArgs = 1;
+        evSet = CLG_(sets).D0;
+        break;
+
+    case SimAcc_Read:
+        if (instrIssued) {
+            fnName = CLG_(cachesim).log_0I1Dr_name;
+            fnAddr = CLG_(cachesim).log_0I1Dr;
+        }
+        else {
+            fnName = CLG_(cachesim).log_1I1Dr_name;
+            fnAddr = CLG_(cachesim).log_1I1Dr;
+        }
+        nArgs = 2;
+        addrArg1 = loadAddrExpr;
+        evSet = CLG_(sets).D1r;
+        break;
+
+    case SimAcc_Write:
+        if (instrIssued) {
+            fnName = CLG_(cachesim).log_0I1Dw_name;
+            fnAddr = CLG_(cachesim).log_0I1Dw;
+        }
+        else {
+            fnName = CLG_(cachesim).log_1I1Dw_name;
+            fnAddr = CLG_(cachesim).log_1I1Dw;
+        }
+        nArgs = 2;
+        addrArg1 = storeAddrExpr;
+        evSet = CLG_(sets).D1w;
+        break;
+
+    case SimAcc_ReadWrite:
+    default:
+        if (instrIssued) {
+            fnName = CLG_(cachesim).log_0I2D_name;
+            fnAddr = CLG_(cachesim).log_0I2D;
+        }
+        else {
+            fnName = CLG_(cachesim).log_1I2D_name;
+            fnAddr = CLG_(cachesim).log_1I2D;
+        }
+        nArgs = 3;
+        addrArg1 = loadAddrExpr;
+        addrArg2 = storeAddrExpr;
+        evSet = CLG_(sets).D2;
+        break;
+    }
+
+    /* helper could be unset depending on the simulator used */
+    if (fnAddr == 0) return 0;
+
+    /* Step 3: build the argument vector; 1st argument is the InstrInfo */
+    iiArg = mkIRExpr_HWord( (HWord)ii );
+
+    switch (nArgs) {
+    case 1:
+        args = mkIRExprVec_1(iiArg);
+        break;
+    case 2:
+        args = mkIRExprVec_2(iiArg, addrArg1);
+        break;
+    case 3:
+        args = mkIRExprVec_3(iiArg, addrArg1, addrArg2);
+        break;
+    default:
+        VG_(tool_panic)("argc... not 1 or 2 or 3?");
+    }
+
+    /* Step 4: add the call to the instrumentation function */
+    dirty = unsafeIRDirty_0_N( nArgs, fnName, fnAddr, args);
+    addStmtToIRBB( bbOut, IRStmt_Dirty(dirty) );
+
+    return evSet;
+}
+
+
+/* Instrumentation emitted before a conditional jump or at the end of
+ * each original instruction.
+ *
+ * Adds the simulator call (via insert_simcall) to bbOut and then either
+ * fills the InstrInfo struct *ii (block not seen before) or checks that
+ * the already stored values are the same (retranslation).
+ * *cost_offset is advanced by the size of the event set in use.
+ */
+static
+void endOfInstr(IRBB* bbOut, InstrInfo* ii, Bool bb_seen_before,
+                UInt instr_offset, UInt instrLen, UInt dataSize,
+                UInt* cost_offset, Bool instrIssued,
+                IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+{
+   IRType    addrTy;
+   EventSet* evSet;
+
+   /* Sanity: address expressions must have the host word type */
+   CLG_ASSERT(sizeof(HWord) == sizeof(void*));
+   switch (sizeof(HWord)) {
+   case 4:
+      addrTy = Ity_I32;
+      break;
+   case 8:
+      addrTy = Ity_I64;
+      break;
+   default:
+      VG_(tool_panic)("endOfInstr: strange word size");
+   }
+
+   if (loadAddrExpr) {
+      CLG_ASSERT(addrTy == typeOfIRExpr(bbOut->tyenv, loadAddrExpr));
+   }
+   if (storeAddrExpr) {
+      CLG_ASSERT(addrTy == typeOfIRExpr(bbOut->tyenv, storeAddrExpr));
+   }
+
+   /* Instructions with large data size (eg. 28B, 108B, 512B on x86) are
+    * handled inaccurately by clamping; they are very rare, and this
+    * avoids errors from hitting more than two cache lines in the
+    * simulation. */
+   dataSize = (dataSize > MIN_LINE_SIZE) ? MIN_LINE_SIZE : dataSize;
+
+   /* evSet is 0 if the simulator needs no instrumentation */
+   evSet = insert_simcall(bbOut, ii, dataSize, instrIssued,
+                          loadAddrExpr, storeAddrExpr);
+
+   if (!bb_seen_before) {
+       /* first translation: remember everything about this instruction */
+       ii->instr_offset = instr_offset;
+       ii->instr_size   = instrLen;
+       ii->data_size    = dataSize;
+       ii->cost_offset  = *cost_offset;
+       ii->eventset     = evSet;
+
+       CLG_(stat).distinct_instrs++;
+   }
+   else {
+       /* retranslation: the stored info has to match */
+       CLG_ASSERT(ii->instr_offset == instr_offset);
+       CLG_ASSERT(ii->instr_size == instrLen);
+       CLG_ASSERT(ii->data_size == dataSize);
+       CLG_ASSERT(ii->cost_offset == *cost_offset);
+       CLG_ASSERT(ii->eventset == evSet);
+   }
+
+   *cost_offset += evSet ? evSet->size : 0;
+
+   CLG_DEBUG(5, "  Instr +%2d (Size %d, DSize %d): ESet %s (Size %d)\n",
+             instr_offset, instrLen, dataSize,
+             evSet ? evSet->name : (Char*)"(no Instr)",
+             evSet ? evSet->size : 0);
+}
+
+/* CLGEndness: IR endianness tag used for the stores generated in
+ * addConstMemStoreStmt(), selected from the VG_BIGENDIAN /
+ * VG_LITTLEENDIAN configuration macros.  The build fails if neither
+ * macro is defined. */
+#if defined(VG_BIGENDIAN)
+# define CLGEndness Iend_BE
+#elif defined(VG_LITTLEENDIAN)
+# define CLGEndness Iend_LE
+#else
+# error "Unknown endianness"
+#endif
+
+/* Extract the value of an IR constant as Addr.
+ * The constant has to be tagged Ico_U32 if Addr is 4 bytes wide and
+ * Ico_U64 if Addr is 8 bytes wide (asserted); any other Addr size
+ * panics.
+ */
+static
+Addr IRConst2Addr(IRConst* con)
+{
+    Addr result;
+
+    switch (sizeof(Addr)) {
+    case 4:
+        CLG_ASSERT( con->tag == Ico_U32 );
+        result = con->Ico.U32;
+        break;
+
+    case 8:
+        CLG_ASSERT( con->tag == Ico_U64 );
+        result = con->Ico.U64;
+        break;
+
+    default:
+        VG_(tool_panic)("Callgrind: invalid Addr type");
+    }
+
+    return result;
+}
+
+/* First pass over a BB which is to be instrumented: counts guest
+ * instructions and conditional jumps.  The numbers are needed for the
+ * size of the BB struct to allocate.
+ *
+ * bbIn          - block to scan; nothing is done if NULL
+ * instrs        - incremented once per IMark statement
+ * cjmps         - incremented once per Exit statement after the first IMark
+ * cjmp_inverted - set to True if the last Exit of the last instruction
+ *                 targets the address directly following that
+ *                 instruction, otherwise to False
+ *
+ * Called from CLG_(get_bb)
+ */
+void CLG_(collectBlockInfo)(IRBB* bbIn,
+                            /*INOUT*/ UInt* instrs,
+                            /*INOUT*/ UInt* cjmps,
+                            /*INOUT*/ Bool* cjmp_inverted)
+{
+    Int     idx;
+    IRStmt* stmt;
+    Addr    curAddr = 0;
+    UInt    curLen = 0;
+    Bool    exitsToNext = False;
+
+    /* Exits in the preamble, i.e. before the first IMark, are ignored:
+     * preamble code is added by VEX for self modifying code and has
+     * nothing to do with client code. */
+    Bool    seenIMark = False;
+
+    if (!bbIn) return;
+
+    for (idx = 0; idx < bbIn->stmts_used; idx++) {
+        stmt = bbIn->stmts[idx];
+
+        if (stmt->tag == Ist_IMark) {
+            /* a new guest instruction starts here */
+            seenIMark = True;
+            curAddr = (Addr)ULong_to_Ptr(stmt->Ist.IMark.addr);
+            curLen  = stmt->Ist.IMark.len;
+
+            (*instrs)++;
+            exitsToNext = False;
+        }
+        else if (seenIMark && stmt->tag == Ist_Exit) {
+            Addr target = IRConst2Addr(stmt->Ist.Exit.dst);
+            exitsToNext = (target == curAddr + curLen);
+
+            (*cjmps)++;
+        }
+    }
+
+    /* If the last instruction of the BB conditionally jumps to the next
+     * instruction (= first instruction of the next BB in memory), the
+     * jump condition was inverted by VEX.
+     */
+    *cjmp_inverted = exitsToNext;
+}
+
+/* Look at one flat IR statement and update the information gathered
+ * for the guest instruction it belongs to.
+ *
+ * tyenv         - type environment used to get the type of stored data
+ * st            - the statement to inspect; has to be flat (asserted)
+ * instrAddr,
+ * instrLen      - updated from IMark statements
+ * loadAddrExpr  - set for loads, and for dirty helpers reading memory
+ * storeAddrExpr - set for the first store of an instruction, and for
+ *                 dirty helpers writing memory
+ * dataSize      - set to the size of the memory access found
+ *
+ * bbOut and hWordTy are not used here.
+ * Panics on statement kinds which are not handled.
+ */
+static
+void collectStatementInfo(IRTypeEnv* tyenv, IRBB* bbOut, IRStmt* st,
+                          Addr* instrAddr, UInt* instrLen,
+                          IRExpr** loadAddrExpr, IRExpr** storeAddrExpr,
+                          UInt* dataSize, IRType hWordTy)
+{
+   CLG_ASSERT(isFlatIRStmt(st));
+
+   switch (st->tag) {
+
+   /* Statement kinds carrying no information of interest here
+    * (this includes ABI hints) */
+   case Ist_NoOp:
+   case Ist_AbiHint:
+   case Ist_Put:
+   case Ist_PutI:
+   case Ist_MFence:
+   case Ist_Exit:
+      return;
+
+   case Ist_IMark:
+      /* IMark.addr is a 64-bit int, converted with ULong_to_Ptr to the
+       * native pointer type.  On a 32-bit host this is expected to drop
+       * the upper 32 bits, which should be harmless: CLG_(instrument)
+       * panics if host and guest word size differ, so the address was
+       * derived from a 32-bit guest code address then. */
+      *instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
+      *instrLen  = st->Ist.IMark.len;
+      return;
+
+   case Ist_Tmp: {
+      IRExpr* rhs = st->Ist.Tmp.data;
+      if (rhs->tag != Iex_Load) return;
+
+      CLG_ASSERT( isIRAtom(rhs->Iex.Load.addr) );
+      /* Endianness info of the load is ignored.
+       * XXX: a later load of the same instruction overwrites an earlier
+       * one, e.g. repe cmpsb does two loads and the first is lost here. */
+      *loadAddrExpr = rhs->Iex.Load.addr;
+      *dataSize     = sizeofIRType(rhs->Iex.Load.ty);
+      return;
+   }
+
+   case Ist_Store: {
+      IRExpr* target = st->Ist.Store.addr;
+      CLG_ASSERT( isIRAtom(target) );
+
+      /* kludge: only the first store of an instruction is recorded */
+      if (*storeAddrExpr != NULL) return;
+
+      *storeAddrExpr = target;
+      *dataSize = sizeofIRType(typeOfIRExpr(tyenv, st->Ist.Store.data));
+      return;
+   }
+
+   case Ist_Dirty: {
+      IRDirty* call = st->Ist.Dirty.details;
+
+      if (call->mFx == Ifx_None) {
+         /* helper without memory effect: no address/size may be given */
+         CLG_ASSERT(call->mAddr == NULL);
+         CLG_ASSERT(call->mSize == 0);
+         return;
+      }
+
+      /* This dirty helper accesses memory.  Collect the details. */
+      CLG_ASSERT(call->mAddr != NULL);
+      CLG_ASSERT(call->mSize != 0);
+      *dataSize = call->mSize;
+      if (call->mFx == Ifx_Read || call->mFx == Ifx_Modify)
+         *loadAddrExpr = call->mAddr;
+      if (call->mFx == Ifx_Write || call->mFx == Ifx_Modify)
+         *storeAddrExpr = call->mAddr;
+      return;
+   }
+
+   default:
+      VG_(printf)("\n");
+      ppIRStmt(st);
+      VG_(printf)("\n");
+      VG_(tool_panic)("Callgrind: unhandled IRStmt");
+   }
+}
+
+/* Append to bbOut a store of the 32-bit constant val to the constant
+ * address addr.  The address constant is built as U32 if hWordTy is
+ * Ity_I32 and as U64 otherwise; the store uses CLGEndness.
+ */
+static
+void addConstMemStoreStmt( IRBB* bbOut, UWord addr, UInt val, IRType hWordTy)
+{
+    IRConst* addrConst;
+    IRExpr  *addrExpr, *valExpr;
+
+    if (hWordTy == Ity_I32)
+        addrConst = IRConst_U32( addr );
+    else
+        addrConst = IRConst_U64( addr );
+
+    addrExpr = IRExpr_Const(addrConst);
+    valExpr  = IRExpr_Const(IRConst_U32(val));
+
+    addStmtToIRBB( bbOut, IRStmt_Store(CLGEndness, addrExpr, valExpr) );
+}
+
+/* Instrumentation callback, registered in CLG_(pre_clo_init) via
+ * VG_(basic_tool_funcs).
+ *
+ * Returns bbIn unmodified while CLG_(instrument_state) is False.
+ * Otherwise builds and returns a new block consisting of
+ *  - a verbatim copy of any IR preamble preceding the first IMark,
+ *  - a helper call to CLG_(setup_bbcc) with the BB info struct,
+ *  - the original statements, with a simulator call (see endOfInstr)
+ *    inserted before each conditional exit and after each guest
+ *    instruction, and stores updating CLG_(current_state).jmps_passed.
+ *
+ * As a side effect, the BB struct obtained from CLG_(get_bb) is filled
+ * on first translation, and checked for consistency on retranslation.
+ *
+ * Panics if gWordTy and hWordTy differ.  layout and vge are not used.
+ */
+static
+IRBB* CLG_(instrument)( VgCallbackClosure* closure,
+                        IRBB* bbIn,
+                        VexGuestLayout* layout,
+                        VexGuestExtents* vge,
+                        IRType gWordTy, IRType hWordTy )
+{
+   Int      i;
+   IRBB*    bbOut;
+   IRStmt*  st, *stnext;
+   Addr     instrAddr, origAddr;
+   UInt     instrLen = 0, dataSize;
+   UInt     instrCount, costOffset;
+   IRExpr  *loadAddrExpr, *storeAddrExpr;
+
+   BB*         bb;
+
+   IRDirty* di;
+   IRExpr  *arg1, **argv;
+
+   Bool        bb_seen_before     = False;
+   UInt        cJumps = 0, cJumpsCorrected;
+   Bool        beforeIBoundary, instrIssued;
+
+   if (gWordTy != hWordTy) {
+      /* We don't currently support this case. */
+      VG_(tool_panic)("host/guest word size mismatch");
+   }
+
+   // No instrumentation if it is switched off
+   if (! CLG_(instrument_state)) {
+       CLG_DEBUG(5, "instrument(BB %p) [Instrumentation OFF]\n",
+                 (Addr)closure->readdr);
+       return bbIn;
+   }
+
+   CLG_DEBUG(3, "+ instrument(BB %p)\n", (Addr)closure->readdr);
+
+   /* Set up BB for instrumented IR */
+   bbOut           = emptyIRBB();
+   bbOut->tyenv    = dopyIRTypeEnv(bbIn->tyenv);
+   bbOut->next     = dopyIRExpr(bbIn->next);
+   bbOut->jumpkind = bbIn->jumpkind;
+
+   // Copy verbatim any IR preamble preceding the first IMark
+   i = 0;
+   while (i < bbIn->stmts_used && bbIn->stmts[i]->tag != Ist_IMark) {
+      addStmtToIRBB( bbOut, bbIn->stmts[i] );
+      i++;
+   }
+
+   // Get the first statement, and origAddr from it
+   CLG_ASSERT(bbIn->stmts_used > 0);
+   // A block without any IMark leaves i == stmts_used after the loop
+   // above; make sure stmts[i] exists before reading it.
+   CLG_ASSERT(i < bbIn->stmts_used);
+   st = bbIn->stmts[i];
+   CLG_ASSERT(Ist_IMark == st->tag);
+   instrAddr = origAddr = (Addr)st->Ist.IMark.addr;
+   CLG_ASSERT(origAddr == st->Ist.IMark.addr);  // XXX: check no overflow
+
+   /* Get BB (creating if necessary).
+    * JS: The hash table is keyed with orig_addr_noredir -- important!
+    * JW: Why? If it is because of different chasing of the redirection,
+    *     this is not needed, as chasing is switched off in callgrind
+    */
+   bb = CLG_(get_bb)(origAddr, bbIn, &bb_seen_before);
+   //bb = CLG_(get_bb)(orig_addr_noredir, bbIn, &bb_seen_before);
+
+   /*
+    * Precondition:
+    * - jmps_passed has number of cond.jumps passed in last executed BB
+    * - current_bbcc has a pointer to the BBCC of the last executed BB
+    *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
+    *     current_bbcc->bb->jmp_addr
+    *   gives the address of the jump source.
+    *
+    * The BBCC setup does 2 things:
+    * - trace call:
+    *   * Unwind own call stack, i.e sync our ESP with real ESP
+    *     This is for ESP manipulation (longjmps, C++ exec handling) and RET
+    *   * For CALLs or JMPs crossing objects, record call arg +
+    *     push are on own call stack
+    *
+    * - prepare for cache log functions:
+    *   Set current_bbcc to BBCC that gets the costs for this BB execution
+    *   attached
+    */
+
+   // helper call to setup_bbcc, with pointer to basic block info struct as argument
+   arg1 = mkIRExpr_HWord( (HWord)bb );
+   argv = mkIRExprVec_1(arg1);
+   di = unsafeIRDirty_0_N( 1, "setup_bbcc", & CLG_(setup_bbcc), argv);
+   addStmtToIRBB( bbOut, IRStmt_Dirty(di) );
+
+   instrCount = 0;
+   costOffset = 0;
+
+   // loop for each host instruction (starting from 'i')
+   do {
+
+      // We should be at an IMark statement
+      CLG_ASSERT(Ist_IMark == st->tag);
+
+      // Reset stuff for this original instruction
+      loadAddrExpr = storeAddrExpr = NULL;
+      instrIssued = False;
+      dataSize = 0;
+
+      // Process all the statements for this original instruction (ie. until
+      // the next IMark statement, or the end of the block)
+      do {
+          i++;
+          stnext = ( i < bbIn->stmts_used ? bbIn->stmts[i] : NULL );
+          beforeIBoundary = !stnext || (Ist_IMark == stnext->tag);
+          collectStatementInfo(bbIn->tyenv, bbOut, st, &instrAddr, &instrLen,
+                               &loadAddrExpr, &storeAddrExpr, &dataSize, hWordTy);
+
+          // instrument a simulator call before conditional jumps
+          if (st->tag == Ist_Exit) {
+              // Nb: instrLen will be zero if Vex failed to decode it.
+              // Also Client requests can appear to be very large (eg. 18
+              // bytes on x86) because they are really multiple instructions.
+              CLG_ASSERT( 0 == instrLen ||
+                          bbIn->jumpkind == Ijk_ClientReq ||
+                          (instrLen >= VG_MIN_INSTR_SZB &&
+                           instrLen <= VG_MAX_INSTR_SZB) );
+
+              // Add instrumentation before this statement
+              endOfInstr(bbOut, &(bb->instr[instrCount]), bb_seen_before,
+                         instrAddr - origAddr, instrLen, dataSize, &costOffset,
+                         instrIssued, loadAddrExpr, storeAddrExpr);
+
+              // prepare for a possible further simcall in same host instr
+              loadAddrExpr = storeAddrExpr = NULL;
+              instrIssued = True;
+
+              if (!bb_seen_before) {
+                  bb->jmp[cJumps].instr = instrCount;
+                  bb->jmp[cJumps].skip = False;
+              }
+
+              /* Update global variable jmps_passed (this is before the jump!)
+               * A correction is needed if VEX inverted the last jump condition
+               */
+              cJumpsCorrected = cJumps;
+              if ((cJumps+1 == bb->cjmp_count) && bb->cjmp_inverted) cJumpsCorrected++;
+              addConstMemStoreStmt( bbOut, (UWord) &CLG_(current_state).jmps_passed,
+                                    cJumpsCorrected, hWordTy);
+
+              cJumps++;
+          }
+
+          // the original statement itself is always copied
+          addStmtToIRBB( bbOut, st );
+          st = stnext;
+      }
+      while (!beforeIBoundary);
+
+      // Add instrumentation for this original instruction; skipped only
+      // if a simcall was already issued for it and no memory access
+      // was seen since.
+      if (!instrIssued || (loadAddrExpr != 0) || (storeAddrExpr !=0))
+          endOfInstr(bbOut, &(bb->instr[instrCount]), bb_seen_before,
+                     instrAddr - origAddr, instrLen, dataSize, &costOffset,
+                     instrIssued, loadAddrExpr, storeAddrExpr);
+
+      instrCount++;
+   }
+   while (st);
+
+   /* Always update global variable jmps_passed (at end of BB)
+    * A correction is needed if VEX inverted the last jump condition
+    */
+   cJumpsCorrected = cJumps;
+   if (bb->cjmp_inverted) cJumpsCorrected--;
+   addConstMemStoreStmt( bbOut, (UWord) &CLG_(current_state).jmps_passed,
+                         cJumpsCorrected, hWordTy);
+
+   /* This stores the instr of the call/ret at BB end */
+   bb->jmp[cJumps].instr = instrCount-1;
+
+   // counts have to agree with the first pass, CLG_(collectBlockInfo)
+   CLG_ASSERT(bb->cjmp_count == cJumps);
+   CLG_ASSERT(bb->instr_count == instrCount);
+
+   // instrAddr now is the address following the last instruction
+   instrAddr += instrLen;
+   if (bb_seen_before) {
+       CLG_ASSERT(bb->instr_len == instrAddr - origAddr);
+       CLG_ASSERT(bb->cost_count == costOffset);
+       CLG_ASSERT(bb->jmpkind == bbIn->jumpkind);
+   }
+   else {
+       bb->instr_len = instrAddr - origAddr;
+       bb->cost_count = costOffset;
+       bb->jmpkind = bbIn->jumpkind;
+   }
+
+   CLG_DEBUG(3, "- instrument(BB %p): byteLen %u, CJumps %u, CostLen %u\n",
+             origAddr, bb->instr_len, bb->cjmp_count, bb->cost_count);
+   if (cJumps>0) {
+       CLG_DEBUG(3, "                     [ ");
+       for (i=0;i<cJumps;i++)
+           CLG_DEBUG(3, "%d ", bb->jmp[i].instr);
+       CLG_DEBUG(3, "], last inverted: %s \n", bb->cjmp_inverted ? "yes":"no");
+   }
+
+  return bbOut;
+}
+
+/*--------------------------------------------------------------------*/
+/*--- Discarding BB info                                           ---*/
+/*--------------------------------------------------------------------*/
+
+// Callback registered with VG_(needs_basic_block_discards).
+// It is called when a translation is removed from the translation cache
+// for any reason at all: to free up space, because the guest code was
+// unmapped or modified, or for any arbitrary reason.
+static
+void clg_discard_basic_block_info ( Addr64 orig_addr64, VexGuestExtents vge )
+{
+   const Addr bbAddr = (Addr)orig_addr64;
+
+   tl_assert(vge.n_used > 0);
+
+   // debug output, disabled at compile time
+   if (0) {
+      VG_(printf)( "discard_basic_block_info: %p, %p, %llu\n",
+                   (void*)(Addr)bbAddr,
+                   (void*)(Addr)vge.base[0], (ULong)vge.len[0]);
+   }
+
+   // Drop the BB info for this block.  The key is the original address
+   // passed in, not the first instruction address in vge.
+   CLG_(delete_bb)(bbAddr);
+}
+
+
+/*------------------------------------------------------------*/
+/*--- CLG_(fini)() and related function                     ---*/
+/*------------------------------------------------------------*/
+
+
+
+/* Zero the cost collected so far for one thread.
+ *
+ * Works on the globals CLG_(current_call_stack) and CLG_(current_state);
+ * from t, only lastdump_cost is touched.
+ */
+static void zero_thread_cost(thread_info* t)
+{
+  Int depth;
+
+  /* for active calls (entries with a jcc), restart the enter cost at
+   * the current cost */
+  for (depth = 0; depth < CLG_(current_call_stack).sp; depth++) {
+    if (CLG_(current_call_stack).entry[depth].jcc) {
+      CLG_(copy_cost)( CLG_(sets).full,
+                       CLG_(current_call_stack).entry[depth].enter_cost,
+                       CLG_(current_state).cost );
+    }
+  }
+
+  CLG_(forall_bbccs)(CLG_(zero_bbcc));
+
+  /* the cost at last dump becomes the current cost */
+  CLG_(copy_cost)( CLG_(sets).full,
+                   t->lastdump_cost, CLG_(current_state).cost );
+}
+
+/* Zero collected costs, either for the current thread only or for all
+ * threads.  Start and end are reported as debug messages if the
+ * verbosity is above 1.
+ */
+void CLG_(zero_all_cost)(Bool only_current_thread)
+{
+  const Bool verbose = (VG_(clo_verbosity) > 1);
+
+  if (verbose) {
+    VG_(message)(Vg_DebugMsg, "  Zeroing costs...");
+  }
+
+  if (!only_current_thread) {
+    CLG_(forall_threads)(zero_thread_cost);
+  }
+  else {
+    zero_thread_cost(CLG_(get_current_thread)());
+  }
+
+  if (verbose) {
+    VG_(message)(Vg_DebugMsg, "  ...done");
+  }
+}
+
+/* Leave all signal handler contexts and pop all entries from the call
+ * stack.  Works on the globals CLG_(current_state) and
+ * CLG_(current_call_stack); the argument t itself is not accessed.
+ */
+static
+void unwind_thread(thread_info* t)
+{
+  /* first leave signal handlers, until no signal is active any more */
+  while (CLG_(current_state).sig) {
+    CLG_(post_signal)(CLG_(current_tid), CLG_(current_state).sig);
+  }
+
+  /* then empty the regular call stack */
+  while (CLG_(current_call_stack).sp > 0) {
+    CLG_(pop_call_stack)();
+  }
+}
+
+/* Oops, this can go wrong: the prototype is repeated by hand here, so
+ * nothing checks that it matches the real definition.
+ * NOTE(review): presumably no header included by this file declares the
+ * function -- confirm.  Used by CLG_(set_instrument_state). */
+extern void VG_(discard_translations) ( Addr64 start, ULong range );
+
+/* Switch instrumentation on or off.
+ *
+ * reason - text used as prefix in debug/verbose messages
+ * state  - requested value for CLG_(instrument_state)
+ *
+ * Nothing happens if the state does not change.  Otherwise existing
+ * translations are discarded, the call stacks of all threads are
+ * unwound and the cache simulator is cleared.  On switching off, the
+ * execution state CLG_(current_state) is re-initialised in addition.
+ */
+void CLG_(set_instrument_state)(Char* reason, Bool state)
+{
+  const char* onOff = state ? "ON" : "OFF";
+
+  if (CLG_(instrument_state) == state) {
+    CLG_DEBUG(2, "%s: instrumentation already %s\n", reason, onOff);
+    return;
+  }
+
+  CLG_(instrument_state) = state;
+  CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n", reason, onOff);
+
+  /* throw away translations made with the old setting */
+  VG_(discard_translations)( (Addr64)0x1000, (ULong) ~0xfffl);
+
+  /* reset internal state: call stacks, simulator */
+  CLG_(forall_threads)(unwind_thread);
+  (*CLG_(cachesim).clear)();
+  if (0) {
+    /* zeroing of costs is disabled */
+    CLG_(forall_threads)(zero_thread_cost);
+  }
+
+  if (!state) {
+    CLG_(init_exec_state)( &CLG_(current_state) );
+  }
+
+  if (VG_(clo_verbosity) > 1) {
+    VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n",
+                 reason, onOff);
+  }
+}
+  
+
+/* Client request handler, registered with VG_(needs_client_requests).
+ *
+ * args[0] holds the request code, further entries hold request
+ * arguments.  Returns False if the request is not one of the Callgrind
+ * requests handled below; otherwise the request is executed, *ret is
+ * set to 0 (it has no meaning) and True is returned.
+ */
+static
+Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret)
+{
+   if (!VG_IS_TOOL_USERREQ('C','T',args[0]))
+      return False;
+
+   switch(args[0]) {
+   case VG_USERREQ__DUMP_STATS:
+      CLG_(dump_profile)("Client Request", True);
+      *ret = 0;                 /* meaningless */
+      break;
+
+   case VG_USERREQ__DUMP_STATS_AT:
+     {
+       Char buf[512];
+       /* The client passes a position string as request argument (see
+        * CALLGRIND_DUMP_STATS_AT in the Callgrind manual), so it has to
+        * be printed with %s, not %d.  The copy is bounded, as the
+        * length of the client string is not under our control. */
+       VG_(snprintf)(buf, sizeof(buf),
+                     "Client Request: %s", (Char*)args[1]);
+       CLG_(dump_profile)(buf, True);
+       *ret = 0;                 /* meaningless */
+     }
+     break;
+
+   case VG_USERREQ__ZERO_STATS:
+     /* only the costs of the current thread are zeroed */
+     CLG_(zero_all_cost)(True);
+      *ret = 0;                 /* meaningless */
+      break;
+
+   case VG_USERREQ__TOGGLE_COLLECT:
+     CLG_(current_state).collect = !CLG_(current_state).collect;
+     CLG_DEBUG(2, "Client Request: toggled collection state to %s\n",
+              CLG_(current_state).collect ? "ON" : "OFF");
+     *ret = 0;                 /* meaningless */
+     break;
+
+   case VG_USERREQ__START_INSTRUMENTATION:
+     CLG_(set_instrument_state)("Client Request", True);
+     *ret = 0;                 /* meaningless */
+     break;
+
+   case VG_USERREQ__STOP_INSTRUMENTATION:
+     CLG_(set_instrument_state)("Client Request", False);
+     *ret = 0;                 /* meaningless */
+     break;
+
+   default:
+      return False;
+   }
+
+   return True;
+}
+
+
+/* Syscall Timing */
+
+/* syscalltime[tid] holds the time stamp taken in CLG_(pre_syscalltime)
+ * for thread tid; CLG_(post_syscalltime) subtracts it from the time at
+ * syscall exit.
+ * With CLG_MICROSYSTIME the stamp is in microseconds (64 bit, from
+ * gettimeofday), otherwise it is the value of the millisecond timer. */
+/* struct timeval syscalltime[VG_N_THREADS]; */
+#if CLG_MICROSYSTIME
+#include <sys/time.h>
+#include <sys/syscall.h>
+/* prototype repeated by hand, as for VG_(discard_translations) */
+extern Int VG_(do_syscall) ( UInt, ... );
+
+ULong syscalltime[VG_N_THREADS];
+#else
+UInt syscalltime[VG_N_THREADS];
+#endif
+
+/* Syscall wrapper, run before a syscall: if collection of system time
+ * is enabled, store the current time in syscalltime[tid].
+ * syscallno is not used.
+ */
+static
+void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno)
+{
+  if (!CLG_(clo).collect_systime) return;
+
+#if CLG_MICROSYSTIME
+  {
+    /* microsecond resolution, using gettimeofday */
+    struct vki_timeval tv_now;
+    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
+    syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
+  }
+#else
+  /* millisecond resolution */
+  syscalltime[tid] = VG_(read_millisecond_timer)();
+#endif
+}
+
+/* Syscall wrapper, run after a syscall: if collection of system time is
+ * enabled, attribute the time spent since CLG_(pre_syscalltime) to the
+ * current cost and to the "skipped" cost of the current BBCC.
+ *
+ * At offset o = CLG_(sets).off_full_systime, entry o is incremented by
+ * one and entry o+1 gets the elapsed time added.  Nothing is added if
+ * o is negative.  syscallno is only used for debug output; res is not
+ * used.
+ */
+static
+void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno, SysRes res)
+{
+  if (CLG_(clo).collect_systime) {
+    Int o = CLG_(sets).off_full_systime;
+#if CLG_MICROSYSTIME
+    struct vki_timeval tv_now;
+    ULong diff;
+    
+    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
+    diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
+#else
+    UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
+#endif  
+    
+    /* diff is UInt or ULong depending on CLG_MICROSYSTIME: widen it so
+     * that one format (%llu) is right for both configurations */
+    CLG_DEBUG(0,"   Time (Off %d) for Syscall %d: %llu\n",
+              o, syscallno, (ULong)diff);
+    
+    if (o<0) return;
+
+    CLG_(current_state).cost[o] ++;
+    CLG_(current_state).cost[o+1] += diff;
+    /* the skipped cost array of the BBCC is set up on first use */
+    if (!CLG_(current_state).bbcc->skipped)
+      CLG_(init_cost_lz)(CLG_(sets).full,
+                        &(CLG_(current_state).bbcc->skipped));
+    CLG_(current_state).bbcc->skipped[o] ++;
+    CLG_(current_state).bbcc->skipped[o+1] += diff;
+  }
+}
+
+/* Print the internal counters of CLG_(stat) (distinct items, debug info
+ * coverage, LRU misses, execution counts) as debug messages.
+ * Helper of finish(), used with verbosity above 1.
+ */
+static
+void clg_print_debug_stats(void)
+{
+  Statistics* st = &CLG_(stat);
+  int lookups =
+    st->full_debug_BBs +
+    st->fn_name_debug_BBs +
+    st->file_line_debug_BBs +
+    st->no_debug_BBs;
+
+  /* Hash table stats */
+  VG_(message)(Vg_DebugMsg, "");
+  VG_(message)(Vg_DebugMsg, "Distinct objects: %d", st->distinct_objs);
+  VG_(message)(Vg_DebugMsg, "Distinct files:   %d", st->distinct_files);
+  VG_(message)(Vg_DebugMsg, "Distinct fns:     %d", st->distinct_fns);
+  VG_(message)(Vg_DebugMsg, "Distinct contexts:%d", st->distinct_contexts);
+  VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d", st->distinct_bbs);
+  VG_(message)(Vg_DebugMsg, "Cost entries:     %d (Chunks %d)",
+               CLG_(costarray_entries), CLG_(costarray_chunks));
+  VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d", st->distinct_bbccs);
+  VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d", st->distinct_jccs);
+  VG_(message)(Vg_DebugMsg, "Distinct skips:   %d", st->distinct_skips);
+  VG_(message)(Vg_DebugMsg, "BB lookups:       %d", lookups);
+
+  /* percentages only make sense with at least one lookup */
+  if (lookups > 0) {
+    VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)",
+                 st->full_debug_BBs * 100 / lookups,
+                 st->full_debug_BBs);
+    VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)",
+                 st->file_line_debug_BBs * 100 / lookups,
+                 st->file_line_debug_BBs);
+    VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)",
+                 st->fn_name_debug_BBs * 100 / lookups,
+                 st->fn_name_debug_BBs);
+    VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)",
+                 st->no_debug_BBs * 100 / lookups,
+                 st->no_debug_BBs);
+  }
+
+  VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d", st->bbcc_clones);
+  VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d", st->bb_retranslations);
+  VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d", st->distinct_instrs);
+  VG_(message)(Vg_DebugMsg, "");
+
+  VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d", st->cxt_lru_misses);
+  VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:   %d", st->bbcc_lru_misses);
+  VG_(message)(Vg_DebugMsg, "LRU JCC Misses:    %d", st->jcc_lru_misses);
+  VG_(message)(Vg_DebugMsg, "BBs Executed:      %llu", st->bb_executions);
+  VG_(message)(Vg_DebugMsg, "Calls:             %llu", st->call_counter);
+  VG_(message)(Vg_DebugMsg, "CondJMP followed:  %llu", st->jcnd_counter);
+  VG_(message)(Vg_DebugMsg, "Boring JMPs:       %llu", st->jump_counter);
+  VG_(message)(Vg_DebugMsg, "Recursive calls:   %llu", st->rec_call_counter);
+  VG_(message)(Vg_DebugMsg, "Returns:           %llu", st->ret_counter);
+
+  VG_(message)(Vg_DebugMsg, "");
+}
+
+/* Final work at program end: finish the cache simulation, unwind the
+ * call stacks of all threads, dump the profile and finish command
+ * handling.  Unless verbosity is 0, a summary of the events collected
+ * and the simulator statistics are printed afterwards; with verbosity
+ * above 1, internal counters are printed before the summary.
+ */
+static
+void finish(void)
+{
+  char buf[RESULTS_BUF_LEN];
+
+  CLG_DEBUG(0, "finish()\n");
+
+  (*CLG_(cachesim).finish)();
+
+  /* pop all remaining items from CallStack for correct sum */
+  CLG_(forall_threads)(unwind_thread);
+
+  CLG_(dump_profile)(0, False);
+
+  CLG_(finish_command)();
+
+  /* completely silent run: no summary at all */
+  if (VG_(clo_verbosity) == 0) return;
+
+  if (VG_(clo_verbosity) > 1) {
+    clg_print_debug_stats();
+  }
+
+  CLG_(sprint_eventmapping)(buf, CLG_(dumpmap));
+  VG_(message)(Vg_UserMsg, "Events    : %s", buf);
+  CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), CLG_(total_cost));
+  VG_(message)(Vg_UserMsg, "Collected : %s", buf);
+  VG_(message)(Vg_UserMsg, "");
+
+  //  if (CLG_(clo).simulate_cache)
+  (*CLG_(cachesim).printstat)();
+}
+
+
+/* Tool callback run at program end, registered via
+ * VG_(basic_tool_funcs).  All work is done in finish().
+ */
+void CLG_(fini)(Int exitcode)
+{
+  /* the exit code of the client does not matter here */
+  (void)exitcode;
+
+  finish();
+}
+
+
+/*--------------------------------------------------------------------*/
+/*--- Setup                                                        ---*/
+/*--------------------------------------------------------------------*/
+
+/* Tool initialisation run after command line processing, registered
+ * via VG_(basic_tool_funcs).
+ *
+ * Adjusts VEX settings, makes sure some dump position is selected,
+ * then initialises, in this order: output files and command handling,
+ * the cache simulator, event sets, statistics, the total cost, the
+ * hash tables and thread handling.  At the end, the instrumentation
+ * state is taken from the command line options.
+ */
+static
+void CLG_(post_clo_init)(void)
+{
+   Char* dirStr  = 0;
+   Char* fileStr = 0;
+
+   /* both VEX thresholds are set to 0; with this, chasing is switched
+    * off (see the BB lookup in CLG_(instrument)) */
+   VG_(clo_vex_control).iropt_unroll_thresh = 0;
+   VG_(clo_vex_control).guest_chase_thresh = 0;
+
+   CLG_DEBUG(1, "  dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No");
+   CLG_DEBUG(1, "  call sep. : %d\n", CLG_(clo).separate_callers);
+   CLG_DEBUG(1, "  rec. sep. : %d\n", CLG_(clo).separate_recursions);
+
+   /* fall back to source lines if no position type was requested */
+   if (!(CLG_(clo).dump_line || CLG_(clo).dump_instr || CLG_(clo).dump_bb)) {
+       VG_(message)(Vg_UserMsg, "Using source line as position.");
+       CLG_(clo).dump_line = True;
+   }
+
+   CLG_(init_files)(&dirStr, &fileStr);
+   CLG_(init_command)(dirStr, fileStr);
+
+   (*CLG_(cachesim).post_clo_init)();
+
+   CLG_(init_eventsets)(0);
+   CLG_(init_statistics)(& CLG_(stat));
+   CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) );
+
+   /* initialize hash tables */
+   CLG_(init_obj_table)();
+   CLG_(init_cxt_table)();
+   CLG_(init_bb_hash)();
+
+   /* thread handling; execution starts in thread 1 */
+   CLG_(init_threads)();
+   CLG_(run_thread)(1);
+
+   CLG_(instrument_state) = CLG_(clo).instrument_atstart;
+
+   VG_(message)(Vg_UserMsg, "");
+   VG_(message)(Vg_UserMsg, "For interactive control, run 'callgrind_control -h'.");
+}
+
+/* Tool initialisation run before command line processing: announces
+ * the tool details, registers all callbacks of this tool with the core
+ * and sets the defaults for the command line options.
+ */
+static
+void CLG_(pre_clo_init)(void)
+{
+    /* tool details */
+    VG_(details_name)("Callgrind");
+    VG_(details_version)(VERSION);
+    VG_(details_description)("a call-graph generating cache profiler");
+    VG_(details_copyright_author)(
+        "Copyright (C) 2002-2006, and GNU GPL'd, by J.Weidendorfer et al.");
+    VG_(details_bug_reports_to)("Josef.Weidendorfer@gmx.de");
+    VG_(details_avg_translation_sizeB)( 155 );
+
+    /* basic callbacks: late init, instrumentation, end of run */
+    VG_(basic_tool_funcs)(CLG_(post_clo_init),
+                          CLG_(instrument),
+                          CLG_(fini));
+
+    /* services needed from the core */
+    VG_(needs_basic_block_discards)(clg_discard_basic_block_info);
+    VG_(needs_command_line_options)(CLG_(process_cmd_line_option),
+                                    CLG_(print_usage),
+                                    CLG_(print_debug_usage));
+    VG_(needs_client_requests)(CLG_(handle_client_request));
+    VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime),
+                               CLG_(post_syscalltime));
+
+    /* events to track */
+    VG_(track_thread_run)( & CLG_(run_thread) );
+    VG_(track_pre_deliver_signal)( & CLG_(pre_signal) );
+    VG_(track_post_deliver_signal)( & CLG_(post_signal) );
+
+    CLG_(set_clo_defaults)();
+}
+
+/* Hand CLG_(pre_clo_init) to the core as entry point of this tool.
+ * NOTE(review): presumably the macro also records the tool interface
+ * version the tool was built against -- confirm in pub_tool_tooliface.h */
+VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init))
+
+/*--------------------------------------------------------------------*/
+/*--- end                                                   main.c ---*/
+/*--------------------------------------------------------------------*/