Reinstate and extend lackey's functionality (a major overhaul), from
Jeroen N. Witmond. Thanks for that.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@4979 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/lackey/docs/lk-manual.xml b/lackey/docs/lk-manual.xml
index 7baa753..c911f16 100644
--- a/lackey/docs/lk-manual.xml
+++ b/lackey/docs/lk-manual.xml
@@ -5,13 +5,12 @@
<chapter id="lk-manual" xreflabel="Lackey">
<title>Lackey: a very simple profiler</title>
-
<para>Lackey is a simple Valgrind tool that does some basic
program measurement. It adds quite a lot of simple
instrumentation to the program's code. It is primarily intended
to be of use as an example tool.</para>
-<para>It measures three things:</para>
+<para>It measures and reports:</para>
<orderedlist>
@@ -19,21 +18,65 @@
<para>The number of calls to
<computeroutput>_dl_runtime_resolve()</computeroutput>, the
function in glibc's dynamic linker that resolves function
- lookups into shared objects.</para>
- </listitem>
-
- <listitem>
- <para>The number of UCode instructions (UCode is Valgrind's
- RISC-like intermediate language), x86 instructions, and basic
- blocks executed by the program, and some ratios between the
- three counts.</para>
+ references to shared objects.</para>
+ <para>You can change the name of the function with command line
+ option <computeroutput>--fnname=<name></computeroutput>.</para>
</listitem>
<listitem>
<para>The number of conditional branches encountered and the
- proportion of those taken.</para>
+ number and proportion of those taken.</para>
+ </listitem>
+
+ <listitem>
+
+ <para>Statistics about the amount of work done during the execution
+ of the client program:</para>
+
+ <orderedlist>
+
+ <listitem>
+ <para>The number of basic blocks entered and completed by the
+ program. Note that due to optimisations done by the JIT, this
+ is not really an accurate value.</para>
+ </listitem>
+
+ <listitem>
+ <para>The number of guest (x86, amd64, ppc, etc.) instructions and IR
+ statements executed. IR is Valgrind's RISC-like intermediate
+ representation via which all instrumentation is done.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>Ratios between some of these counts.</para>
+ </listitem>
+
+ <listitem>
+ <para>When command line option
+ <computeroutput>--detailed-counts=yes</computeroutput> is
+ specified, a table is printed with counts of loads, stores and ALU
+ operations for various types of operands.</para>
+
+ <para>The types are identified by their IR name ("I1" ... "I128",
+ "F32", "F64", and "V128").</para>
+ </listitem>
+
+ </orderedlist>
+
+ </listitem>
+
+ <listitem>
+ <para>The exit code of the client program.</para>
</listitem>
</orderedlist>
+<para>Note that Lackey runs quite slowly, especially when
+<computeroutput>--detailed-counts=yes</computeroutput> is specified.
+It could be made to run a lot faster by doing a slightly more
+sophisticated job of the instrumentation, but that would undermine
+its role as a simple example tool. Hence we have chosen not to do
+so.</para>
+
</chapter>
diff --git a/lackey/lk_main.c b/lackey/lk_main.c
index 9c1d29e..dfb05a6 100644
--- a/lackey/lk_main.c
+++ b/lackey/lk_main.c
@@ -1,6 +1,6 @@
/*--------------------------------------------------------------------*/
-/*--- Simple tool for counting UInstrs, using a C helper. ---*/
+/*--- An example Valgrind tool. ---*/
/*--- lk_main.c ---*/
/*--------------------------------------------------------------------*/
@@ -33,33 +33,81 @@
#include "pub_tool_tooliface.h"
#include "pub_tool_libcassert.h"
#include "pub_tool_libcprint.h"
+#include "pub_tool_debuginfo.h"
+#include "pub_tool_libcbase.h"
+#include "pub_tool_options.h"
+
+
+/* The name of the function of which the number of calls is to be
+ * counted, with default. Override with command line option
+ * --fnname. */
+static Char* lk_clo_fnname = "_dl_runtime_resolve";
+
+/* If true, show statistics about loads, stores and alu ops. Set
+ * with command line option --detailed-counts. */
+static Bool lk_clo_detailed_counts = False;
+
+/***********************************************************************
+ * Implement the needs_command_line_options for Valgrind.
+ **********************************************************************/
+
+static Bool lk_process_cmd_line_option(Char* arg)
+{
+ VG_STR_CLO(arg, "--fnname", lk_clo_fnname)
+ else VG_BOOL_CLO(arg, "--detailed-counts", lk_clo_detailed_counts)
+ else
+ return False;
+
+ tl_assert(lk_clo_fnname);
+ tl_assert(lk_clo_fnname[0]);
+ return True;
+}
+
+static void lk_print_usage(void)
+{
+ VG_(printf)(
+" --fnname=<name> count calls to <name> [_dl_runtime_resolve]\n"
+" --detailed-counts=no|yes count loads, stores and alu ops [no]\n"
+ );
+}
+
+static void lk_print_debug_usage(void)
+{
+}
+
+/***********************************************************************
+ * Data and helpers related to the default operation of Lackey.
+ **********************************************************************/
/* Nb: use ULongs because the numbers can get very big */
-static ULong n_dlrr_calls = 0;
-static ULong n_BBs = 0;
-static ULong n_UInstrs = 0;
-static ULong n_guest_instrs = 0;
-static ULong n_Jccs = 0;
-static ULong n_Jccs_untaken = 0;
+static ULong n_func_calls = 0;
+static ULong n_BBs_entered = 0;
+static ULong n_BBs_completed = 0;
+static ULong n_IRStmts = 0;
+static ULong n_guest_instrs = 0;
+static ULong n_Jccs = 0;
+static ULong n_Jccs_untaken = 0;
__attribute__((unused))
-static void add_one_dlrr_call(void)
+static void add_one_func_call(void)
{
- n_dlrr_calls++;
+ n_func_calls++;
}
-/* See comment above lk_instrument for reason why n_machine_instrs is
- incremented here. */
-static void add_one_BB(void)
+static void add_one_BB_entered(void)
{
- n_BBs++;
- n_guest_instrs++;
+ n_BBs_entered++;
+}
+
+static void add_one_BB_completed(void)
+{
+ n_BBs_completed++;
}
__attribute__((unused))
-static void add_one_UInstr(void)
+static void add_one_IRStmt(void)
{
- n_UInstrs++;
+ n_IRStmts++;
}
__attribute__((unused))
@@ -78,52 +126,114 @@
n_Jccs_untaken++;
}
-static void lk_post_clo_init(void)
+/***********************************************************************
+ * Data and helpers related to --detailed-counts.
+ **********************************************************************/
+
+/* --- Operations --- */
+
+typedef enum { OpLoad=0, OpStore=1, OpAlu=2 } Op;
+
+#define N_OPS 3
+
+
+/* --- Types --- */
+
+#define N_TYPES 9
+
+static Int type2index ( IRType ty )
{
+ switch (ty) {
+ case Ity_I1: return 0;
+ case Ity_I8: return 1;
+ case Ity_I16: return 2;
+ case Ity_I32: return 3;
+ case Ity_I64: return 4;
+ case Ity_I128: return 5;
+ case Ity_F32: return 6;
+ case Ity_F64: return 7;
+ case Ity_V128: return 8;
+ default: tl_assert(0); break;
+ }
}
-/* Note: machine instructions are marked by an INCEIP at the end of each one,
- except for the final one in the basic block which ends in an
- unconditional JMP. Sometimes the final unconditional JMP is preceded by
- a conditional JMP (Jcc), and thus it isn't reached. Eg:
+static HChar* nameOfTypeIndex ( IRType ty )
+{
+ switch (ty) {
+ case 0: return "I1"; break;
+ case 1: return "I8"; break;
+ case 2: return "I16"; break;
+ case 3: return "I32"; break;
+ case 4: return "I64"; break;
+ case 5: return "I128"; break;
+ case 6: return "F32"; break;
+ case 7: return "F64"; break;
+ case 8: return "V128"; break;
+ default: tl_assert(0); break;
+ }
+}
- <code a>
- INCEIP ...
- <code b>
- Jcc ...
- JMP ... (will not be reached if Jcc succeeds)
+/* --- Counts --- */
- If we simplemindedly added calls to add_one_guest_instr() before INCEIPs
- and unconditional JMPs, we'd sometimes miss the final call (when a
- preceding conditional JMP succeeds), underestimating the machine instruction
- count.
+static ULong detailCounts[N_OPS][N_TYPES];
- <code a>
- call add_one_guest_instr()
- INCEIP ...
+/* The helper that is called from the instrumented code. */
+static VG_REGPARM(1)
+void increment_detail(ULong* detail)
+{
+ (*detail)++;
+}
- <code b>
- Jcc ...
- call add_one_guest_instr()
- JMP ...
+/* A helper that adds the instrumentation for a detail. */
+static void instrument_detail(IRBB* bb, Op op, IRType type)
+{
+ IRDirty* di;
+ IRExpr** argv;
+ const UInt typeIx = type2index(type);
- Instead we add a call before each INCEIP, and also one at the start of the
- block, but not one at the end, viz:
+ tl_assert(op < N_OPS);
+ tl_assert(typeIx < N_TYPES);
- call add_one_guest_instr()
+ argv = mkIRExprVec_1( mkIRExpr_HWord( (HWord)&detailCounts[op][typeIx] ) );
+ di = unsafeIRDirty_0_N( 1, "increment_detail", &increment_detail, argv);
+ addStmtToIRBB( bb, IRStmt_Dirty(di) );
+}
- <code a>
- call add_one_guest_instr()
- INCEIP ...
+/* Summarize and print the details. */
- <code b>
- Jcc ...
- JMP ...
+static void print_details ( void )
+{
+ Int typeIx;
+ VG_(message)(Vg_UserMsg,
+ " Type Loads Stores AluOps");
+ VG_(message)(Vg_UserMsg,
+ " -------------------------------------------");
+ for (typeIx = 0; typeIx < N_TYPES; typeIx++) {
+ VG_(message)(Vg_UserMsg,
+ " %4s %,12llu %,12llu %,12llu",
+ nameOfTypeIndex( typeIx ),
+ detailCounts[OpLoad ][typeIx],
+ detailCounts[OpStore][typeIx],
+ detailCounts[OpAlu ][typeIx]
+ );
+ }
+}
- Which gives us the right answer. And just to avoid two C calls, we fold
- the basic-block-beginning call in with add_one_BB(). Phew.
-*/
+
+/***********************************************************************
+ * Implement the basic_tool_funcs for Valgrind.
+ **********************************************************************/
+
+static void lk_post_clo_init(void)
+{
+ Int op, tyIx;
+
+ for (op = 0; op < N_OPS; op++)
+ for (tyIx = 0; tyIx < N_TYPES; tyIx++)
+ detailCounts[op][tyIx] = 0;
+}
+
static
IRBB* lk_instrument( IRBB* bb_in, VexGuestLayout* layout,
Addr64 orig_addr_noredir, VexGuestExtents* vge,
@@ -132,6 +242,8 @@
IRDirty* di;
Int i;
IRBB* bb;
+ Char fnname[100];
+ IRType type;
if (gWordTy != hWordTy) {
/* We don't currently support this case. */
@@ -144,47 +256,115 @@
bb->next = dopyIRExpr(bb_in->next);
bb->jumpkind = bb_in->jumpkind;
-#if 0
- /* We need to know the entry point for this bb to do this. In any
- case it's pretty meaningless in the presence of bb chasing since
- we may enter this function part way through an IRBB. */
- /* Count call to dlrr(), if this BB is dlrr()'s entry point */
- if (VG_(get_fnname_if_entry)(orig_addr, fnname, 100) &&
- 0 == VG_(strcmp)(fnname, "_dl_runtime_resolve"))
- {
- addStmtToIRBB(
- bb,
- IRStmt_Dirty(
- unsafeIRDirty_0_N( "add_one_dlrr_call", mkIRExprVec_0() )
- ));
+ // Copy verbatim any IR preamble preceding the first IMark
+ i = 0;
+ while (i < bb_in->stmts_used && bb_in->stmts[i]->tag != Ist_IMark) {
+ addStmtToIRBB( bb, bb_in->stmts[i] );
+ i++;
}
-#endif
- /* Count this basic block */
- di = unsafeIRDirty_0_N( 0, "add_one_BB", &add_one_BB, mkIRExprVec_0() );
+ /* Count this basic block. */
+ di = unsafeIRDirty_0_N( 0, "add_one_BB_entered", &add_one_BB_entered,
+ mkIRExprVec_0() );
addStmtToIRBB( bb, IRStmt_Dirty(di) );
- for (i = 0; i < bb_in->stmts_used; i++) {
+ for (/*use current i*/; i < bb_in->stmts_used; i++) {
IRStmt* st = bb_in->stmts[i];
- if (!st) continue;
+ if (!st || st->tag == Ist_NoOp) continue;
+ /* Count one VEX statement. */
+ di = unsafeIRDirty_0_N( 0, "add_one_IRStmt", &add_one_IRStmt,
+ mkIRExprVec_0() );
+ addStmtToIRBB( bb, IRStmt_Dirty(di) );
+
switch (st->tag) {
+ case Ist_IMark:
+ /* Count guest instruction. */
+ di = unsafeIRDirty_0_N( 0, "add_one_guest_instr",
+ &add_one_guest_instr,
+ mkIRExprVec_0() );
+ addStmtToIRBB( bb, IRStmt_Dirty(di) );
+
+ /* An unconditional branch to a known destination in the
+ * guest's instructions can be represented, in the IRBB to
+ * instrument, by the VEX statements that are the
+ * translation of that known destination. This feature is
+ * called 'BB chasing' and can be influenced by command
+ * line option --vex-guest-chase-thresh.
+ *
+ * To get an accurate count of the calls to a specific
+ * function, taking BB chasing into account, we need to
+ * check for each guest instruction (Ist_IMark) if it is
+ * the entry point of a function.
+ */
+ tl_assert(lk_clo_fnname);
+ tl_assert(lk_clo_fnname[0]);
+ if (VG_(get_fnname_if_entry)(st->Ist.IMark.addr,
+ fnname, sizeof(fnname))
+ && 0 == VG_(strcmp)(fnname, lk_clo_fnname)) {
+ di = unsafeIRDirty_0_N( 0, "add_one_func_call",
+ &add_one_func_call,
+ mkIRExprVec_0() );
+ addStmtToIRBB( bb, IRStmt_Dirty(di) );
+ }
+ addStmtToIRBB( bb, st );
+ break;
+
case Ist_Exit:
/* Count Jcc */
- addStmtToIRBB(
- bb,
- IRStmt_Dirty(
- unsafeIRDirty_0_N( 0, "add_one_Jcc", &add_one_Jcc,
- mkIRExprVec_0() )
- ));
- addStmtToIRBB( bb, dopyIRStmt(st) );
+ di = unsafeIRDirty_0_N( 0, "add_one_Jcc", &add_one_Jcc,
+ mkIRExprVec_0() );
+ addStmtToIRBB( bb, IRStmt_Dirty(di) );
+
+ addStmtToIRBB( bb, st );
+
/* Count non-taken Jcc */
- addStmtToIRBB(
- bb,
- IRStmt_Dirty(
- unsafeIRDirty_0_N( 0, "add_one_Jcc_untaken", &add_one_Jcc_untaken,
- mkIRExprVec_0() )
- ));
+ di = unsafeIRDirty_0_N( 0, "add_one_Jcc_untaken",
+ &add_one_Jcc_untaken, mkIRExprVec_0() );
+ addStmtToIRBB( bb, IRStmt_Dirty(di) );
+ break;
+
+ /* Someone on the users list asked for something like this
+ * just the other day (Christian Stimming, "Fast profiling in
+ * valgrind?", 25 Oct). Personally I think it'd be a
+ * valuable addition.
+ *
+ * Not hard to do either: for stores, examine Ist_Store, and
+ * use typeOfIRExpr(bb->tyenv, st->Ist.Store.data) to get the
+ * store type. For loads and ALU ops, you only need to look
+ * at Ist_Tmp cases where the Ist.Tmp.data is either Iex_Load
+ * or Iex_{Unop,Binop}. All statements you will ever
+ * encounter will satisfy isFlatIRStmt which essentially
+ * constrains them to being flat SSA-style.
+ */
+ case Ist_Store:
+ if (lk_clo_detailed_counts) {
+ type = typeOfIRExpr(bb->tyenv, st->Ist.Store.data);
+ tl_assert(type != Ity_INVALID);
+ instrument_detail( bb, OpStore, type );
+ }
+ addStmtToIRBB( bb, st );
+ break;
+
+ case Ist_Tmp:
+ if (lk_clo_detailed_counts) {
+ IRExpr* expr = st->Ist.Tmp.data;
+ type = typeOfIRExpr(bb->tyenv, expr);
+ tl_assert(type != Ity_INVALID);
+ switch (expr->tag) {
+ case Iex_Load:
+ instrument_detail( bb, OpLoad, type );
+ break;
+ case Iex_Unop:
+ case Iex_Binop:
+ case Iex_Mux0X:
+ instrument_detail( bb, OpAlu, type );
+ break;
+ default:
+ break;
+ }
+ }
+ addStmtToIRBB( bb, st );
break;
default:
@@ -192,87 +372,59 @@
}
}
+ /* Count this basic block. */
+ di = unsafeIRDirty_0_N( 0, "add_one_BB_completed",
+ &add_one_BB_completed, mkIRExprVec_0() );
+ addStmtToIRBB( bb, IRStmt_Dirty(di) );
+
return bb;
-
-
-#if 0
- UCodeBlock* cb;
- Int i;
- UInstr* u;
- Char fnname[100];
-
- cb = VG_(setup_UCodeBlock)(cb_in);
-
- /* Count basic block */
- VG_(call_helper_0_0)(cb, (Addr) & add_one_BB);
-
- for (i = 0; i < VG_(get_num_instrs)(cb_in); i++) {
- u = VG_(get_instr)(cb_in, i);
-
- switch (u->opcode) {
- case NOP: case LOCK: case CALLM_S: case CALLM_E:
- break;
-
- case INCEIP:
- /* Count x86 instr */
- VG_(call_helper_0_0)(cb, (Addr) & add_one_x86_instr);
- VG_(copy_UInstr)(cb, u);
- break;
-
- case JMP:
- if (u->cond != CondAlways) {
- /* Count Jcc */
- VG_(call_helper_0_0)(cb, (Addr) & add_one_Jcc);
- VG_(copy_UInstr)(cb, u);
- /* Count non-taken Jcc */
- VG_(call_helper_0_0)(cb, (Addr) & add_one_Jcc_untaken);
- } else {
- VG_(copy_UInstr)(cb, u);
- }
- break;
-
- default:
- /* Count UInstr */
- VG_(call_helper_0_0)(cb, (Addr) & add_one_UInstr);
- VG_(copy_UInstr)(cb, u);
- break;
- }
- }
-
- VG_(free_UCodeBlock)(cb_in);
- return cb;
-#endif
}
static void lk_fini(Int exitcode)
{
- VG_(message)(Vg_UserMsg,
- "Counted %d calls to _dl_runtime_resolve()", n_dlrr_calls);
+ char percentify_buf[4]; /* Two digits, '%' and 0. */
+ const int percentify_size = sizeof(percentify_buf);
+ const int percentify_decs = 0;
+
+ tl_assert(lk_clo_fnname);
+ tl_assert(lk_clo_fnname[0]);
+ VG_(message)(Vg_UserMsg,
+ "Counted %,llu calls to %s()", n_func_calls, lk_clo_fnname);
- VG_(message)(Vg_UserMsg, "");
- VG_(message)(Vg_UserMsg, "Executed:");
- VG_(message)(Vg_UserMsg, " BBs: %u", n_BBs);
- VG_(message)(Vg_UserMsg, " guest instrs: %u", n_guest_instrs);
- VG_(message)(Vg_UserMsg, " UInstrs: %u", n_UInstrs);
+ VG_(message)(Vg_UserMsg, "");
+ VG_(message)(Vg_UserMsg, "Jccs:");
+ VG_(message)(Vg_UserMsg, " total: %,llu", n_Jccs);
+ VG_(percentify)((n_Jccs - n_Jccs_untaken), (n_Jccs ? n_Jccs : 1),
+ percentify_decs, percentify_size, percentify_buf);
+ VG_(message)(Vg_UserMsg, " taken: %,llu (%s)",
+ (n_Jccs - n_Jccs_untaken), percentify_buf);
+
+ VG_(message)(Vg_UserMsg, "");
+ VG_(message)(Vg_UserMsg, "Executed:");
+ VG_(message)(Vg_UserMsg, " BBs entered: %,llu", n_BBs_entered);
+ VG_(message)(Vg_UserMsg, " BBs completed: %,llu", n_BBs_completed);
+ VG_(message)(Vg_UserMsg, " guest instrs: %,llu", n_guest_instrs);
+ VG_(message)(Vg_UserMsg, " IRStmts: %,llu", n_IRStmts);
+
+ VG_(message)(Vg_UserMsg, "");
+ VG_(message)(Vg_UserMsg, "Ratios:");
+ tl_assert(n_BBs_entered); // Paranoia time.
+ VG_(message)(Vg_UserMsg, " guest instrs : BB entered = %3u : 10",
+ 10 * n_guest_instrs / n_BBs_entered);
+ VG_(message)(Vg_UserMsg, " IRStmts : BB entered = %3u : 10",
+ 10 * n_IRStmts / n_BBs_entered);
+ tl_assert(n_guest_instrs); // Paranoia time.
+ VG_(message)(Vg_UserMsg, " IRStmts : guest instr = %3u : 10",
+ 10 * n_IRStmts / n_guest_instrs);
- VG_(message)(Vg_UserMsg, "");
- VG_(message)(Vg_UserMsg, "Jccs:");
- VG_(message)(Vg_UserMsg, " total: %u", n_Jccs);
- VG_(message)(Vg_UserMsg, " %% taken: %u%%",
- (n_Jccs - n_Jccs_untaken)*100 /
- (n_Jccs ? n_Jccs : 1));
+ if (lk_clo_detailed_counts) {
+ VG_(message)(Vg_UserMsg, "");
+ VG_(message)(Vg_UserMsg, "IR-level counts by type:");
+ print_details();
+ }
- VG_(message)(Vg_UserMsg, "");
- VG_(message)(Vg_UserMsg, "Ratios:");
- VG_(message)(Vg_UserMsg, " guest instrs : BB = %3u : 10",
- 10 * n_guest_instrs / n_BBs);
- VG_(message)(Vg_UserMsg, " UInstrs : BB = %3u : 10",
- 10 * n_UInstrs / n_BBs);
- VG_(message)(Vg_UserMsg, " UInstrs : x86_instr = %3u : 10",
- 10 * n_UInstrs / n_guest_instrs);
-
- VG_(message)(Vg_UserMsg, "");
- VG_(message)(Vg_UserMsg, "Exit code: %d", exitcode);
+ VG_(message)(Vg_UserMsg, "");
+ VG_(message)(Vg_UserMsg, "Exit code: %d", exitcode);
}
static void lk_pre_clo_init(void)
@@ -288,6 +440,9 @@
VG_(basic_tool_funcs) (lk_post_clo_init,
lk_instrument,
lk_fini);
+ VG_(needs_command_line_options)(lk_process_cmd_line_option,
+ lk_print_usage,
+ lk_print_debug_usage);
}
VG_DETERMINE_INTERFACE_VERSION(lk_pre_clo_init)
@@ -295,4 +450,3 @@
/*--------------------------------------------------------------------*/
/*--- end lk_main.c ---*/
/*--------------------------------------------------------------------*/
-