An experiment in generating branch-prediction hints. Enable them with
--branchpred=yes. I'm interested to know if these make a significant
difference for anyone - I see a small speed increase on the Pentium M.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@2126 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 5153b4b..ff93b29 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -80,6 +80,26 @@
UPD_Both, /* both are current */
} eflags_state;
+/* What ia32 static prediction would guess for a jump: an unconditional
+   jump is always taken, a conditional one is assumed taken only when
+   it goes backwards.  Other implementations are more complex, which
+   is why the condition is passed in as well as the direction. */
+static JumpPred static_pred(Condcode cond, Int forward)
+{
+   JumpPred guess = JP_TAKEN;
+
+   if (cond != CondAlways && forward)
+      guess = JP_NOT_TAKEN;
+
+   return guess;
+}
+
+/* Suffix shown after a jump mnemonic when generated code is printed:
+   ",pt" for predict-taken, ",pn" for predict-not-taken, and the empty
+   string for JP_NONE or any unrecognised value. */
+static const Char *predstr(JumpPred p)
+{
+   if (p == JP_TAKEN)
+      return ",pt";
+   if (p == JP_NOT_TAKEN)
+      return ",pn";
+   return "";
+}
+
/* single site for resetting state */
static void reset_state(void)
{
@@ -1937,8 +1957,9 @@
static inline Int mk_tgt(Int state, Int addr)
{
- vg_assert(state == TGT_UNDEF
- || state == TGT_FORWARD || state == TGT_BACKWARD);
+ vg_assert(state == TGT_UNDEF ||
+ state == TGT_FORWARD ||
+ state == TGT_BACKWARD);
vg_assert((addr & 0xffff0000) == 0);
return state | addr;
@@ -1997,26 +2018,38 @@
/* Emit a jump short with an 8-bit signed offset. Note that the
offset is that which should be added to %eip once %eip has been
advanced over this insn. */
-void VG_(emit_jcondshort_delta) ( Bool simd_flags, Condcode cond, Int delta )
+void VG_(emit_jcondshort_delta) ( Bool simd_flags, Condcode cond, Int delta, JumpPred pred )
{
vg_assert(delta >= -128 && delta <= 127);
VG_(new_emit)(simd_flags, FlagsOSZCP, FlagsEmpty);
+
+ /* Only spend a prefix byte when hints are enabled, the caller has
+    an opinion, and that opinion differs from what static prediction
+    would pick anyway.  Only a negative delta is a backward jump; a
+    delta of zero counts as forward, matching the _target variant,
+    which treats every target that is not TGT_BACKWARD as forward. */
+ if (VG_(clo_branchpred) &&
+ pred != JP_NONE &&
+ pred != static_pred(cond, delta >= 0))
+ VG_(emitB)(pred == JP_TAKEN ? 0x3e : 0x2e); /* taken : not-taken hint prefix */
+
VG_(emitB) ( 0x70 + (UInt)cond );
VG_(emitB) ( (UChar)delta );
if (dis)
- VG_(printf)( "\n\t\tj%s-8\t%%eip+%d\n",
- VG_(name_UCondcode)(cond), delta );
+ /* the trace shows the requested prediction, whether or not a
+    prefix byte was actually emitted above */
+ VG_(printf)( "\n\t\tj%s-8%s\t%%eip+%d\n",
+ VG_(name_UCondcode)(cond), predstr(pred), delta );
}
/* Same as above, but defers emitting the delta */
-void VG_(emit_jcondshort_target) ( Bool simd, Condcode cond, Int *tgt )
+void VG_(emit_jcondshort_target) ( Bool simd, Condcode cond, Int *tgt, JumpPred pred )
{
VG_(new_emit)(simd, FlagsOSZCP, FlagsEmpty);
+
+ /* A hint prefix is emitted only when hints are enabled, the caller
+    supplied a prediction, and static prediction would disagree. */
+ if (VG_(clo_branchpred) && pred != JP_NONE) {
+    /* every target that is not known to be backward counts as forward */
+    Int forward = tgt_state(*tgt) != TGT_BACKWARD;
+
+    if (pred != static_pred(cond, forward))
+       VG_(emitB)(pred == JP_TAKEN ? 0x3e : 0x2e);
+ }
+
VG_(emitB) ( 0x70 + (UInt)cond );
VG_(emit_target_delta) (tgt);
if (dis)
- VG_(printf)( "\n\t\tj%s-8\t%%eip+(%d)\n",
- VG_(name_UCondcode)(cond), tgt_addr(*tgt) );
+ /* the trace shows the requested prediction, prefix or not */
+ VG_(printf)( "\n\t\tj%s-8%s\t%%eip+(%d)\n",
+ VG_(name_UCondcode)(cond), predstr(pred), tgt_addr(*tgt) );
}
@@ -2614,12 +2647,12 @@
if (cond == CondLE) {
/* test Z */
- VG_(emit_jcondshort_target)(False, CondS, &tgt_jump);
+ VG_(emit_jcondshort_target)(False, CondS, &tgt_jump, JP_NONE);
/* test OF != SF */
cond = CondP;
} else {
/* test Z */
- VG_(emit_jcondshort_target)(False, CondS, &tgt2);
+ VG_(emit_jcondshort_target)(False, CondS, &tgt2, JP_NONE);
/* test OF == SF */
cond = CondNP;
}
@@ -2701,7 +2734,7 @@
}
}
- VG_(emit_jcondshort_target) ( simd, cond, &tgt );
+ VG_(emit_jcondshort_target) ( simd, cond, &tgt, JP_NONE );
VG_(target_forward)(&tgt_jump);
synth_jmp_lit ( addr, JmpBoring );
@@ -2720,7 +2753,7 @@
VG_(emit_cmpl_zero_reg) ( False, reg );
- VG_(emit_jcondshort_target) ( False, CondNZ, &tgt );
+ VG_(emit_jcondshort_target) ( False, CondNZ, &tgt, JP_NONE );
synth_jmp_lit ( addr, JmpBoring );
VG_(target_forward)(&tgt);
@@ -3234,7 +3267,7 @@
VG_(init_target)(&tgt);
- VG_(emit_jcondshort_target) ( True, invertCondition(cond), &tgt);
+ VG_(emit_jcondshort_target) ( True, invertCondition(cond), &tgt, JP_NONE);
emit_movl_reg_reg ( src, dst );
VG_(target_forward)(&tgt);
@@ -4286,16 +4319,19 @@
if (dis) VG_(printf)("Generated x86 code:\n");
- /* Generate decl VG_(dispatch_ctr) and drop into dispatch if we hit
+ /* Generate subl $1, VG_(dispatch_ctr) and drop into dispatch if we hit
zero. We have to do this regardless of whether we're t-chaining
- or not. */
+ or not. (The ia32 optimisation guide recommends sub over dec.) */
VG_(init_target)(&tgt);
VG_(new_emit)(False, FlagsEmpty, FlagsOSZAP);
- VG_(emitB) (0xFF); /* decl */
- emit_amode_litmem_reg((Addr)&VG_(dispatch_ctr), 1);
+ VG_(emitB) (0x83); /* subl */
+ emit_amode_litmem_reg((Addr)&VG_(dispatch_ctr), 5);
+ VG_(emitB) (0x01);
+
if (dis)
- VG_(printf)("\n\t\tdecl (%p)\n", &VG_(dispatch_ctr));
- VG_(emit_jcondshort_target)(False, CondNZ, &tgt);
+ VG_(printf)("\n\t\tsubl $1, (%p)\n", &VG_(dispatch_ctr));
+
+ VG_(emit_jcondshort_target)(False, CondNZ, &tgt, JP_TAKEN);
VG_(emit_movv_lit_reg) ( 4, VG_TRC_INNER_COUNTERZERO, R_EBP );
emit_ret();
VG_(target_forward)(&tgt);
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index 9c72831..179e41f 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -270,6 +270,8 @@
extern Bool VG_(clo_run_libc_freeres);
/* Use the basic-block chaining optimisation? Default: YES */
extern Bool VG_(clo_chain_bb);
+/* Generate branch-prediction hints? */
+extern Bool VG_(clo_branchpred);
/* Continue stack traces below main()? Default: NO */
extern Bool VG_(clo_show_below_main);
/* Test each client pointer dereference to check it's within the
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index b409eea..9222f2e 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -586,6 +586,7 @@
Bool VG_(clo_chain_bb) = True;
Bool VG_(clo_show_below_main) = False;
Bool VG_(clo_pointercheck) = True;
+Bool VG_(clo_branchpred) = False;
static Bool VG_(clo_wait_for_gdb) = False;
@@ -692,6 +693,7 @@
" --optimise=no|yes improve intermediate code? [yes]\n"
" --profile=no|yes profile? (tool must be built for it) [no]\n"
" --chain-bb=no|yes do basic-block chaining? [yes]\n"
+" --branchpred=yes|no generate branch prediction hints [no]\n"
" --trace-codegen=<XXXXX> show generated code? (X = 0|1) [00000]\n"
" --trace-syscalls=no|yes show all system calls? [no]\n"
" --trace-signals=no|yes show signal handling details? [no]\n"
@@ -896,6 +898,11 @@
else if (VG_CLO_STREQ(argv[i], "--chain-bb=no"))
VG_(clo_chain_bb) = False;
+ else if (VG_CLO_STREQ(argv[i], "--branchpred=yes"))
+ VG_(clo_branchpred) = True;
+ else if (VG_CLO_STREQ(argv[i], "--branchpred=no"))
+ VG_(clo_branchpred) = False;
+
else if (VG_CLO_STREQ(argv[i], "--single-step=yes"))
VG_(clo_single_step) = True;
else if (VG_CLO_STREQ(argv[i], "--single-step=no"))
diff --git a/include/vg_skin.h.base b/include/vg_skin.h.base
index 1477f98..cfee479 100644
--- a/include/vg_skin.h.base
+++ b/include/vg_skin.h.base
@@ -1315,8 +1315,14 @@
extern void VG_(target_forward) ( Int *tgt );
extern void VG_(emit_target_delta) ( Int *tgt );
-extern void VG_(emit_jcondshort_delta) ( Bool simd_cc, Condcode cond, Int delta );
-extern void VG_(emit_jcondshort_target)( Bool simd_cc, Condcode cond, Int *tgt );
+/* Caller's expectation of how a conditional jump will go, passed to
+   the jump emitters declared below. */
+typedef enum {
+   JP_NONE = 0,      /* caller has no expectation */
+   JP_TAKEN = 1,     /* jump is expected to be taken */
+   JP_NOT_TAKEN = 2  /* jump is expected to fall through */
+} JumpPred;
+
+/* pred is the caller's expected outcome of the jump; pass JP_NONE when
+   there is no expectation. */
+extern void VG_(emit_jcondshort_delta) ( Bool simd_cc, Condcode cond, Int delta, JumpPred pred );
+extern void VG_(emit_jcondshort_target)( Bool simd_cc, Condcode cond, Int *tgt, JumpPred pred );
/*====================================================================*/
diff --git a/memcheck/mc_from_ucode.c b/memcheck/mc_from_ucode.c
index 6cfe815..683f6ae 100644
--- a/memcheck/mc_from_ucode.c
+++ b/memcheck/mc_from_ucode.c
@@ -166,6 +166,8 @@
static void synth_TESTV ( Int sz, Int tag, Int val )
{
+ Int tgt; /* jump target */
+
/* Important note. Note that that the calls to
MC_(helper_value_check[0124]_fail) must be compact helpers due to
the codegen scheme used below. Since there are a shortage of
@@ -173,6 +175,8 @@
actually used, we assert against it. */
sk_assert(sz == 0 || sz == 2 || sz == 4);
+ VG_(init_target)(&tgt);
+
sk_assert(tag == ArchReg || tag == RealReg);
if (tag == ArchReg) {
switch (sz) {
@@ -222,9 +226,12 @@
VG_(skin_panic)("synth_TESTV(RealReg)");
}
}
- VG_(emit_jcondshort_delta) ( False, CondZ, 3 );
+
+ /* predict taken because we assume failures are rare */
+ VG_(emit_jcondshort_target) ( False, CondZ, &tgt, JP_TAKEN );
+
VG_(synth_call) (
- True, /* needed to guarantee that this insn is indeed 3 bytes long */
+ False,
( sz==4
? VG_(helper_offset)((Addr) & MC_(helper_value_check4_fail))
: ( sz==2
@@ -234,6 +241,7 @@
: VG_(helper_offset)((Addr) & MC_(helper_value_check0_fail))))),
False, FlagsEmpty, FlagsOSZACP /* helpers don't preserve flags */
);
+ VG_(target_forward)(&tgt);
}