An experiment in generating branch-prediction hints. Enable them with
--branchpred=yes. I'm interested to know if these make a significant
difference for anyone - I see a small speed increase on the Pentium M.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@2126 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 5153b4b..ff93b29 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -80,6 +80,26 @@
UPD_Both, /* both are current */
} eflags_state;
+/* ia32 static prediction is very simple: forward branches are predicted
+ not taken, backward ones taken. Other implementations are more complex, so we take the condition as a parameter anyway. */
+static JumpPred static_pred(Condcode cond, Int forward)
+{
+ if (cond == CondAlways)
+ return JP_TAKEN;
+
+ return forward ? JP_NOT_TAKEN : JP_TAKEN;
+}
+
+static const Char *predstr(JumpPred p)
+{
+ switch(p) {
+ default:
+ case JP_NONE: return "";
+ case JP_TAKEN: return ",pt";
+ case JP_NOT_TAKEN: return ",pn";
+ }
+}
+
/* single site for resetting state */
static void reset_state(void)
{
@@ -1937,8 +1957,9 @@
static inline Int mk_tgt(Int state, Int addr)
{
- vg_assert(state == TGT_UNDEF
- || state == TGT_FORWARD || state == TGT_BACKWARD);
+ vg_assert(state == TGT_UNDEF ||
+ state == TGT_FORWARD ||
+ state == TGT_BACKWARD);
vg_assert((addr & 0xffff0000) == 0);
return state | addr;
@@ -1997,26 +2018,38 @@
/* Emit a jump short with an 8-bit signed offset. Note that the
offset is that which should be added to %eip once %eip has been
advanced over this insn. */
-void VG_(emit_jcondshort_delta) ( Bool simd_flags, Condcode cond, Int delta )
+void VG_(emit_jcondshort_delta) ( Bool simd_flags, Condcode cond, Int delta, JumpPred pred )
{
vg_assert(delta >= -128 && delta <= 127);
VG_(new_emit)(simd_flags, FlagsOSZCP, FlagsEmpty);
+
+ if (VG_(clo_branchpred) &&
+ pred != JP_NONE &&
+ pred != static_pred(cond, delta > 0))
+ VG_(emitB)(pred == JP_TAKEN ? 0x3e : 0x2e);
+
VG_(emitB) ( 0x70 + (UInt)cond );
VG_(emitB) ( (UChar)delta );
if (dis)
- VG_(printf)( "\n\t\tj%s-8\t%%eip+%d\n",
- VG_(name_UCondcode)(cond), delta );
+ VG_(printf)( "\n\t\tj%s-8%s\t%%eip+%d\n",
+ VG_(name_UCondcode)(cond), predstr(pred), delta );
}
/* Same as above, but defers emitting the delta */
-void VG_(emit_jcondshort_target) ( Bool simd, Condcode cond, Int *tgt )
+void VG_(emit_jcondshort_target) ( Bool simd, Condcode cond, Int *tgt, JumpPred pred )
{
VG_(new_emit)(simd, FlagsOSZCP, FlagsEmpty);
+
+ if (VG_(clo_branchpred) &&
+ pred != JP_NONE &&
+ pred != static_pred(cond, tgt_state(*tgt) != TGT_BACKWARD))
+ VG_(emitB)(pred == JP_TAKEN ? 0x3e : 0x2e);
+
VG_(emitB) ( 0x70 + (UInt)cond );
VG_(emit_target_delta) (tgt);
if (dis)
- VG_(printf)( "\n\t\tj%s-8\t%%eip+(%d)\n",
- VG_(name_UCondcode)(cond), tgt_addr(*tgt) );
+ VG_(printf)( "\n\t\tj%s-8%s\t%%eip+(%d)\n",
+ VG_(name_UCondcode)(cond), predstr(pred), tgt_addr(*tgt) );
}
@@ -2614,12 +2647,12 @@
if (cond == CondLE) {
/* test Z */
- VG_(emit_jcondshort_target)(False, CondS, &tgt_jump);
+ VG_(emit_jcondshort_target)(False, CondS, &tgt_jump, JP_NONE);
/* test OF != SF */
cond = CondP;
} else {
/* test Z */
- VG_(emit_jcondshort_target)(False, CondS, &tgt2);
+ VG_(emit_jcondshort_target)(False, CondS, &tgt2, JP_NONE);
/* test OF == SF */
cond = CondNP;
}
@@ -2701,7 +2734,7 @@
}
}
- VG_(emit_jcondshort_target) ( simd, cond, &tgt );
+ VG_(emit_jcondshort_target) ( simd, cond, &tgt, JP_NONE );
VG_(target_forward)(&tgt_jump);
synth_jmp_lit ( addr, JmpBoring );
@@ -2720,7 +2753,7 @@
VG_(emit_cmpl_zero_reg) ( False, reg );
- VG_(emit_jcondshort_target) ( False, CondNZ, &tgt );
+ VG_(emit_jcondshort_target) ( False, CondNZ, &tgt, JP_NONE );
synth_jmp_lit ( addr, JmpBoring );
VG_(target_forward)(&tgt);
@@ -3234,7 +3267,7 @@
VG_(init_target)(&tgt);
- VG_(emit_jcondshort_target) ( True, invertCondition(cond), &tgt);
+ VG_(emit_jcondshort_target) ( True, invertCondition(cond), &tgt, JP_NONE);
emit_movl_reg_reg ( src, dst );
VG_(target_forward)(&tgt);
@@ -4286,16 +4319,19 @@
if (dis) VG_(printf)("Generated x86 code:\n");
- /* Generate decl VG_(dispatch_ctr) and drop into dispatch if we hit
+ /* Generate subl $1, VG_(dispatch_ctr) and drop into dispatch if we hit
zero. We have to do this regardless of whether we're t-chaining
- or not. */
+ or not. (The ia32 optimisation guide recommends sub over dec.) */
VG_(init_target)(&tgt);
VG_(new_emit)(False, FlagsEmpty, FlagsOSZAP);
- VG_(emitB) (0xFF); /* decl */
- emit_amode_litmem_reg((Addr)&VG_(dispatch_ctr), 1);
+ VG_(emitB) (0x83); /* subl */
+ emit_amode_litmem_reg((Addr)&VG_(dispatch_ctr), 5);
+ VG_(emitB) (0x01);
+
if (dis)
- VG_(printf)("\n\t\tdecl (%p)\n", &VG_(dispatch_ctr));
- VG_(emit_jcondshort_target)(False, CondNZ, &tgt);
+ VG_(printf)("\n\t\tsubl $1, (%p)\n", &VG_(dispatch_ctr));
+
+ VG_(emit_jcondshort_target)(False, CondNZ, &tgt, JP_TAKEN);
VG_(emit_movv_lit_reg) ( 4, VG_TRC_INNER_COUNTERZERO, R_EBP );
emit_ret();
VG_(target_forward)(&tgt);