Merge patch from JeremyF (with a little added paranoia, for this one
could potentially cause hard-to-find code generation bugs):
00-lazy-fp
This patch implements lazy FPU state save and restore, which improves
the performance of FPU-intensive code by a factor of 15 or so. [when
running without any instrumentatation, that is.]
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1335 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index d253c3d..feff35a 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1805,18 +1805,14 @@
UChar second_byte_masked,
Int reg )
{
- emit_get_fpu_state();
emit_fpu_regmem ( first_byte, second_byte_masked, reg );
- emit_put_fpu_state();
}
static void synth_fpu_no_mem ( UChar first_byte,
UChar second_byte )
{
- emit_get_fpu_state();
emit_fpu_no_mem ( first_byte, second_byte );
- emit_put_fpu_state();
}
@@ -1961,7 +1957,16 @@
}
-static void emitUInstr ( UCodeBlock* cb, Int i, RRegSet regs_live_before )
+/* fplive==True indicates that the simulated machine's FPU state is in
+ the real FPU. If so we need to be very careful not to trash it.
+ If FPU state is live and we deem it necessary to copy it back to
+ the simulated machine's FPU state, we do so. The final state of
+ fpliveness is returned. In short we _must_ do put_fpu_state if
+ there is any chance at all that the code generated for a UInstr
+ will change the real FPU state.
+*/
+static Bool emitUInstr ( UCodeBlock* cb, Int i, RRegSet regs_live_before,
+ Bool fplive )
{
Int old_emitted_code_used;
UInstr* u = &cb->instrs[i];
@@ -1969,15 +1974,6 @@
if (dis)
VG_(pp_UInstr_regs)(i, u);
-# if 0
- if (0&& VG_(translations_done) >= 600) {
- Bool old_dis = dis;
- dis = False;
- synth_OINK(i);
- dis = old_dis;
- }
-# endif
-
old_emitted_code_used = emitted_code_used;
switch (u->opcode) {
@@ -2208,6 +2204,11 @@
vg_assert(u->tag2 == RealReg);
vg_assert(u->size == 0);
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
+
VG_(synth_ccall) ( (Addr) & VG_(do_useseg),
2, /* args */
0, /* regparms_n */
@@ -2294,6 +2295,10 @@
case JMP: {
vg_assert(u->tag2 == NoValue);
vg_assert(u->tag1 == RealReg || u->tag1 == Literal);
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
if (u->cond == CondAlways) {
switch (u->tag1) {
case RealReg:
@@ -2327,6 +2332,10 @@
vg_assert(u->tag1 == RealReg);
vg_assert(u->tag2 == Literal);
vg_assert(u->size == 4);
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
synth_jmp_ifzero_reg_lit ( u->val1, u->lit32 );
break;
@@ -2346,6 +2355,10 @@
vg_assert(u->tag1 == Lit16);
vg_assert(u->tag2 == NoValue);
vg_assert(u->size == 0);
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
if (anyFlagUse ( u ))
emit_get_eflags();
VG_(synth_call) ( False, u->val1 );
@@ -2370,6 +2383,10 @@
else vg_assert(u->tag3 == NoValue);
vg_assert(u->size == 0);
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
VG_(synth_ccall) ( u->lit32, u->argc, u->regparms_n, argv, tagv,
ret_reg, regs_live_before, u->regs_live_after );
break;
@@ -2392,6 +2409,10 @@
case FPU_W:
vg_assert(u->tag1 == Lit16);
vg_assert(u->tag2 == RealReg);
+ if (!fplive) {
+ emit_get_fpu_state();
+ fplive = True;
+ }
synth_fpu_regmem ( (u->val1 >> 8) & 0xFF,
u->val1 & 0xFF,
u->val2 );
@@ -2402,6 +2423,10 @@
vg_assert(u->tag2 == NoValue);
if (anyFlagUse ( u ))
emit_get_eflags();
+ if (!fplive) {
+ emit_get_fpu_state();
+ fplive = True;
+ }
synth_fpu_no_mem ( (u->val1 >> 8) & 0xFF,
u->val1 & 0xFF );
if (writeFlagUse ( u ))
@@ -2409,9 +2434,13 @@
break;
default:
- if (VG_(needs).extended_UCode)
+ if (VG_(needs).extended_UCode) {
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
SK_(emit_XUInstr)(u, regs_live_before);
- else {
+ } else {
VG_(printf)("\nError:\n"
" unhandled opcode: %u. Perhaps "
" VG_(needs).extended_UCode should be set?\n",
@@ -2421,10 +2450,17 @@
}
}
+ if (0 && fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
+
/* Update UInstr histogram */
vg_assert(u->opcode < 100);
histogram[u->opcode].counts++;
histogram[u->opcode].size += (emitted_code_used - old_emitted_code_used);
+
+ return fplive;
}
@@ -2434,13 +2470,15 @@
{
Int i;
UChar regs_live_before = 0; /* No regs live at BB start */
-
+ Bool fplive;
+
emitted_code_used = 0;
emitted_code_size = 500; /* reasonable initial size */
emitted_code = VG_(arena_malloc)(VG_AR_JITTER, emitted_code_size);
if (dis) VG_(printf)("Generated x86 code:\n");
+ fplive = False;
for (i = 0; i < cb->used; i++) {
UInstr* u = &cb->instrs[i];
if (cb->instrs[i].opcode != NOP) {
@@ -2452,11 +2490,12 @@
VG_(up_UInstr)( i, u );
}
vg_assert(sane);
- emitUInstr( cb, i, regs_live_before );
+ fplive = emitUInstr( cb, i, regs_live_before, fplive );
}
regs_live_before = u->regs_live_after;
}
if (dis) VG_(printf)("\n");
+ vg_assert(!fplive); /* FPU state must be saved by end of BB */
/* Returns a pointer to the emitted code. This will have to be
copied by the caller into the translation cache, and then freed */