Merge in a somewhat modified patch version of Jeremy Fitzhardinge's
translation chaining patch.
47-chained-bb
This implements basic-block chaining. Rather than always going through
the dispatch loop, a BB may jump directly to a successor BB if it is
present in the translation cache.
When the BB's code is first generated, the jumps to the successor BBs
are filled with undefined instructions. When the BB is inserted into
the translation cache, the undefined instructions are replaced with a
call to VG_(patch_me). When VG_(patch_me) is called, it looks up the
desired target address in the fast translation cache. If present, it
backpatches the call to patch_me with a jump to the translated target
BB. If the fast lookup fails, it falls back into the normal dispatch
loop.
When the parts of the translation cache are discarded, all translations
are unchained, so as to ensure we don't have direct jumps to code which
has been thrown away.
This optimisation only has effect on direct jumps; indirect jumps
(including returns) still go through the dispatch loop. The -v stats
indicate a worst-case rate of about 16% of jumps having to go via the
slow mechanism. This will be a combination of function returns and
genuine indirect jumps.
Certain parts of the dispatch loop's actions have to be moved into
each basic block; namely: updating the virtual EIP and keeping track
of the basic block counter.
At present, basic block chaining seems to improve performance by up to
25% with --skin=none. Gains for skins adding more instrumentation
will be correspondingly smaller.
There is a command line option: --chain-bb=yes|no (defaults to yes).
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1336 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_constants.h b/coregrind/vg_constants.h
index 926fa01..4bc4351 100644
--- a/coregrind/vg_constants.h
+++ b/coregrind/vg_constants.h
@@ -53,6 +53,17 @@
#define VG_TRC_INNER_COUNTERZERO 29 /* TRC only; means bb ctr == 0 */
#define VG_TRC_UNRESUMABLE_SIGNAL 37 /* TRC only; got sigsegv/sigbus */
+/* size of call instruction put into generated code at jump sites */
+#define VG_PATCHME_CALLSZ 5
+
+/* size of jmp instruction which overwrites the call */
+#define VG_PATCHME_JMPSZ 5
+
+/* maximum number of normal jumps which can appear in a basic block */
+#define VG_MAX_JUMPS 2
+
+/* Offset of code in a TCEntry */
+#define VG_CODE_OFFSET (8 + VG_MAX_JUMPS * 2)
/* Debugging hack for assembly code ... sigh. */
#if 0
diff --git a/coregrind/vg_dispatch.S b/coregrind/vg_dispatch.S
index 195e290..08ae86b 100644
--- a/coregrind/vg_dispatch.S
+++ b/coregrind/vg_dispatch.S
@@ -50,13 +50,20 @@
*/
+#define TT_LOOKUP(reg, fail) \
+ movl %eax, reg; \
+ andl $VG_TT_FAST_MASK, reg; \
+ movl VG_(tt_fast)(,reg,4), reg; \
+ cmpl %eax, (reg); \
+ jnz fail
+
/* The C world needs a way to get started simulating. So we provide
a function void vg_run_innerloop ( void ), which starts running
from vg_m_eip, and exits when the counter reaches zero. This loop
can also exit if vg_oursignalhandler() catches a non-resumable
signal, for example SIGSEGV. It then longjmp()s back past here.
*/
-
+
.globl VG_(run_innerloop)
VG_(run_innerloop):
/* OYNK(1000) */
@@ -101,47 +108,16 @@
/* save the jump address at VG_(baseBlock)[VGOFF_(m_eip)] */
movl VGOFF_(m_eip), %esi
movl %eax, (%ebp, %esi, 4)
- /* Are we out of timeslice? If yes, defer to scheduler. */
- decl VG_(dispatch_ctr)
- jz counter_is_zero
- /* try a fast lookup in the translation cache */
- movl %eax, %ebx
- andl $VG_TT_FAST_MASK, %ebx
- /* ebx = tt_fast index */
- movl VG_(tt_fast)(,%ebx,4), %ebx
- /* ebx points at a tc entry
- now compare target with the tce.orig_addr field (+0) */
- cmpl %eax, (%ebx)
- jnz fast_lookup_failed
-
- /* Found a match. Call the tce.payload field (+8) */
- addl $8, %ebx
- call *%ebx
-
- cmpl $VG_(baseBlock), %ebp
- jnz dispatch_exceptional
-
-dispatch_boring_unroll2:
- /* save the jump address at VG_(baseBlock)[VGOFF_(m_eip)] */
- movl VGOFF_(m_eip), %esi
- movl %eax, (%ebp, %esi, 4)
/* Are we out of timeslice? If yes, defer to scheduler. */
- decl VG_(dispatch_ctr)
+ cmpl $0, VG_(dispatch_ctr)
jz counter_is_zero
-
/* try a fast lookup in the translation cache */
- movl %eax, %ebx
- andl $VG_TT_FAST_MASK, %ebx
- /* ebx = tt_fast index */
- movl VG_(tt_fast)(,%ebx,4), %ebx
- /* ebx points at a tc entry
- now compare target with the tce.orig_addr field (+0) */
- cmpl %eax, (%ebx)
- jnz fast_lookup_failed
+ TT_LOOKUP(%ebx, fast_lookup_failed)
- /* Found a match. Call the tce.payload field (+8) */
- addl $8, %ebx
+ /* Found a match. Call the tce.payload field (+VG_CODE_OFFSET) */
+ addl $VG_CODE_OFFSET, %ebx
+ incl VG_(unchained_jumps_done) /* update stats */
call *%ebx
cmpl $VG_(baseBlock), %ebp
@@ -180,7 +156,9 @@
jz dispatch_syscall
cmpl $VG_TRC_EBP_JMP_CLIENTREQ, %ebp
jz dispatch_clientreq
-
+ cmpl $VG_TRC_INNER_COUNTERZERO, %ebp
+ jz counter_is_zero
+
/* ebp has an invalid value ... crap out. */
pushl $panic_msg_ebp
call VG_(core_panic)
@@ -202,6 +180,34 @@
movl $VG_TRC_EBP_JMP_CLIENTREQ, %eax
jmp run_innerloop_exit
+
+/* This is the translation chainer, our run-time linker, if you like.
+
+ This enters with %eax pointing to next eip we want. If
+ we've already compiled that eip (ie, get a fast hit), we
+ backpatch the call instruction with a jump, and jump there.
+ Otherwise, we do a slow hit/compile through the normal path
+ (and get to do a backpatch next time through).
+*/
+.globl VG_(patch_me)
+VG_(patch_me):
+ /* try a fast lookup in the translation cache */
+ TT_LOOKUP(%ebx, 1f)
+
+ /* Patch call instruction at callsite into a chained jmp */
+ popl %eax /* eax = just after (VG_PATCHME_CALLSZ byte) call */
+ addl $VG_CODE_OFFSET, %ebx /* ebx = target eip */
+ subl %eax, %ebx /* ebx = delta */
+ movb $0xE9, -(VG_PATCHME_CALLSZ-0)(%eax) /* 0xe9 = jmp */
+ movl %ebx, -(VG_PATCHME_CALLSZ-1)(%eax) /* store delta */
+ addl %eax, %ebx
+ incl VG_(bb_enchain_count) /* update stats */
+ jmp *%ebx /* jmp to dest */
+
+ /* tt_fast miss: return into main dispatch loop */
+1: addl $4, %esp /* remove our call address */
+ ret /* return into main dispatch loop above */
+
.data
panic_msg_ebp:
.ascii "vg_dispatch: %ebp has invalid value!"
diff --git a/coregrind/vg_errcontext.c b/coregrind/vg_errcontext.c
index f00e6b6..004110a 100644
--- a/coregrind/vg_errcontext.c
+++ b/coregrind/vg_errcontext.c
@@ -416,7 +416,7 @@
if ((i+1 == VG_(clo_dump_error))) {
VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to below NULLs */,
- p_min->where->eips[0], NULL, NULL, NULL );
+ p_min->where->eips[0], NULL, NULL, NULL, NULL );
}
p_min->count = 1 << 30;
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index feff35a..11843ee 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -69,6 +69,10 @@
static Int emitted_code_used;
static Int emitted_code_size;
+/* offsets (in bytes into the basic block) of the chained-jump sites */
+static UShort jumps[VG_MAX_JUMPS];
+static Int jumpidx;
+
/* Statistics about C functions called from generated code. */
static UInt ccalls = 0;
static UInt ccall_reg_saves = 0;
@@ -1040,6 +1044,110 @@
VG_(printf)("\n\t\tret\n");
}
+/* Predicate used in sanity checks elsewhere - returns true if any
+ jump-site is an actual chained jump */
+Bool VG_(is_chained_jumpsite)(Addr a)
+{
+ UChar *cp = (UChar *)a;
+
+ return (*cp == 0xE9); /* 0xE9 -- jmp */
+}
+
+/* Predicate used in sanity checks elsewhere - returns true if all
+ jump-sites are calls to VG_(patch_me) */
+Bool VG_(is_unchained_jumpsite)(Addr a)
+{
+ UChar *cp = (UChar *)a;
+ Int delta = ((Addr)&VG_(patch_me)) - (a + VG_PATCHME_CALLSZ);
+ Int idelta;
+
+ if (*cp++ != 0xE8) /* 0xE8 == call */
+ return False;
+
+ idelta = (*cp++) << 0;
+ idelta |= (*cp++) << 8;
+ idelta |= (*cp++) << 16;
+ idelta |= (*cp++) << 24;
+
+ return idelta == delta;
+}
+
+/* Return target address for a direct jmp */
+Addr VG_(get_jmp_dest)(Addr a)
+{
+ Int delta;
+ UChar *cp = (UChar *)a;
+
+ if (*cp++ != 0xE9) /* 0xE9 == jmp */
+ return 0;
+
+ delta = (*cp++) << 0;
+ delta |= (*cp++) << 8;
+ delta |= (*cp++) << 16;
+ delta |= (*cp++) << 24;
+
+ return a + VG_PATCHME_JMPSZ + delta;
+}
+
+/* unchain a BB by generating a call to VG_(patch_me) */
+void VG_(unchain_jumpsite)(Addr a)
+{
+ Int delta = ((Addr)&VG_(patch_me)) - (a + VG_PATCHME_CALLSZ);
+ UChar *cp = (UChar *)a;
+
+ if (VG_(is_unchained_jumpsite)(a))
+ return; /* don't write unnecessarily */
+
+ *cp++ = 0xE8; /* call */
+ *cp++ = (delta >> 0) & 0xff;
+ *cp++ = (delta >> 8) & 0xff;
+ *cp++ = (delta >> 16) & 0xff;
+ *cp++ = (delta >> 24) & 0xff;
+ VG_(bb_dechain_count)++; /* update stats */
+}
+
+/* This doesn't actually generate a call to VG_(patch_me), but
+ reserves enough space in the instruction stream for it to happen
+ and records the offset into the jump table. This is because call
+ is a relative jump, and so will be affected when this code gets
+ moved about. The translation table will "unchain" this basic block
+ on insertion (with VG_(unchain_BB)()), and thereby generate a
+ proper call instruction. */
+static void emit_call_patchme( void )
+{
+ vg_assert(VG_PATCHME_CALLSZ == 5);
+
+ VG_(new_emit)();
+
+ if (jumpidx >= VG_MAX_JUMPS) {
+      /* If there are too many jumps in this basic block, fall back to
+ dispatch loop. We still need to keep it the same size as the
+ call sequence. */
+ VG_(emitB) ( 0xC3 ); /* ret */
+ VG_(emitB) ( 0x90 ); /* nop */
+ VG_(emitB) ( 0x90 ); /* nop */
+ VG_(emitB) ( 0x90 ); /* nop */
+ VG_(emitB) ( 0x90 ); /* nop */
+
+ if (dis)
+ VG_(printf)("\n\t\tret; nop; nop; nop; nop\n");
+
+ if (0 && VG_(clo_verbosity))
+ VG_(message)(Vg_DebugMsg, "too many chained jumps in basic-block");
+ } else {
+ jumps[jumpidx++] = emitted_code_used;
+
+ VG_(emitB) ( 0x0F ); /* UD2 - undefined instruction */
+ VG_(emitB) ( 0x0B );
+ VG_(emitB) ( 0x0F ); /* UD2 - undefined instruction */
+ VG_(emitB) ( 0x0B );
+ VG_(emitB) ( 0x90 ); /* NOP */
+
+ if (dis)
+ VG_(printf)("\n\t\tud2; ud2; nop\n");
+ }
+}
+
void VG_(emit_pushal) ( void )
{
VG_(new_emit)();
@@ -1410,13 +1518,20 @@
emit_ret();
}
+static void synth_mov_reg_offregmem ( Int size, Int reg, Int off, Int areg );
/* Same deal as synth_jmp_reg. */
static void synth_jmp_lit ( Addr addr, JmpKind jmpkind )
{
- load_ebp_from_JmpKind ( jmpkind );
VG_(emit_movv_lit_reg) ( 4, addr, R_EAX );
- emit_ret();
+
+ if (VG_(clo_chain_bb) && (jmpkind == JmpBoring || jmpkind == JmpCall)) {
+ synth_mov_reg_offregmem(4, R_EAX, 4*VGOFF_(m_eip), R_EBP); /* update EIP */
+ emit_call_patchme();
+ } else {
+ load_ebp_from_JmpKind ( jmpkind );
+ emit_ret();
+ }
}
@@ -1436,7 +1551,23 @@
6 xyxyxy:
*/
emit_get_eflags();
- VG_(emit_jcondshort_delta) ( invertCondition(cond), 5+1 );
+ if (VG_(clo_chain_bb)) {
+ /* When using BB chaining, the jump sequence is:
+ jmp short if not cond to xyxyxy
+ addr -> eax
+ call VG_(patch_me)/jmp target
+ xyxyxy
+
+ je 1f
+ mov $0x4000d190,%eax // 5
+ mov %eax, VGOFF_(m_eip)(%ebp) // 3
+ call 0x40050f9a <vgPlain_patch_me> // 5
+ 1: mov $0x4000d042,%eax
+ call 0x40050f9a <vgPlain_patch_me>
+ */
+ VG_(emit_jcondshort_delta) ( invertCondition(cond), 5+3+5 );
+ } else
+ VG_(emit_jcondshort_delta) ( invertCondition(cond), 5+1 );
synth_jmp_lit ( addr, JmpBoring );
}
@@ -1450,7 +1581,10 @@
next:
*/
VG_(emit_cmpl_zero_reg) ( reg );
- VG_(emit_jcondshort_delta) ( CondNZ, 5+1 );
+ if (VG_(clo_chain_bb))
+ VG_(emit_jcondshort_delta) ( CondNZ, 5+3+5 );
+ else
+ VG_(emit_jcondshort_delta) ( CondNZ, 5+1 );
synth_jmp_lit ( addr, JmpBoring );
}
@@ -1965,8 +2099,8 @@
there is any chance at all that the code generated for a UInstr
will change the real FPU state.
*/
-static Bool emitUInstr ( UCodeBlock* cb, Int i, RRegSet regs_live_before,
- Bool fplive )
+static Bool emitUInstr ( UCodeBlock* cb, Int i,
+ RRegSet regs_live_before, Bool fplive )
{
Int old_emitted_code_used;
UInstr* u = &cb->instrs[i];
@@ -2466,18 +2600,31 @@
/* Emit x86 for the ucode in cb, returning the address of the
generated code and setting *nbytes to its size. */
-UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes )
+UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes, UShort j[VG_MAX_JUMPS] )
{
Int i;
UChar regs_live_before = 0; /* No regs live at BB start */
Bool fplive;
-
+
emitted_code_used = 0;
emitted_code_size = 500; /* reasonable initial size */
emitted_code = VG_(arena_malloc)(VG_AR_JITTER, emitted_code_size);
+ jumpidx = 0;
if (dis) VG_(printf)("Generated x86 code:\n");
+ /* Generate decl VG_(dispatch_ctr) and drop into dispatch if we hit
+ zero. We have to do this regardless of whether we're t-chaining
+ or not. */
+ VG_(new_emit)();
+ VG_(emitB) (0xFF); /* decl */
+ emit_amode_litmem_reg((Addr)&VG_(dispatch_ctr), 1);
+ if (dis)
+ VG_(printf)("\n\t\tdecl (%p)\n", &VG_(dispatch_ctr));
+ VG_(emit_jcondshort_delta)(CondNZ, 5+1);
+ VG_(emit_movv_lit_reg) ( 4, VG_TRC_INNER_COUNTERZERO, R_EBP );
+ emit_ret();
+
fplive = False;
for (i = 0; i < cb->used; i++) {
UInstr* u = &cb->instrs[i];
@@ -2497,6 +2644,12 @@
if (dis) VG_(printf)("\n");
vg_assert(!fplive); /* FPU state must be saved by end of BB */
+ if (j != NULL) {
+ vg_assert(jumpidx <= VG_MAX_JUMPS);
+ for(i = 0; i < jumpidx; i++)
+ j[i] = jumps[i];
+ }
+
/* Returns a pointer to the emitted code. This will have to be
copied by the caller into the translation cache, and then freed */
*nbytes = emitted_code_used;
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index 5fa88b4..00fa4ba 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -244,6 +244,8 @@
is ignored. Ie if a skin says no, I don't want this to run, that
cannot be overridden from the command line. */
extern Bool VG_(clo_run_libc_freeres);
+/* Use the basic-block chaining optimisation */
+extern Bool VG_(clo_chain_bb);
/* ---------------------------------------------------------------------
@@ -1045,11 +1047,16 @@
Exports of vg_from_ucode.c
------------------------------------------------------------------ */
-extern UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes );
+extern UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes, UShort jumps[VG_MAX_JUMPS] );
extern void VG_(print_ccall_stats) ( void );
extern void VG_(print_UInstr_histogram) ( void );
+extern void VG_(unchain_jumpsite) ( Addr jumpsite );
+extern Addr VG_(get_jmp_dest) ( Addr jumpsite );
+extern Bool VG_(is_unchained_jumpsite) ( Addr jumpsite );
+extern Bool VG_(is_chained_jumpsite) ( Addr jumpsite );
+
/* ---------------------------------------------------------------------
Exports of vg_to_ucode.c
------------------------------------------------------------------ */
@@ -1062,6 +1069,7 @@
/* Expandable arrays of uinstrs. */
struct _UCodeBlock {
+ Addr orig_eip;
Int used;
Int size;
UInstr* instrs;
@@ -1074,7 +1082,8 @@
Addr orig_addr,
UInt* orig_size,
Addr* trans_addr,
- UInt* trans_size );
+ UInt* trans_size,
+ UShort jumps[VG_MAX_JUMPS]);
extern Char* VG_(nameCondcode) ( Condcode cond );
extern Bool VG_(saneUInstr) ( Bool beforeRA, Bool beforeLiveness,
@@ -1369,9 +1378,14 @@
extern UInt VG_(overall_out_count);
extern UInt VG_(overall_out_osize);
extern UInt VG_(overall_out_tsize);
-
/* The number of discards of TT/TC. */
extern UInt VG_(number_of_tc_discards);
+/* Counts of chain and unchain operations done. */
+extern UInt VG_(bb_enchain_count);
+extern UInt VG_(bb_dechain_count);
+/* Number of unchained jumps performed. */
+extern UInt VG_(unchained_jumps_done);
+
/* Counts pertaining to the register allocator. */
@@ -1445,7 +1459,8 @@
extern void VG_(get_tt_tc_used) ( UInt* tt_used, UInt* tc_used );
extern void VG_(add_to_trans_tab) ( Addr orig_addr, Int orig_size,
- Addr trans_addr, Int trans_size );
+ Addr trans_addr, Int trans_size,
+ UShort jumps[VG_MAX_JUMPS]);
extern void VG_(invalidate_translations) ( Addr start, UInt range );
@@ -1482,6 +1497,9 @@
which means we need to defer to the scheduler. */
extern UInt VG_(run_innerloop) ( void );
+/* The patching routine called when a BB wants to chain itself to
+ another. */
+extern UInt VG_(patch_me);
/* ---------------------------------------------------------------------
Exports of vg_helpers.S
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index b9792b4..4fed580 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -403,9 +403,13 @@
UInt VG_(overall_out_count) = 0;
UInt VG_(overall_out_osize) = 0;
UInt VG_(overall_out_tsize) = 0;
-
/* The number of discards of TT/TC. */
UInt VG_(number_of_tc_discards) = 0;
+/* Counts of chain and unchain operations done. */
+UInt VG_(bb_enchain_count) = 0;
+UInt VG_(bb_dechain_count) = 0;
+/* Number of unchained jumps performed. */
+UInt VG_(unchained_jumps_done) = 0;
/* Counts pertaining to the register allocator. */
@@ -468,6 +472,7 @@
Int VG_(clo_backtrace_size) = 4;
Char* VG_(clo_weird_hacks) = NULL;
Bool VG_(clo_run_libc_freeres) = True;
+Bool VG_(clo_chain_bb) = True;
/* This Bool is needed by wrappers in vg_clientmalloc.c to decide how
to behave. Initially we say False. */
@@ -558,6 +563,7 @@
" --single-step=no|yes translate each instr separately? [no]\n"
" --optimise=no|yes improve intermediate code? [yes]\n"
" --profile=no|yes profile? (skin must be built for it) [no]\n"
+" --chain-bb=no|yes do basic-block chaining? [yes]\n"
" --trace-codegen=<XXXXX> show generated code? (X = 0|1) [00000]\n"
" --trace-syscalls=no|yes show all system calls? [no]\n"
" --trace-signals=no|yes show signal handling details? [no]\n"
@@ -833,6 +839,11 @@
else if (STREQ(argv[i], "--profile=no"))
VG_(clo_profile) = False;
+ else if (STREQ(argv[i], "--chain-bb=yes"))
+ VG_(clo_chain_bb) = True;
+ else if (STREQ(argv[i], "--chain-bb=no"))
+ VG_(clo_chain_bb) = False;
+
else if (STREQ(argv[i], "--single-step=yes"))
VG_(clo_single_step) = True;
else if (STREQ(argv[i], "--single-step=no"))
@@ -1174,6 +1185,9 @@
" TT/TC: %d tc sectors discarded.",
VG_(number_of_tc_discards) );
VG_(message)(Vg_DebugMsg,
+ " %d chainings, %d unchainings.",
+ VG_(bb_enchain_count), VG_(bb_dechain_count) );
+ VG_(message)(Vg_DebugMsg,
"translate: new %d (%d -> %d; ratio %d:10)",
VG_(overall_in_count),
VG_(overall_in_osize),
@@ -1186,10 +1200,19 @@
VG_(overall_out_tsize),
safe_idiv(10*VG_(overall_out_tsize), VG_(overall_out_osize)));
VG_(message)(Vg_DebugMsg,
- " dispatch: %lu basic blocks, %d/%d sched events, %d tt_fast misses.",
- VG_(bbs_done), VG_(num_scheduling_events_MAJOR),
+ " dispatch: %lu jumps (bb entries), of which %u (%lu%%) were unchained.",
+ VG_(bbs_done),
+ VG_(unchained_jumps_done),
+ ((ULong)(100) * (ULong)(VG_(unchained_jumps_done)))
+ / ( VG_(bbs_done)==0 ? 1 : VG_(bbs_done) )
+ );
+
+ VG_(message)(Vg_DebugMsg,
+ " %d/%d major/minor sched events. %d tt_fast misses.",
+ VG_(num_scheduling_events_MAJOR),
VG_(num_scheduling_events_MINOR),
VG_(tt_fast_misses));
+
VG_(message)(Vg_DebugMsg,
"reg-alloc: %d t-req-spill, "
"%d+%d orig+spill uis, %d total-reg-r.",
diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c
index f11ed79..4e6561b 100644
--- a/coregrind/vg_scheduler.c
+++ b/coregrind/vg_scheduler.c
@@ -316,12 +316,17 @@
static
void create_translation_for ( ThreadId tid, Addr orig_addr )
{
- Addr trans_addr;
- Int orig_size, trans_size;
+ Addr trans_addr;
+ Int orig_size, trans_size;
+ UShort jumps[VG_MAX_JUMPS];
+ Int i;
+
+ for(i = 0; i < VG_MAX_JUMPS; i++)
+ jumps[i] = (UShort)-1;
/* Make a translation, into temporary storage. */
VG_(translate)( &VG_(threads)[tid],
- orig_addr, &orig_size, &trans_addr, &trans_size );
+ orig_addr, &orig_size, &trans_addr, &trans_size, jumps );
/* Copy data at trans_addr into the translation cache. */
/* Since the .orig_size and .trans_size fields are
@@ -329,7 +334,7 @@
vg_assert(orig_size > 0 && orig_size < 65536);
vg_assert(trans_size > 0 && trans_size < 65536);
- VG_(add_to_trans_tab)( orig_addr, orig_size, trans_addr, trans_size );
+ VG_(add_to_trans_tab)( orig_addr, orig_size, trans_addr, trans_size, jumps );
/* Free the intermediary -- was allocated by VG_(emit_code). */
VG_(arena_free)( VG_AR_JITTER, (void*)trans_addr );
@@ -1579,7 +1584,7 @@
VG_(printf)(
"======vvvvvvvv====== LAST TRANSLATION ======vvvvvvvv======\n");
VG_(translate)( &VG_(threads)[tid],
- VG_(threads)[tid].m_eip, NULL, NULL, NULL );
+ VG_(threads)[tid].m_eip, NULL, NULL, NULL, NULL );
VG_(printf)("\n");
VG_(printf)(
"======^^^^^^^^====== LAST TRANSLATION ======^^^^^^^^======\n");
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 8f5ac3d..6904c56 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -55,6 +55,7 @@
UCodeBlock* VG_(setup_UCodeBlock) ( UCodeBlock* cb_in )
{
UCodeBlock* cb = VG_(arena_malloc)(VG_AR_CORE, sizeof(UCodeBlock));
+ cb->orig_eip = cb_in->orig_eip;
cb->used = cb->size = 0;
cb->nextTemp = cb_in->nextTemp;
cb->instrs = NULL;
@@ -1762,6 +1763,7 @@
/* Resulting code goes here. We generate it all in a forwards
pass. */
c2 = VG_(alloc_UCodeBlock)();
+ c2->orig_eip = c1->orig_eip;
/* At the start, no TempRegs are assigned to any real register.
Correspondingly, all temps claim to be currently resident in
@@ -2017,7 +2019,8 @@
/*IN*/ Addr orig_addr,
/*OUT*/ UInt* orig_size,
/*OUT*/ Addr* trans_addr,
- /*OUT*/ UInt* trans_size )
+ /*OUT*/ UInt* trans_size,
+ /*OUT*/ UShort jumps[VG_MAX_JUMPS])
{
Int n_disassembled_bytes, final_code_size;
Bool debugging_translation;
@@ -2032,6 +2035,7 @@
VG_TRACK( pre_mem_read, Vg_CoreTranslate, tst, "", orig_addr, 1 );
cb = VG_(alloc_UCodeBlock)();
+ cb->orig_eip = orig_addr;
/* If doing any code printing, print a basic block start marker */
if (VG_(clo_trace_codegen)) {
@@ -2088,7 +2092,7 @@
VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(5);
VGP_PUSHCC(VgpFromUcode);
- final_code = VG_(emit_code)(cb, &final_code_size );
+ final_code = VG_(emit_code)(cb, &final_code_size, jumps );
VGP_POPCC(VgpFromUcode);
VG_(free_UCodeBlock)(cb);
diff --git a/coregrind/vg_transtab.c b/coregrind/vg_transtab.c
index 8372d6c..630e48a 100644
--- a/coregrind/vg_transtab.c
+++ b/coregrind/vg_transtab.c
@@ -30,6 +30,7 @@
*/
#include "vg_include.h"
+#include <stddef.h>
/* #define DEBUG_TRANSTAB */
@@ -58,6 +59,10 @@
/*------------------ TYPES ------------------*/
+#define CODE_ALIGNMENT 16 /* alignment of TCEntries */
+#define CODE_ALIGN(a) (((a)+CODE_ALIGNMENT-1) & ~(CODE_ALIGNMENT-1))
+#define IS_ALIGNED(a) (((a) & (CODE_ALIGNMENT-1)) == 0)
+
/* An entry in TC. Payload always is always padded out to a 4-aligned
quantity so that these structs are always word-aligned. */
typedef
@@ -65,7 +70,8 @@
/* +0 */ Addr orig_addr;
/* +4 */ UShort orig_size;
/* +6 */ UShort trans_size;
- /* +8 */ UChar payload[0];
+ /* +8 */ UShort jump_sites[VG_MAX_JUMPS];
+ /* +VG_CODE_OFFSET */ UChar payload[0];
}
TCEntry;
@@ -130,6 +136,50 @@
vg_dispatch.S. */
Addr /* TCEntry*, really */ VG_(tt_fast)[VG_TT_FAST_SIZE];
+static void for_each_tc(Int sector, void (*fn)(TCEntry *));
+
+
+/*------------------ T-CHAINING HELPERS ------------------*/
+
+static
+void for_each_jumpsite(TCEntry *tce, void (*fn)(Addr))
+{
+ Int i;
+ for(i = 0; i < VG_MAX_JUMPS; i++) {
+ Addr a;
+ UShort idx = tce->jump_sites[i];
+
+ if (idx == (UShort)-1)
+ continue;
+
+ a = (Addr)&tce->payload[idx];
+
+ (*fn)(a);
+ }
+}
+
+static inline
+void unchain_tce(TCEntry *tce)
+{
+ for_each_jumpsite(tce, VG_(unchain_jumpsite));
+}
+
+/* Unchain any jumps pointing to a sector we're about to free */
+static
+void unchain_sector(Int s, Addr base, UInt len)
+{
+ void unchain_site(Addr a) {
+ Addr jmp = VG_(get_jmp_dest)(a);
+ if (jmp >= base && jmp < (base+len))
+ VG_(unchain_jumpsite)(a);
+ }
+ void _unchain_tce(TCEntry *tce) {
+ for_each_jumpsite(tce, unchain_site);
+ }
+
+ for_each_tc(s, _unchain_tce);
+}
+
/*------------------ TT HELPERS ------------------*/
@@ -176,6 +226,7 @@
if (i == VG_TT_SIZE)
i = 0;
}
+
vg_tt[i].orig_addr = tce->orig_addr;
vg_tt[i].tcentry = tce;
vg_tt_used++;
@@ -221,24 +272,13 @@
void rebuild_TT ( void )
{
Int s;
- UChar* pc;
- UChar* pc_lim;
- TCEntry* tce;
/* Throw away TT. */
initialise_tt();
/* Rebuild TT from the remaining quarters. */
for (s = 0; s < VG_TC_N_SECTORS; s++) {
- pc = &(vg_tc[s][0]);
- pc_lim = &(vg_tc[s][vg_tc_used[s]]);
- while (True) {
- if (pc >= pc_lim) break;
- tce = (TCEntry*)pc;
- pc += sizeof(TCEntry) + tce->trans_size;
- if (tce->orig_addr != VG_TTE_DELETED)
- add_tt_entry(tce);
- }
+ for_each_tc(s, add_tt_entry);
}
pp_tt_tc_status ( "after rebuild of TC" );
}
@@ -246,6 +286,24 @@
/*------------------ TC HELPERS ------------------*/
+static
+void for_each_tc(Int s, void (*fn)(TCEntry *))
+{
+ UChar *pc;
+ UChar *pc_lim;
+ TCEntry *tce;
+
+ pc = &(vg_tc[s][0]);
+ pc_lim = &(vg_tc[s][vg_tc_used[s]]);
+ while (True) {
+ if (pc >= pc_lim) break;
+ tce = (TCEntry*)pc;
+ pc += sizeof(TCEntry) + tce->trans_size;
+ if (tce->orig_addr != VG_TTE_DELETED)
+ (*fn)(tce);
+ }
+}
+
/* Find the oldest non-NULL, non-empty sector, or -1 if none such. */
static
Int find_oldest_sector ( void )
@@ -274,9 +332,17 @@
Char msg[100];
Int s = find_oldest_sector();
if (s != -1) {
+ Int i;
+
vg_assert(s >= 0 && s < VG_TC_N_SECTORS);
VG_(sprintf)(msg, "before discard of sector %d (%d bytes)",
s, vg_tc_used[s]);
+
+ for(i = 0; i < VG_TC_N_SECTORS; i++) {
+ if (i != s && vg_tc[i] != NULL)
+ unchain_sector(i, (Addr)vg_tc[s], vg_tc_used[s]);
+ }
+
pp_tt_tc_status ( msg );
VG_(overall_out_count) += vg_tc_stats_count[s];
VG_(overall_out_osize) += vg_tc_stats_osize[s];
@@ -331,7 +397,7 @@
{
Int i;
- vg_assert(0 == (nBytes & 3));
+ vg_assert(IS_ALIGNED(nBytes));
/* Ensure the TT is still OK. */
while (vg_tt_used >= VG_TT_LIMIT) {
@@ -421,7 +487,8 @@
pointer, which is inserted here.
*/
void VG_(add_to_trans_tab) ( Addr orig_addr, Int orig_size,
- Addr trans_addr, Int trans_size )
+ Addr trans_addr, Int trans_size,
+ UShort jumps[VG_MAX_JUMPS])
{
Int i, nBytes, trans_size_aligned;
TCEntry* tce;
@@ -431,12 +498,12 @@
tte->trans_addr, tte->trans_size);
*/
+ vg_assert(offsetof(TCEntry, payload) == VG_CODE_OFFSET);
+
/* figure out how many bytes we require. */
- trans_size_aligned = trans_size;
- while ((trans_size_aligned & 3) != 0)
- trans_size_aligned++;
- nBytes = trans_size_aligned + sizeof(TCEntry);
- vg_assert((nBytes & 3) == 0);
+ nBytes = CODE_ALIGN(trans_size + sizeof(TCEntry));
+ trans_size_aligned = nBytes-sizeof(TCEntry);
+ vg_assert(IS_ALIGNED(nBytes));
tce = (TCEntry*)allocate(nBytes);
/* VG_(printf)("allocate returned %p\n", tce); */
@@ -445,10 +512,14 @@
tce->orig_addr = orig_addr;
tce->orig_size = (UShort)orig_size; /* what's the point of storing this? */
tce->trans_size = (UShort)trans_size_aligned;
+ for (i = 0; i < VG_MAX_JUMPS; i++) {
+ tce->jump_sites[i] = jumps[i];
+ }
for (i = 0; i < trans_size; i++) {
tce->payload[i] = ((UChar*)trans_addr)[i];
}
-
+
+ unchain_tce(tce);
add_tt_entry(tce);
/* Update stats. */
@@ -553,6 +624,10 @@
{
Int s;
+ /* Otherwise we wind up with non-32-bit-aligned code in
+ TCEntries. */
+ vg_assert((VG_MAX_JUMPS % 2) == 0);
+
/* Figure out how big each sector should be. */
vg_tc_sector_szB
= (VG_TT_LIMIT /* max TT entries we expect */