- x86 back end: change the code generation convention so that, instead
  of dispatchers CALLing generated code which later RETs, dispatchers
  jump to generated code and it jumps back to the dispatcher.  This
  removes two memory references per translation run and by itself
  gives a measurable performance improvement on P4.  As a result,
  there is new plumbing so that the caller of LibVEX_Translate can
  supply the address of the dispatcher to jump back to (a sketch of
  the new translation epilogue is below).

  This probably breaks all other targets; do not update to this
  revision yet.
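
  A minimal sketch of the new per-translation epilogue, written the
  same way the Xin_Goto case of emit_X86Instr in priv/host-x86/hdefs.c
  builds it.  The standalone helper and its name here are illustrative
  only; the real code is inline in emit_X86Instr, and emit32 is
  hdefs.c's little-endian 32-bit byte emitter:

     /* Tail of every translation: hand the next guest address to the
        dispatcher in %eax and jump back to it.  Replaces the old
        "ret" (0xC3). */
     static UChar* emit_goto_tail ( UChar* p, UInt next_guest_addr,
                                    void* dispatch )
     {
        /* movl $next_guest_addr, %eax */
        *p++ = 0xB8;
        p = emit32(p, next_guest_addr);
        /* movl $dispatch, %edx -- the address supplied by the caller
           of LibVEX_Translate; must be loaded after %eax, since %edx
           may be carrying the value destined for %eax */
        *p++ = 0xBA;
        p = emit32(p, (UInt)dispatch);
        /* jmp *%edx -- back to the dispatcher, no stack traffic */
        *p++ = 0xFF;
        *p++ = 0xE2;
        return p;
     }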

- Administrative cleanup: LibVEX_Translate has an excessive number of
  arguments.  Remove them all and instead pass a single struct through
  which the arguments are supplied (a sketch of the pattern is below).
  Add further comments about the meaning of some fields.
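
  The struct itself is declared in pub/libvex.h; the sketch below only
  illustrates the new calling pattern, and its type, field and symbol
  names are assumptions rather than the actual declaration:

     /* Hypothetical layout -- see pub/libvex.h for the real fields. */
     typedef struct {
        UChar* guest_bytes;       /* guest code to translate */
        Addr64 guest_bytes_addr;  /* guest address of that code */
        UChar* host_bytes;        /* buffer receiving generated code */
        Int    host_bytes_size;
        Int*   host_bytes_used;
        void*  dispatch;          /* dispatcher address which generated
                                     code jumps back to (see above) */
        /* ... the remaining former arguments ... */
     } VexTranslateArgs;

     /* Callers now fill in one struct and pass its address, roughly:

           VexTranslateArgs vta;
           vta.guest_bytes = ...;
           vta.dispatch    = dispatcher_return_point;   (hypothetical)
           tres            = LibVEX_Translate( &vta );

        instead of passing a dozen-plus separate arguments. */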



git-svn-id: svn://svn.valgrind.org/vex/trunk@1494 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/host-x86/hdefs.c b/priv/host-x86/hdefs.c
index 8471492..ca94605 100644
--- a/priv/host-x86/hdefs.c
+++ b/priv/host-x86/hdefs.c
@@ -942,14 +942,16 @@
             vex_printf("if (%%eflags.%s) { ", 
                        showX86CondCode(i->Xin.Goto.cond));
 	 }
-         if (i->Xin.Goto.jk != Ijk_Boring) {
+         if (i->Xin.Goto.jk != Ijk_Boring
+             && i->Xin.Goto.jk != Ijk_Call
+             && i->Xin.Goto.jk != Ijk_Ret) {
             vex_printf("movl $");
             ppIRJumpKind(i->Xin.Goto.jk);
             vex_printf(",%%ebp ; ");
          }
          vex_printf("movl ");
          ppX86RI(i->Xin.Goto.dst);
-         vex_printf(",%%eax ; ret");
+         vex_printf(",%%eax ; movl $dispatcher_addr,%%edx ; jmp *%%edx");
          if (i->Xin.Goto.cond != Xcc_ALWAYS) {
             vex_printf(" }");
 	 }
@@ -1216,8 +1218,13 @@
          return;
       case Xin_Goto:
          addRegUsage_X86RI(u, i->Xin.Goto.dst);
-         addHRegUse(u, HRmWrite, hregX86_EAX());
-         if (i->Xin.Goto.jk != Ijk_Boring)
+         addHRegUse(u, HRmWrite, hregX86_EAX()); /* used for next guest addr */
+         addHRegUse(u, HRmWrite, hregX86_EDX()); /* used for dispatcher addr */
+         if (i->Xin.Goto.jk != Ijk_Boring
+             && i->Xin.Goto.jk != Ijk_Call
+             && i->Xin.Goto.jk != Ijk_Ret)
+            /* note, this is irrelevant since ebp is not actually
+               available to the allocator.  But still .. */
             addHRegUse(u, HRmWrite, hregX86_EBP());
          return;
       case Xin_CMov32:
@@ -1832,7 +1839,8 @@
    Note that buf is not the insn's final place, and therefore it is
    imperative to emit position-independent code. */
 
-Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i, Bool mode64 )
+Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i, 
+                    Bool mode64, void* dispatch )
 {
    UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
 
@@ -2185,20 +2193,30 @@
 
       /* Get the destination address into %eax */
       if (i->Xin.Goto.dst->tag == Xri_Imm) {
-         /* movl $immediate, %eax ; ret */
+         /* movl $immediate, %eax */
          *p++ = 0xB8;
          p = emit32(p, i->Xin.Goto.dst->Xri.Imm.imm32);
       } else {
          vassert(i->Xin.Goto.dst->tag == Xri_Reg);
-         /* movl %reg, %eax ; ret */
+         /* movl %reg, %eax */
          if (i->Xin.Goto.dst->Xri.Reg.reg != hregX86_EAX()) {
             *p++ = 0x89;
             p = doAMode_R(p, i->Xin.Goto.dst->Xri.Reg.reg, hregX86_EAX());
          }
       }
 
-      /* ret */
-      *p++ = 0xC3;
+      /* Get the dispatcher address into %edx.  This has to happen
+         after the load of %eax since %edx might be carrying the value
+         destined for %eax immediately prior to this Xin_Goto. */
+      vassert(sizeof(UInt) == sizeof(void*));
+      vassert(dispatch != NULL);
+      /* movl $imm32, %edx */
+      *p++ = 0xBA;
+      p = emit32(p, (UInt)dispatch);
+
+      /* jmp *%edx */
+      *p++ = 0xFF;
+      *p++ = 0xE2;
 
       /* Fix up the conditional jump, if there was one. */
       if (i->Xin.Goto.cond != Xcc_ALWAYS) {