Merge branches/TCHAIN from r2271 (its creation point) into trunk.

This changes the disassembler interface in preparation for
translation chaining: the put_IP parameter is removed from the
disInstr_* functions, each instruction's generated IR must now end
with an explicit PUT to the guest IP, DisResult gains a jk_StopHere
field, IRStmt_Exit and bb_to_IR carry the guest IP's state offset,
and the AMD64/ARM guest states gain host_EvC_FAILADDR and
host_EvC_COUNTER fields.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2296 8f6e269a-dfd6-0310-a8e1-e2731360e62c
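
In outline, at every block-ending site the change looks like this
(a condensed before/after drawn from the hunks below, not itself
part of the patch):

   /* before: the disassembler wrote the IRSB fields directly */
   irsb->next     = mkU64(d64);
   irsb->jumpkind = kind;
   dres->whatNext = Dis_StopHere;

   /* after: record the jump kind in DisResult, and end the
      instruction's IR with a mandatory write of the next address
      to the guest IP (OFFB_RIP here; OFFB_R15T on ARM/Thumb) */
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = kind;
   stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );

bb_to_IR later closes the superblock with next = GET(guest_IP) and
jumpkind = dres.jk_StopHere, leaving iropt to fold the redundant
PUT/GET pair.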
diff --git a/priv/guest_amd64_defs.h b/priv/guest_amd64_defs.h
index 6038ded..93b6d12 100644
--- a/priv/guest_amd64_defs.h
+++ b/priv/guest_amd64_defs.h
@@ -47,7 +47,6 @@
    bb_to_IR.h. */
 extern
 DisResult disInstr_AMD64 ( IRSB*        irbb,
-                           Bool         put_IP,
                            Bool         (*resteerOkFn) ( void*, Addr64 ),
                            Bool         resteerCisOk,
                            void*        callback_opaque,
diff --git a/priv/guest_amd64_helpers.c b/priv/guest_amd64_helpers.c
index 022edaa..80b5a74 100644
--- a/priv/guest_amd64_helpers.c
+++ b/priv/guest_amd64_helpers.c
@@ -3452,6 +3452,10 @@
 /* VISIBLE TO LIBVEX CLIENT */
 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
 {
+   vex_state->host_EvC_FAILADDR = 0;
+   vex_state->host_EvC_COUNTER = 0;
+   vex_state->pad0 = 0;
+
    vex_state->guest_RAX = 0;
    vex_state->guest_RCX = 0;
    vex_state->guest_RDX = 0;
@@ -3522,7 +3526,7 @@
    vex_state->guest_GS_0x60  = 0;
 
    vex_state->guest_IP_AT_SYSCALL = 0;
-   /* vex_state->padding = 0; */
+   vex_state->pad1 = 0;
 }
 
 
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index b94d0b6..ab79312 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -1511,7 +1511,8 @@
             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                    mkexpr(oldTmp), mkexpr(expTmp) ),
             Ijk_Boring, /*Ijk_NoRedir*/
-            IRConst_U64( restart_point ) 
+            IRConst_U64( restart_point ),
+            OFFB_RIP
          ));
 }
 
@@ -2091,36 +2092,55 @@
 /*--- JMP helpers                                          ---*/
 /*------------------------------------------------------------*/
 
-static void jmp_lit( IRJumpKind kind, Addr64 d64 )
+static void jmp_lit( /*MOD*/DisResult* dres,
+                     IRJumpKind kind, Addr64 d64 )
 {
-   irsb->next     = mkU64(d64);
-   irsb->jumpkind = kind;
+   vassert(dres->whatNext    == Dis_Continue);
+   vassert(dres->len         == 0);
+   vassert(dres->continueAt  == 0);
+   vassert(dres->jk_StopHere == Ijk_INVALID);
+   dres->whatNext    = Dis_StopHere;
+   dres->jk_StopHere = kind;
+   stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
 }
 
-static void jmp_treg( IRJumpKind kind, IRTemp t )
+static void jmp_treg( /*MOD*/DisResult* dres,
+                      IRJumpKind kind, IRTemp t )
 {
-   irsb->next     = mkexpr(t);
-   irsb->jumpkind = kind;
+   vassert(dres->whatNext    == Dis_Continue);
+   vassert(dres->len         == 0);
+   vassert(dres->continueAt  == 0);
+   vassert(dres->jk_StopHere == Ijk_INVALID);
+   dres->whatNext    = Dis_StopHere;
+   dres->jk_StopHere = kind;
+   stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
 }
 
 static 
-void jcc_01 ( AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
+void jcc_01 ( /*MOD*/DisResult* dres,
+              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
 {
    Bool          invert;
    AMD64Condcode condPos;
+   vassert(dres->whatNext    == Dis_Continue);
+   vassert(dres->len         == 0);
+   vassert(dres->continueAt  == 0);
+   vassert(dres->jk_StopHere == Ijk_INVALID);
+   dres->whatNext    = Dis_StopHere;
+   dres->jk_StopHere = Ijk_Boring;
    condPos = positiveIse_AMD64Condcode ( cond, &invert );
    if (invert) {
       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                          Ijk_Boring,
-                         IRConst_U64(d64_false) ) );
-      irsb->next     = mkU64(d64_true);
-      irsb->jumpkind = Ijk_Boring;
+                         IRConst_U64(d64_false),
+                         OFFB_RIP ) );
+      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
    } else {
       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                          Ijk_Boring,
-                         IRConst_U64(d64_true) ) );
-      irsb->next     = mkU64(d64_false);
-      irsb->jumpkind = Ijk_Boring;
+                         IRConst_U64(d64_true),
+                         OFFB_RIP ) );
+      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
    }
 }
 
@@ -3966,7 +3986,7 @@
 static
 ULong dis_Grp5 ( VexAbiInfo* vbi,
                  Prefix pfx, Int sz, Long delta,
-                 DisResult* dres, Bool* decode_OK )
+                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
 {
    Int     len;
    UChar   modrm;
@@ -4009,8 +4029,8 @@
             putIReg64(R_RSP, mkexpr(t2));
             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
             make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
-            jmp_treg(Ijk_Call,t3);
-            dres->whatNext = Dis_StopHere;
+            jmp_treg(dres, Ijk_Call, t3);
+            vassert(dres->whatNext == Dis_StopHere);
             showSz = False;
             break;
          case 4: /* jmp Ev */
@@ -4019,8 +4039,8 @@
             sz = 8;
             t3 = newTemp(Ity_I64);
             assign(t3, getIRegE(sz,pfx,modrm));
-            jmp_treg(Ijk_Boring,t3);
-            dres->whatNext = Dis_StopHere;
+            jmp_treg(dres, Ijk_Boring, t3);
+            vassert(dres->whatNext == Dis_StopHere);
             showSz = False;
             break;
          default: 
@@ -4073,8 +4093,8 @@
             putIReg64(R_RSP, mkexpr(t2));
             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
             make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
-            jmp_treg(Ijk_Call,t3);
-            dres->whatNext = Dis_StopHere;
+            jmp_treg(dres, Ijk_Call, t3);
+            vassert(dres->whatNext == Dis_StopHere);
             showSz = False;
             break;
          case 4: /* JMP Ev */
@@ -4083,8 +4103,8 @@
             sz = 8;
             t3 = newTemp(Ity_I64);
             assign(t3, loadLE(Ity_I64,mkexpr(addr)));
-            jmp_treg(Ijk_Boring,t3);
-            dres->whatNext = Dis_StopHere;
+            jmp_treg(dres, Ijk_Boring, t3);
+            vassert(dres->whatNext == Dis_StopHere);
             showSz = False;
             break;
          case 6: /* PUSH Ev */
@@ -4287,7 +4307,8 @@
    the insn is the last one in the basic block, and so emit a jump to
    the next insn, rather than just falling through. */
 static 
-void dis_REP_op ( AMD64Condcode cond,
+void dis_REP_op ( /*MOD*/DisResult* dres,
+                  AMD64Condcode cond,
                   void (*dis_OP)(Int, IRTemp, Prefix),
                   Int sz, Addr64 rip, Addr64 rip_next, HChar* name,
                   Prefix pfx )
@@ -4310,7 +4331,8 @@
       cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
    }
 
-   stmt( IRStmt_Exit( cmp, Ijk_Boring, IRConst_U64(rip_next) ) );
+   stmt( IRStmt_Exit( cmp, Ijk_Boring,
+                      IRConst_U64(rip_next), OFFB_RIP ) );
 
    if (haveASO(pfx))
       putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
@@ -4321,12 +4343,15 @@
    dis_OP (sz, t_inc, pfx);
 
    if (cond == AMD64CondAlways) {
-      jmp_lit(Ijk_Boring,rip);
+      jmp_lit(dres, Ijk_Boring, rip);
+      vassert(dres->whatNext == Dis_StopHere);
    } else {
       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
                          Ijk_Boring,
-                         IRConst_U64(rip) ) );
-      jmp_lit(Ijk_Boring,rip_next);
+                         IRConst_U64(rip),
+                         OFFB_RIP ) );
+      jmp_lit(dres, Ijk_Boring, rip_next);
+      vassert(dres->whatNext == Dis_StopHere);
    }
    DIP("%s%c\n", name, nameISize(sz));
 }
@@ -5130,7 +5155,8 @@
                   IRStmt_Exit(
                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                      Ijk_EmWarn,
-                     IRConst_U64( guest_RIP_bbstart+delta )
+                     IRConst_U64( guest_RIP_bbstart+delta ),
+                     OFFB_RIP
                   )
                );
 
@@ -5172,7 +5198,8 @@
                   IRStmt_Exit(
                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                      Ijk_EmWarn,
-                     IRConst_U64( guest_RIP_bbstart+delta )
+                     IRConst_U64( guest_RIP_bbstart+delta ),
+                     OFFB_RIP
                   )
                );
                break;
@@ -6108,7 +6135,8 @@
                   IRStmt_Exit(
                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                      Ijk_EmWarn,
-                     IRConst_U64( guest_RIP_bbstart+delta )
+                     IRConst_U64( guest_RIP_bbstart+delta ),
+                     OFFB_RIP
                   )
                );
 
@@ -8143,7 +8171,7 @@
 //.. }
 
 static
-void dis_ret ( VexAbiInfo* vbi, ULong d64 )
+void dis_ret ( /*MOD*/DisResult* dres, VexAbiInfo* vbi, ULong d64 )
 {
    IRTemp t1 = newTemp(Ity_I64); 
    IRTemp t2 = newTemp(Ity_I64);
@@ -8153,7 +8181,8 @@
    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
    putIReg64(R_RSP, mkexpr(t3));
    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
-   jmp_treg(Ijk_Ret,t2);
+   jmp_treg(dres, Ijk_Ret, t2);
+   vassert(dres->whatNext == Dis_StopHere);
 }
 
 
@@ -8964,7 +8993,8 @@
                binop(Iop_And64,mkexpr(effective_addr),mkU64(0xF)),
                mkU64(0)),
          Ijk_SigSEGV,
-         IRConst_U64(guest_RIP_curr_instr)
+         IRConst_U64(guest_RIP_curr_instr),
+         OFFB_RIP
       )
    );
 }
@@ -11452,10 +11482,8 @@
                          mkU64( ~(lineszB-1) ))) );
 
          stmt( IRStmt_Put(OFFB_TILEN, mkU64(lineszB) ) );
- 
-         irsb->jumpkind = Ijk_TInval;
-         irsb->next     = mkU64(guest_RIP_bbstart+delta);
-         dres->whatNext = Dis_StopHere;
+
+         jmp_lit(dres, Ijk_TInval, (Addr64)(guest_RIP_bbstart+delta));
 
          DIP("clflush %s\n", dis_buf);
          goto decode_success;
@@ -11527,7 +11555,8 @@
             IRStmt_Exit(
                binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
                Ijk_EmWarn,
-               IRConst_U64(guest_RIP_bbstart+delta)
+               IRConst_U64(guest_RIP_bbstart+delta),
+               OFFB_RIP
             )
          );
          goto decode_success;
@@ -16954,7 +16983,8 @@
                   mk_amd64g_calculate_condition(
                      (AMD64Condcode)(1 ^ (opc - 0x70))),
                   Ijk_Boring,
-                  IRConst_U64(guest_RIP_bbstart+delta) ) );
+                  IRConst_U64(guest_RIP_bbstart+delta),
+                  OFFB_RIP ) );
          dres->whatNext   = Dis_ResteerC;
          dres->continueAt = d64;
          comment = "(assumed taken)";
@@ -16972,7 +17002,8 @@
          stmt( IRStmt_Exit( 
                   mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
                   Ijk_Boring,
-                  IRConst_U64(d64) ) );
+                  IRConst_U64(d64),
+                  OFFB_RIP ) );
          dres->whatNext   = Dis_ResteerC;
          dres->continueAt = guest_RIP_bbstart+delta;
          comment = "(assumed not taken)";
@@ -16980,10 +17011,9 @@
       else {
          /* Conservative default translation - end the block at this
             point. */
-         jcc_01( (AMD64Condcode)(opc - 0x70), 
-                 guest_RIP_bbstart+delta,
-                 d64 );
-         dres->whatNext = Dis_StopHere;
+         jcc_01( dres, (AMD64Condcode)(opc - 0x70),
+                 guest_RIP_bbstart+delta, d64 );
+         vassert(dres->whatNext == Dis_StopHere);
       }
       DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
       return delta;
@@ -17154,8 +17184,8 @@
          DIP("rep nop (P4 pause)\n");
          /* "observe" the hint.  The Vex client needs to be careful not
             to cause very long delays as a result, though. */
-         jmp_lit(Ijk_Yield, guest_RIP_bbstart+delta);
-         dres->whatNext = Dis_StopHere;
+         jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
+         vassert(dres->whatNext == Dis_StopHere);
          return delta;
       }
       /* detect and handle NOPs specially */
@@ -17393,7 +17423,7 @@
       if (haveF3(pfx) && !haveF2(pfx)) {
          if (opc == 0xA4)
             sz = 1;
-         dis_REP_op ( AMD64CondAlways, dis_MOVS, sz,
+         dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
                       guest_RIP_curr_instr,
                       guest_RIP_bbstart+delta, "rep movs", pfx );
-        dres->whatNext = Dis_StopHere;
+         vassert(dres->whatNext == Dis_StopHere);
@@ -17414,7 +17444,7 @@
       if (haveF3(pfx) && !haveF2(pfx)) {
          if (opc == 0xA6)
             sz = 1;
-         dis_REP_op ( AMD64CondZ, dis_CMPS, sz, 
+         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz, 
                       guest_RIP_curr_instr,
                       guest_RIP_bbstart+delta, "repe cmps", pfx );
-         dres->whatNext = Dis_StopHere;
+         vassert(dres->whatNext == Dis_StopHere);
@@ -17428,11 +17458,11 @@
       if (haveF3(pfx) && !haveF2(pfx)) {
          if (opc == 0xAA)
             sz = 1;
-         dis_REP_op ( AMD64CondAlways, dis_STOS, sz,
+         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
                       guest_RIP_curr_instr,
                       guest_RIP_bbstart+delta, "rep stos", pfx );
-        dres->whatNext = Dis_StopHere;
-        return delta;
+         vassert(dres->whatNext == Dis_StopHere);
+         return delta;
       }
       /* AA/AB: stosb/stos{w,l,q} */
       if (!haveF3(pfx) && !haveF2(pfx)) {
@@ -17463,20 +17493,20 @@
       if (haveF2(pfx) && !haveF3(pfx)) {
          if (opc == 0xAE)
             sz = 1;
-         dis_REP_op ( AMD64CondNZ, dis_SCAS, sz, 
+         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz, 
                       guest_RIP_curr_instr,
                       guest_RIP_bbstart+delta, "repne scas", pfx );
-         dres->whatNext = Dis_StopHere;
+         vassert(dres->whatNext == Dis_StopHere);
          return delta;
       }
       /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
       if (!haveF2(pfx) && haveF3(pfx)) {
          if (opc == 0xAE)
             sz = 1;
-         dis_REP_op ( AMD64CondZ, dis_SCAS, sz, 
+         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz, 
                       guest_RIP_curr_instr,
                       guest_RIP_bbstart+delta, "repe scas", pfx );
-         dres->whatNext = Dis_StopHere;
+         vassert(dres->whatNext == Dis_StopHere);
          return delta;
       }
       /* AE/AF: scasb/scas{w,l,q} */
@@ -17563,16 +17593,14 @@
       if (have66orF2orF3(pfx)) goto decode_failure;
       d64 = getUDisp16(delta); 
       delta += 2;
-      dis_ret(vbi, d64);
-      dres->whatNext = Dis_StopHere;
+      dis_ret(dres, vbi, d64);
       DIP("ret $%lld\n", d64);
       return delta;
 
    case 0xC3: /* RET */
       if (have66orF2(pfx)) goto decode_failure;
       /* F3 is acceptable on AMD. */
-      dis_ret(vbi, 0);
-      dres->whatNext = Dis_StopHere;
+      dis_ret(dres, vbi, 0);
       DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
       return delta;
 
@@ -17655,8 +17683,8 @@
       return delta;
 
    case 0xCC: /* INT 3 */
-      jmp_lit(Ijk_SigTRAP, guest_RIP_bbstart + delta);
-      dres->whatNext = Dis_StopHere;
+      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
+      vassert(dres->whatNext == Dis_StopHere);
       DIP("int $0x3\n");
       return delta;
 
@@ -17808,7 +17836,7 @@
          default:
 	    vassert(0);
       }
-      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64)) );
+      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );
 
       DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
       return delta;
@@ -17822,20 +17850,22 @@
       if (haveASO(pfx)) {
          /* 32-bit */
          stmt( IRStmt_Exit( binop(Iop_CmpEQ64, 
-                            unop(Iop_32Uto64, getIReg32(R_RCX)), 
-                            mkU64(0)),
-               Ijk_Boring,
-               IRConst_U64(d64)) 
-             );
+                                  unop(Iop_32Uto64, getIReg32(R_RCX)), 
+                                  mkU64(0)),
+                            Ijk_Boring,
+                            IRConst_U64(d64),
+                            OFFB_RIP
+             ));
          DIP("jecxz 0x%llx\n", d64);
       } else {
          /* 64-bit */
          stmt( IRStmt_Exit( binop(Iop_CmpEQ64, 
                                   getIReg64(R_RCX), 
                                   mkU64(0)),
-               Ijk_Boring,
-               IRConst_U64(d64)) 
-             );
+                            Ijk_Boring,
+                            IRConst_U64(d64),
+                            OFFB_RIP
+               ));
          DIP("jrcxz 0x%llx\n", d64);
       }
       return delta;
@@ -17953,8 +17983,8 @@
          dres->whatNext   = Dis_ResteerU;
          dres->continueAt = d64;
       } else {
-         jmp_lit(Ijk_Call,d64);
-         dres->whatNext = Dis_StopHere;
+         jmp_lit(dres, Ijk_Call, d64);
+         vassert(dres->whatNext == Dis_StopHere);
       }
       DIP("call 0x%llx\n",d64);
       return delta;
@@ -17969,8 +17999,8 @@
          dres->whatNext   = Dis_ResteerU;
          dres->continueAt = d64;
       } else {
-         jmp_lit(Ijk_Boring,d64);
-         dres->whatNext = Dis_StopHere;
+         jmp_lit(dres, Ijk_Boring, d64);
+         vassert(dres->whatNext == Dis_StopHere);
       }
       DIP("jmp 0x%llx\n", d64);
       return delta;
@@ -17985,8 +18015,8 @@
          dres->whatNext   = Dis_ResteerU;
          dres->continueAt = d64;
       } else {
-         jmp_lit(Ijk_Boring,d64);
-         dres->whatNext = Dis_StopHere;
+         jmp_lit(dres, Ijk_Boring, d64);
+         vassert(dres->whatNext == Dis_StopHere);
       }
       DIP("jmp-8 0x%llx\n", d64);
       return delta;
@@ -18153,8 +18183,8 @@
       /* It's important that all guest state is up-to-date
          at this point.  So we declare an end-of-block here, which
          forces any cached guest state to be flushed. */
-      jmp_lit(Ijk_Sys_syscall, guest_RIP_next_assumed);
-      dres->whatNext = Dis_StopHere;
+      jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
+      vassert(dres->whatNext == Dis_StopHere);
       DIP("syscall\n");
       return delta;
 
@@ -18243,7 +18273,9 @@
                   mk_amd64g_calculate_condition(
                      (AMD64Condcode)(1 ^ (opc - 0x80))),
                   Ijk_Boring,
-                  IRConst_U64(guest_RIP_bbstart+delta) ) );
+                  IRConst_U64(guest_RIP_bbstart+delta),
+                  OFFB_RIP
+             ));
          dres->whatNext   = Dis_ResteerC;
          dres->continueAt = d64;
          comment = "(assumed taken)";
@@ -18262,7 +18294,9 @@
                   mk_amd64g_calculate_condition((AMD64Condcode)
                                                 (opc - 0x80)),
                   Ijk_Boring,
-                  IRConst_U64(d64) ) );
+                  IRConst_U64(d64),
+                  OFFB_RIP
+             ));
          dres->whatNext   = Dis_ResteerC;
          dres->continueAt = guest_RIP_bbstart+delta;
          comment = "(assumed not taken)";
@@ -18270,10 +18304,9 @@
       else {
          /* Conservative default translation - end the block at
             this point. */
-         jcc_01( (AMD64Condcode)(opc - 0x80), 
-                 guest_RIP_bbstart+delta,
-                 d64 );
-         dres->whatNext = Dis_StopHere;
+         jcc_01( dres, (AMD64Condcode)(opc - 0x80),
+                 guest_RIP_bbstart+delta, d64 );
+         vassert(dres->whatNext == Dis_StopHere);
       }
       DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
       return delta;
@@ -18985,7 +19018,6 @@
 static
 DisResult disInstr_AMD64_WRK ( 
              /*OUT*/Bool* expect_CAS,
-             Bool         put_IP,
              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
              Bool         resteerCisOk,
              void*        callback_opaque,
@@ -19015,10 +19047,10 @@
    Prefix pfx = PFX_EMPTY;
 
    /* Set result defaults. */
-   dres.whatNext   = Dis_Continue;
-   dres.len        = 0;
-   dres.continueAt = 0;
-
+   dres.whatNext    = Dis_Continue;
+   dres.len         = 0;
+   dres.continueAt  = 0;
+   dres.jk_StopHere = Ijk_INVALID;
    *expect_CAS = False;
 
    vassert(guest_RIP_next_assumed == 0);
@@ -19028,10 +19060,6 @@
 
    DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
 
-   /* We may be asked to update the guest RIP before going further. */
-   if (put_IP)
-      stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr)) );
-
    /* Spot "Special" instructions (see comment at top of file). */
    {
       UChar* code = (UChar*)(guest_code + delta);
@@ -19055,8 +19083,8 @@
             /* %RDX = client_request ( %RAX ) */
             DIP("%%rdx = client_request ( %%rax )\n");
             delta += 19;
-            jmp_lit(Ijk_ClientReq, guest_RIP_bbstart+delta);
-            dres.whatNext = Dis_StopHere;
+            jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
+            vassert(dres.whatNext == Dis_StopHere);
             goto decode_success;
          }
          else
@@ -19080,8 +19108,8 @@
             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
             putIReg64(R_RSP, mkexpr(t2));
             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
-            jmp_treg(Ijk_NoRedir,t1);
-            dres.whatNext = Dis_StopHere;
+            jmp_treg(&dres, Ijk_NoRedir, t1);
+            vassert(dres.whatNext == Dis_StopHere);
             goto decode_success;
          }
          /* We don't know what it is. */
@@ -19309,7 +19337,7 @@
       /* It's important that all ArchRegs carry their up-to-date value
          at this point.  So we declare an end-of-block here, which
          forces any TempRegs caching ArchRegs to be flushed. */
-      dres.whatNext = Dis_StopHere;
+      vassert(dres.whatNext == Dis_StopHere);
       DIP("int $0x%02x\n", (UInt)d64);
       break;
    }
@@ -19452,9 +19480,9 @@
       insn, but nevertheless be paranoid and update it again right
       now. */
    stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
-   jmp_lit(Ijk_NoDecode, guest_RIP_curr_instr);
-   dres.whatNext = Dis_StopHere;
-   dres.len      = 0;
+   jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
+   vassert(dres.whatNext == Dis_StopHere);
+   dres.len = 0;
    /* We also need to say that a CAS is not expected now, regardless
       of what it might have been set to at the start of the function,
       since the IR that we've emitted just above (to synthesis a
@@ -19467,6 +19495,20 @@
 
   decode_success:
    /* All decode successes end up here. */
+   switch (dres.whatNext) {
+      case Dis_Continue:
+         stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
+         break;
+      case Dis_ResteerU:
+      case Dis_ResteerC:
+         stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
+         break;
+      case Dis_StopHere:
+         break;
+      default:
+         vassert(0);
+   }
+
    DIP("\n");
    dres.len = (Int)toUInt(delta - delta_start);
    return dres;
@@ -19484,7 +19526,6 @@
    is located in host memory at &guest_code[delta]. */
 
 DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
-                           Bool         put_IP,
                            Bool         (*resteerOkFn) ( void*, Addr64 ),
                            Bool         resteerCisOk,
                            void*        callback_opaque,
@@ -19514,7 +19555,7 @@
 
    x1 = irsb_IN->stmts_used;
    expect_CAS = False;
-   dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
+   dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
                                resteerCisOk,
                                callback_opaque,
                                delta, archinfo, abiinfo );
@@ -19547,7 +19588,7 @@
       /* inconsistency detected.  re-disassemble the instruction so as
          to generate a useful error message; then assert. */
       vex_traceflags |= VEX_TRACE_FE;
-      dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
+      dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
                                   resteerCisOk,
                                   callback_opaque,
                                   delta, archinfo, abiinfo );
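
To see the new scheme end-to-end: a block ending in "jmp 0x123456"
used to put the target straight into irsb->next; with the helpers
above, the IR sketch (before optimisation) is instead

   PUT(OFFB_RIP) = 0x123456:I64        /* emitted by jmp_lit */
   goto {Boring} GET:I64(OFFB_RIP)     /* filled in by bb_to_IR */

iropt's redundant-GET removal then reduces the exit to
"goto {Boring} 0x123456:I64"; and since an IRSB exit is now defined
to update the guest IP, the chain_me stubs can find the chain-to
address in the guest state.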
diff --git a/priv/guest_arm_defs.h b/priv/guest_arm_defs.h
index be6dd1c..a225240 100644
--- a/priv/guest_arm_defs.h
+++ b/priv/guest_arm_defs.h
@@ -41,7 +41,6 @@
    bb_to_IR.h. */
 extern
 DisResult disInstr_ARM ( IRSB*        irbb,
-                         Bool         put_IP,
                          Bool         (*resteerOkFn) ( void*, Addr64 ),
                          Bool         resteerCisOk,
                          void*        callback_opaque,
@@ -157,7 +156,7 @@
    OP                DEP1              DEP2              DEP3
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-   OP_COPY           current NZCV      unused            unused
+   OP_COPY           curr_NZCV:28x0    unused            unused
    OP_ADD            argL              argR              unused
    OP_SUB            argL              argR              unused
    OP_ADC            argL              argR              31x0:old_C
diff --git a/priv/guest_arm_helpers.c b/priv/guest_arm_helpers.c
index a545e34..122d034 100644
--- a/priv/guest_arm_helpers.c
+++ b/priv/guest_arm_helpers.c
@@ -697,6 +697,18 @@
                            mkU32(1)));
       }
 
+      /*---------------- COPY ----------------*/
+
+      if (isU32(cond_n_op, (ARMCondNE << 4) | ARMG_CC_OP_COPY)) {
+         /* NE after COPY --> ((cc_dep1 >> ARMG_CC_SHIFT_Z) ^ 1) & 1 */
+         return binop(Iop_And32,
+                      binop(Iop_Xor32,
+                            binop(Iop_Shr32, cc_dep1,
+                                             mkU8(ARMG_CC_SHIFT_Z)),
+                            mkU32(1)),
+                      mkU32(1));
+      }
+
       /*----------------- AL -----------------*/
 
       /* A critically important case for Thumb code.
@@ -937,6 +949,9 @@
 /* VISIBLE TO LIBVEX CLIENT */
 void LibVEX_GuestARM_initialise ( /*OUT*/VexGuestARMState* vex_state )
 {
+   vex_state->host_EvC_FAILADDR = 0;
+   vex_state->host_EvC_COUNTER = 0;
+
    vex_state->guest_R0  = 0;
    vex_state->guest_R1  = 0;
    vex_state->guest_R2  = 0;
@@ -1014,8 +1029,6 @@
    vex_state->guest_ITSTATE = 0;
 
    vex_state->padding1 = 0;
-   vex_state->padding2 = 0;
-   vex_state->padding3 = 0;
 }
 
 
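The new COPY/NE folding above reduces a condition query on an
OP_COPY thunk to a single bit test: after OP_COPY, cc_dep1 holds the
saved NZCV bits at their ARMG_CC_SHIFT_* positions, so NE is just
the inverted Z bit.  A plain-C restatement of the IR tree returned
(the function name here is invented for illustration):

   /* 1 iff the saved Z flag is clear, i.e. "not equal" */
   static UInt ne_after_copy ( UInt cc_dep1 )
   {
      return ((cc_dep1 >> ARMG_CC_SHIFT_Z) ^ 1) & 1;
   }
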
diff --git a/priv/guest_arm_toIR.c b/priv/guest_arm_toIR.c
index 5935b1e..dcf1787 100644
--- a/priv/guest_arm_toIR.c
+++ b/priv/guest_arm_toIR.c
@@ -1398,7 +1398,8 @@
    stmt( IRStmt_Exit(
             unop(Iop_Not1, unop(Iop_32to1, mkexpr(guardT))),
             Ijk_Boring,
-            IRConst_U32(toUInt(guest_R15_curr_instr_notENC + 4))
+            IRConst_U32(toUInt(guest_R15_curr_instr_notENC + 4)),
+            OFFB_R15T
        ));
 }
 
@@ -1414,7 +1415,8 @@
    stmt( IRStmt_Exit(
             unop(Iop_Not1, unop(Iop_32to1, mkexpr(guardT))),
             Ijk_Boring,
-            IRConst_U32(toUInt((guest_R15_curr_instr_notENC + 2) | 1))
+            IRConst_U32(toUInt((guest_R15_curr_instr_notENC + 2) | 1)),
+            OFFB_R15T
        ));
 }
 
@@ -1431,7 +1433,8 @@
    stmt( IRStmt_Exit(
             unop(Iop_Not1, unop(Iop_32to1, mkexpr(guardT))),
             Ijk_Boring,
-            IRConst_U32(toUInt((guest_R15_curr_instr_notENC + 4) | 1))
+            IRConst_U32(toUInt((guest_R15_curr_instr_notENC + 4) | 1)),
+            OFFB_R15T
        ));
 }
 
@@ -1448,7 +1451,8 @@
       IRStmt_Exit(
          binop(Iop_CmpNE32, mkexpr(t), mkU32(0)),
          Ijk_NoDecode,
-         IRConst_U32(toUInt(guest_R15_curr_instr_notENC | 1))
+         IRConst_U32(toUInt(guest_R15_curr_instr_notENC | 1)),
+         OFFB_R15T
       )
    );
 }
@@ -11962,9 +11966,9 @@
       UInt dst = guest_R15_curr_instr_notENC + 8 + (simm24 | 1);
       putIRegA( 14, mkU32(guest_R15_curr_instr_notENC + 4),
                     IRTemp_INVALID/*because AL*/, Ijk_Boring );
-      irsb->next     = mkU32(dst);
-      irsb->jumpkind = Ijk_Call;
-      dres->whatNext = Dis_StopHere;
+      llPutIReg(15, mkU32(dst));
+      dres->jk_StopHere = Ijk_Call;
+      dres->whatNext    = Dis_StopHere;
       DIP("blx 0x%x (and switch to Thumb mode)\n", dst - 1);
       return True;
    }
@@ -12040,7 +12044,6 @@
 
 static
 DisResult disInstr_ARM_WRK (
-             Bool         put_IP,
              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
              Bool         resteerCisOk,
              void*        callback_opaque,
@@ -12066,9 +12069,10 @@
    // etc etc
 
    /* Set result defaults. */
-   dres.whatNext   = Dis_Continue;
-   dres.len        = 4;
-   dres.continueAt = 0;
+   dres.whatNext    = Dis_Continue;
+   dres.len         = 4;
+   dres.continueAt  = 0;
+   dres.jk_StopHere = Ijk_INVALID;
 
    /* Set default actions for post-insn handling of writes to r15, if
       required. */
@@ -12085,11 +12089,7 @@
 
    DIP("\t(arm) 0x%x:  ", (UInt)guest_R15_curr_instr_notENC);
 
-   /* We may be asked to update the guest R15 before going further. */
    vassert(0 == (guest_R15_curr_instr_notENC & 3));
-   if (put_IP) {
-      llPutIReg( 15, mkU32(guest_R15_curr_instr_notENC) );
-   }
 
    /* ----------------------------------------------------------- */
 
@@ -12116,9 +12116,9 @@
                                                /* orr r10,r10,r10 */) {
             /* R3 = client_request ( R4 ) */
             DIP("r3 = client_request ( %%r4 )\n");
-            irsb->next     = mkU32( guest_R15_curr_instr_notENC + 20 );
-            irsb->jumpkind = Ijk_ClientReq;
-            dres.whatNext  = Dis_StopHere;
+            llPutIReg(15, mkU32( guest_R15_curr_instr_notENC + 20 ));
+            dres.jk_StopHere = Ijk_ClientReq;
+            dres.whatNext    = Dis_StopHere;
             goto decode_success;
          }
          else
@@ -12136,9 +12136,9 @@
             /*  branch-and-link-to-noredir R4 */
             DIP("branch-and-link-to-noredir r4\n");
             llPutIReg(14, mkU32( guest_R15_curr_instr_notENC + 20) );
-            irsb->next     = llGetIReg(4);
-            irsb->jumpkind = Ijk_NoRedir;
-            dres.whatNext  = Dis_StopHere;
+            llPutIReg(15, llGetIReg(4));
+            dres.jk_StopHere = Ijk_NoRedir;
+            dres.whatNext    = Dis_StopHere;
             goto decode_success;
          }
          /* We don't know what it is.  Set opc1/opc2 so decode_failure
@@ -12977,9 +12977,9 @@
             dres.continueAt = (Addr64)dst;
          } else {
             /* no; terminate the SB at this point. */
-            irsb->next     = mkU32(dst);
-            irsb->jumpkind = jk;
-            dres.whatNext  = Dis_StopHere;
+            llPutIReg(15, mkU32(dst));
+            dres.jk_StopHere = jk;
+            dres.whatNext    = Dis_StopHere;
          }
          DIP("b%s 0x%x\n", link ? "l" : "", dst);
       } else {
@@ -13002,7 +13002,8 @@
             stmt( IRStmt_Exit( unop(Iop_Not1,
                                     unop(Iop_32to1, mkexpr(condT))),
                                Ijk_Boring,
-                               IRConst_U32(guest_R15_curr_instr_notENC+4) ));
+                               IRConst_U32(guest_R15_curr_instr_notENC+4),
+                               OFFB_R15T ));
             dres.whatNext   = Dis_ResteerC;
             dres.continueAt = (Addr64)(Addr32)dst;
             comment = "(assumed taken)";
@@ -13021,7 +13022,8 @@
                following this one. */
             stmt( IRStmt_Exit( unop(Iop_32to1, mkexpr(condT)),
                                Ijk_Boring,
-                               IRConst_U32(dst) ));
+                               IRConst_U32(dst),
+                               OFFB_R15T ));
             dres.whatNext   = Dis_ResteerC;
             dres.continueAt = (Addr64)(Addr32)
                                       (guest_R15_curr_instr_notENC+4);
@@ -13031,10 +13033,10 @@
             /* Conservative default translation - end the block at
                this point. */
             stmt( IRStmt_Exit( unop(Iop_32to1, mkexpr(condT)),
-                               jk, IRConst_U32(dst) ));
-            irsb->next     = mkU32(guest_R15_curr_instr_notENC + 4);
-            irsb->jumpkind = Ijk_Boring;
-            dres.whatNext  = Dis_StopHere;
+                               jk, IRConst_U32(dst), OFFB_R15T ));
+            llPutIReg(15, mkU32(guest_R15_curr_instr_notENC + 4));
+            dres.jk_StopHere = Ijk_Boring;
+            dres.whatNext    = Dis_StopHere;
          }
          DIP("b%s%s 0x%x %s\n", link ? "l" : "", nCC(INSN_COND),
              dst, comment);
@@ -13065,10 +13067,10 @@
             putIRegA( 14, mkU32(guest_R15_curr_instr_notENC + 4),
                       IRTemp_INVALID/*because AL*/, Ijk_Boring );
          }
-         irsb->next     = mkexpr(dst);
-         irsb->jumpkind = link ? Ijk_Call
-                               : (rM == 14 ? Ijk_Ret : Ijk_Boring);
-         dres.whatNext  = Dis_StopHere;
+         llPutIReg(15, mkexpr(dst));
+         dres.jk_StopHere = link ? Ijk_Call
+                                 : (rM == 14 ? Ijk_Ret : Ijk_Boring);
+         dres.whatNext    = Dis_StopHere;
          if (condT == IRTemp_INVALID) {
             DIP("b%sx r%u\n", link ? "l" : "", rM);
          } else {
@@ -13363,9 +13365,9 @@
             mk_skip_over_A32_if_cond_is_false( condT );
          }
          // AL after here
-         irsb->next     = mkU32( guest_R15_curr_instr_notENC + 4 );
-         irsb->jumpkind = Ijk_Sys_syscall;
-         dres.whatNext  = Dis_StopHere;
+         llPutIReg(15, mkU32( guest_R15_curr_instr_notENC + 4 ));
+         dres.jk_StopHere = Ijk_Sys_syscall;
+         dres.whatNext    = Dis_StopHere;
          DIP("svc%s #0x%08x\n", nCC(INSN_COND), imm24);
          goto decode_success;
       }
@@ -13415,7 +13417,8 @@
          }
          stmt( IRStmt_Exit(unop(Iop_Not1, mkexpr(tSC1)),
                            /*Ijk_NoRedir*/Ijk_Boring,
-                           IRConst_U32(guest_R15_curr_instr_notENC)) );
+                           IRConst_U32(guest_R15_curr_instr_notENC),
+                           OFFB_R15T ));
          putIRegA(rD, isB ? unop(Iop_8Uto32, mkexpr(tOld)) : mkexpr(tOld),
                       IRTemp_INVALID, Ijk_Boring);
          DIP("swp%s%s r%u, r%u, [r%u]\n",
@@ -14142,10 +14145,9 @@
       now. */
    vassert(0 == (guest_R15_curr_instr_notENC & 3));
    llPutIReg( 15, mkU32(guest_R15_curr_instr_notENC) );
-   irsb->next     = mkU32(guest_R15_curr_instr_notENC);
-   irsb->jumpkind = Ijk_NoDecode;
-   dres.whatNext  = Dis_StopHere;
-   dres.len       = 0;
+   dres.whatNext    = Dis_StopHere;
+   dres.jk_StopHere = Ijk_NoDecode;
+   dres.len         = 0;
    return dres;
 
   decode_success:
@@ -14186,12 +14188,31 @@
                        binop(Iop_Xor32,
                              mkexpr(r15guard), mkU32(1))),
                   r15kind,
-                  IRConst_U32(guest_R15_curr_instr_notENC + 4)
+                  IRConst_U32(guest_R15_curr_instr_notENC + 4),
+                  OFFB_R15T
          ));
       }
-      irsb->next     = llGetIReg(15);
-      irsb->jumpkind = r15kind;
-      dres.whatNext  = Dis_StopHere;
+      /* This seems crazy, but we're required to finish the insn with
+         a write to the guest PC.  As usual we rely on ir_opt to tidy
+         up later. */
+      llPutIReg(15, llGetIReg(15));
+      dres.whatNext    = Dis_StopHere;
+      dres.jk_StopHere = r15kind;
+   } else {
+      /* Set up the end-state in the normal way. */
+      switch (dres.whatNext) {
+         case Dis_Continue:
+            llPutIReg(15, mkU32(dres.len + guest_R15_curr_instr_notENC));
+            break;
+         case Dis_ResteerU:
+         case Dis_ResteerC:
+            llPutIReg(15, mkU32(dres.continueAt));
+            break;
+         case Dis_StopHere:
+            break;
+         default:
+            vassert(0);
+      }
    }
 
    return dres;
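
As the "this seems crazy" comment above explains, an instruction
whose IR has already written r15 must nevertheless finish with a PC
write, so the epilogue emits the apparently redundant

   PUT(R15T) = GET:I32(R15T)

This satisfies the invariant that bb_to_IR asserts (the last
statement of every instruction is a Put to the guest IP), and
iropt's redundant PUT/GET removal tidies it away afterwards.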
@@ -14219,7 +14240,6 @@
 
 static   
 DisResult disInstr_THUMB_WRK (
-             Bool         put_IP,
              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
              Bool         resteerCisOk,
              void*        callback_opaque,
@@ -14249,9 +14269,10 @@
    // etc etc
 
    /* Set result defaults. */
-   dres.whatNext   = Dis_Continue;
-   dres.len        = 2;
-   dres.continueAt = 0;
+   dres.whatNext    = Dis_Continue;
+   dres.len         = 2;
+   dres.continueAt  = 0;
+   dres.jk_StopHere = Ijk_INVALID;
 
    /* Set default actions for post-insn handling of writes to r15, if
       required. */
@@ -14270,11 +14291,7 @@
 
    DIP("\t(thumb) 0x%x:  ", (UInt)guest_R15_curr_instr_notENC);
 
-   /* We may be asked to update the guest R15 before going further. */
    vassert(0 == (guest_R15_curr_instr_notENC & 1));
-   if (put_IP) {
-      llPutIReg( 15, mkU32(guest_R15_curr_instr_notENC | 1) );
-   }
 
    /* ----------------------------------------------------------- */
    /* Spot "Special" instructions (see comment at top of file). */
@@ -14301,9 +14318,9 @@
                                                /* orr.w r10,r10,r10 */) {
             /* R3 = client_request ( R4 ) */
             DIP("r3 = client_request ( %%r4 )\n");
-            irsb->next     = mkU32( (guest_R15_curr_instr_notENC + 20) | 1 );
-            irsb->jumpkind = Ijk_ClientReq;
-            dres.whatNext  = Dis_StopHere;
+            llPutIReg(15, mkU32( (guest_R15_curr_instr_notENC + 20) | 1 ));
+            dres.jk_StopHere = Ijk_ClientReq;
+            dres.whatNext    = Dis_StopHere;
             goto decode_success;
          }
          else
@@ -14323,9 +14340,9 @@
             /*  branch-and-link-to-noredir R4 */
             DIP("branch-and-link-to-noredir r4\n");
             llPutIReg(14, mkU32( (guest_R15_curr_instr_notENC + 20) | 1 ));
-            irsb->next     = getIRegT(4);
-            irsb->jumpkind = Ijk_NoRedir;
-            dres.whatNext  = Dis_StopHere;
+            llPutIReg(15, getIRegT(4));
+            dres.jk_StopHere = Ijk_NoRedir;
+            dres.whatNext    = Dis_StopHere;
             goto decode_success;
          }
          /* We don't know what it is.  Set insn0 so decode_failure
@@ -14982,9 +14999,9 @@
             vassert(rM == 15);
             assign( dst, mkU32(guest_R15_curr_instr_notENC + 4) );
          }
-         irsb->next     = mkexpr(dst);
-         irsb->jumpkind = rM == 14 ? Ijk_Ret : Ijk_Boring;
-         dres.whatNext  = Dis_StopHere;
+         llPutIReg(15, mkexpr(dst));
+         dres.jk_StopHere = rM == 14 ? Ijk_Ret : Ijk_Boring;
+         dres.whatNext    = Dis_StopHere;
          DIP("bx r%u (possibly switch to ARM mode)\n", rM);
          goto decode_success;
       }
@@ -15006,9 +15023,9 @@
             assign( dst, getIRegT(rM) );
             putIRegT( 14, mkU32( (guest_R15_curr_instr_notENC + 2) | 1 ),
                           IRTemp_INVALID );
-            irsb->next     = mkexpr(dst);
-            irsb->jumpkind = Ijk_Call;
-            dres.whatNext  = Dis_StopHere;
+            llPutIReg(15, mkexpr(dst));
+            dres.jk_StopHere = Ijk_Call;
+            dres.whatNext    = Dis_StopHere;
             DIP("blx r%u (possibly switch to ARM mode)\n", rM);
             goto decode_success;
          }
@@ -15039,9 +15056,9 @@
          // stash pseudo-reg, and back up from that if we have to
          // restart.
          // uncond after here
-         irsb->next     = mkU32( (guest_R15_curr_instr_notENC + 2) | 1 );
-         irsb->jumpkind = Ijk_Sys_syscall;
-         dres.whatNext  = Dis_StopHere;
+         llPutIReg(15, mkU32( (guest_R15_curr_instr_notENC + 2) | 1 ));
+         dres.jk_StopHere = Ijk_Sys_syscall;
+         dres.whatNext    = Dis_StopHere;
          DIP("svc #0x%08x\n", imm8);
          goto decode_success;
       }
@@ -15121,9 +15138,9 @@
             condT = IRTemp_INVALID;
             // now uncond
             /* non-interworking branch */
-            irsb->next = binop(Iop_Or32, mkexpr(val), mkU32(1));
-            irsb->jumpkind = rM == 14 ? Ijk_Ret : Ijk_Boring;
-            dres.whatNext = Dis_StopHere;
+            llPutIReg(15, binop(Iop_Or32, mkexpr(val), mkU32(1)));
+            dres.jk_StopHere = rM == 14 ? Ijk_Ret : Ijk_Boring;
+            dres.whatNext    = Dis_StopHere;
          }
          DIP("mov r%u, r%u\n", rD, rM);
          goto decode_success;
@@ -15178,7 +15195,8 @@
       UInt dst = (guest_R15_curr_instr_notENC + 4 + imm32) | 1;
       stmt(IRStmt_Exit( mkexpr(kond),
                         Ijk_Boring,
-                        IRConst_U32(toUInt(dst)) ));
+                        IRConst_U32(toUInt(dst)),
+                        OFFB_R15T ));
       DIP("cb%s r%u, 0x%x\n", bOP ? "nz" : "z", rN, dst - 1);
       goto decode_success;
    }
@@ -15322,9 +15340,9 @@
             it as is, no need to mess with it.  Note, therefore, this
             is an interworking return. */
          if (bitR) {
-            irsb->next     = mkexpr(newPC);
-            irsb->jumpkind = Ijk_Ret;
-            dres.whatNext  = Dis_StopHere;
+            llPutIReg(15, mkexpr(newPC));
+            dres.jk_StopHere = Ijk_Ret;
+            dres.whatNext    = Dis_StopHere;
          }
 
          DIP("pop {%s0x%04x}\n", bitR ? "pc," : "", regList & 0xFF);
@@ -15869,9 +15887,9 @@
       mk_skip_over_T16_if_cond_is_false(condT);
       condT = IRTemp_INVALID;
       // now uncond
-      irsb->next     = mkU32( dst | 1 /*CPSR.T*/ );
-      irsb->jumpkind = Ijk_Boring;
-      dres.whatNext  = Dis_StopHere;
+      llPutIReg(15, mkU32( dst | 1 /*CPSR.T*/ ));
+      dres.jk_StopHere = Ijk_Boring;
+      dres.whatNext    = Dis_StopHere;
       DIP("b 0x%x\n", dst);
       goto decode_success;
    }
@@ -15900,11 +15918,12 @@
          assign( kondT, mk_armg_calculate_condition(cond) );
          stmt( IRStmt_Exit( unop(Iop_32to1, mkexpr(kondT)),
                             Ijk_Boring,
-                            IRConst_U32(dst | 1/*CPSR.T*/) ));
-         irsb->next = mkU32( (guest_R15_curr_instr_notENC + 2) 
-                             | 1 /*CPSR.T*/ );
-         irsb->jumpkind = Ijk_Boring;
-         dres.whatNext  = Dis_StopHere;
+                            IRConst_U32(dst | 1/*CPSR.T*/),
+                            OFFB_R15T ));
+         llPutIReg(15, mkU32( (guest_R15_curr_instr_notENC + 2) 
+                              | 1 /*CPSR.T*/ ));
+         dres.jk_StopHere = Ijk_Boring;
+         dres.whatNext    = Dis_StopHere;
          DIP("b%s 0x%x\n", nCC(cond), dst);
          goto decode_success;
       }
@@ -15982,17 +16001,17 @@
          if (isBL) {
             /* BL: unconditional T -> T call */
             /* we're calling Thumb code, hence "| 1" */
-            irsb->next = mkU32( dst | 1 );
+            llPutIReg(15, mkU32( dst | 1 ));
             DIP("bl 0x%x (stay in Thumb mode)\n", dst);
          } else {
             /* BLX: unconditional T -> A call */
             /* we're calling ARM code, hence "& 3" to align to a
                valid ARM insn address */
-            irsb->next = mkU32( dst & ~3 );
+            llPutIReg(15, mkU32( dst & ~3 ));
             DIP("blx 0x%x (switch to ARM mode)\n", dst & ~3);
          }
-         irsb->jumpkind = Ijk_Call;
-         dres.whatNext = Dis_StopHere;
+         dres.whatNext    = Dis_StopHere;
+         dres.jk_StopHere = Ijk_Call;
          goto decode_success;
       }
    }
@@ -16057,15 +16076,15 @@
          condT = IRTemp_INVALID;
          // now uncond
 
-         /* Generate the IR.  This might generate a write to R15, */
+         /* Generate the IR.  This might generate a write to R15. */
          mk_ldm_stm(False/*!arm*/, rN, bINC, bBEFORE, bW, bL, regList);
 
          if (bL == 1 && (regList & (1<<15))) {
             // If we wrote to R15, we have an interworking return to
             // deal with.
-            irsb->next     = llGetIReg(15);
-            irsb->jumpkind = Ijk_Ret;
-            dres.whatNext  = Dis_StopHere;
+            llPutIReg(15, llGetIReg(15));
+            dres.jk_StopHere = Ijk_Ret;
+            dres.whatNext    = Dis_StopHere;
          }
 
          DIP("%sm%c%c r%u%s, {0x%04x}\n",
@@ -16930,18 +16949,19 @@
                putIRegT(rT, mkexpr(newRt), IRTemp_INVALID);
             }
 
-            if (loadsPC) {
-               /* Presumably this is an interworking branch. */
-               irsb->next = mkexpr(newRt);
-               irsb->jumpkind = Ijk_Boring;  /* or _Ret ? */
-               dres.whatNext  = Dis_StopHere;
-            }
-
             /* Update Rn if necessary. */
             if (bW == 1) {
                vassert(rN != rT); // assured by validity check above
                putIRegT(rN, mkexpr(postAddr), IRTemp_INVALID);
             }
+
+            if (loadsPC) {
+               /* Presumably this is an interworking branch. */
+               vassert(rN != 15); // assured by validity check above
+               llPutIReg(15, mkexpr(newRt));
+               dres.jk_StopHere = Ijk_Boring;  /* or _Ret ? */
+               dres.whatNext    = Dis_StopHere;
+            }
          }
 
          if (bP == 1 && bW == 0) {
@@ -17087,9 +17107,9 @@
 
             if (loadsPC) {
                /* Presumably this is an interworking branch. */
-               irsb->next = mkexpr(newRt);
-               irsb->jumpkind = Ijk_Boring;  /* or _Ret ? */
-               dres.whatNext  = Dis_StopHere;
+               llPutIReg(15, mkexpr(newRt));
+               dres.jk_StopHere = Ijk_Boring;  /* or _Ret ? */
+               dres.whatNext    = Dis_StopHere;
             }
          }
 
@@ -17345,11 +17365,12 @@
          assign( kondT, mk_armg_calculate_condition(cond) );
          stmt( IRStmt_Exit( unop(Iop_32to1, mkexpr(kondT)),
                             Ijk_Boring,
-                            IRConst_U32(dst | 1/*CPSR.T*/) ));
-         irsb->next = mkU32( (guest_R15_curr_instr_notENC + 4) 
-                             | 1 /*CPSR.T*/ );
-         irsb->jumpkind = Ijk_Boring;
-         dres.whatNext  = Dis_StopHere;
+                            IRConst_U32(dst | 1/*CPSR.T*/),
+                            OFFB_R15T ));
+         llPutIReg(15, mkU32( (guest_R15_curr_instr_notENC + 4) 
+                              | 1 /*CPSR.T*/ ));
+         dres.jk_StopHere = Ijk_Boring;
+         dres.whatNext    = Dis_StopHere;
          DIP("b%s.w 0x%x\n", nCC(cond), dst);
          goto decode_success;
       }
@@ -17390,9 +17411,9 @@
          // now uncond
 
          // branch to dst
-         irsb->next = mkU32( dst | 1 /*CPSR.T*/ );
-         irsb->jumpkind = Ijk_Boring;
-         dres.whatNext  = Dis_StopHere;
+         llPutIReg(15, mkU32( dst | 1 /*CPSR.T*/ ));
+         dres.jk_StopHere = Ijk_Boring;
+         dres.whatNext    = Dis_StopHere;
          DIP("b.w 0x%x\n", dst);
          goto decode_success;
       }
@@ -17423,16 +17444,17 @@
             assign(delta, unop(Iop_8Uto32, loadLE(Ity_I8, ea)));
          }
 
-         irsb->next
-            = binop(Iop_Or32,
-                    binop(Iop_Add32,
-                          getIRegT(15),
-                          binop(Iop_Shl32, mkexpr(delta), mkU8(1))
-                    ),
-                    mkU32(1)
-              );
-         irsb->jumpkind = Ijk_Boring;
-         dres.whatNext = Dis_StopHere;
+         llPutIReg(
+            15,
+            binop(Iop_Or32,
+                  binop(Iop_Add32,
+                        getIRegT(15),
+                        binop(Iop_Shl32, mkexpr(delta), mkU8(1))
+                  ),
+                  mkU32(1)
+         ));
+         dres.jk_StopHere = Ijk_Boring;
+         dres.whatNext    = Dis_StopHere;
          DIP("tb%c [r%u, r%u%s]\n",
              bH ? 'h' : 'b', rN, rM, bH ? ", LSL #1" : "");
          goto decode_success;
@@ -18199,60 +18221,29 @@
       now. */
    vassert(0 == (guest_R15_curr_instr_notENC & 1));
    llPutIReg( 15, mkU32(guest_R15_curr_instr_notENC | 1) );
-   irsb->next     = mkU32(guest_R15_curr_instr_notENC | 1 /* CPSR.T */);
-   irsb->jumpkind = Ijk_NoDecode;
-   dres.whatNext  = Dis_StopHere;
-   dres.len       = 0;
+   dres.whatNext    = Dis_StopHere;
+   dres.jk_StopHere = Ijk_NoDecode;
+   dres.len         = 0;
    return dres;
 
   decode_success:
    /* All decode successes end up here. */
-   DIP("\n");
-
-   vassert(dres.len == 2 || dres.len == 4 || dres.len == 20);
-
-#if 0
-   // XXX is this necessary on Thumb?
-   /* Now then.  Do we have an implicit jump to r15 to deal with? */
-   if (r15written) {
-      /* If we get jump to deal with, we assume that there's been no
-         other competing branch stuff previously generated for this
-         insn.  That's reasonable, in the sense that the ARM insn set
-         appears to declare as "Unpredictable" any instruction which
-         generates more than one possible new value for r15.  Hence
-         just assert.  The decoders themselves should check against
-         all such instructions which are thusly Unpredictable, and
-         decline to decode them.  Hence we should never get here if we
-         have competing new values for r15, and hence it is safe to
-         assert here. */
-      vassert(dres.whatNext == Dis_Continue);
-      vassert(irsb->next == NULL);
-      vassert(irsb->jumpkind == Ijk_Boring);
-      /* If r15 is unconditionally written, terminate the block by
-         jumping to it.  If it's conditionally written, still
-         terminate the block (a shame, but we can't do side exits to
-         arbitrary destinations), but first jump to the next
-         instruction if the condition doesn't hold. */
-      /* We can't use getIRegT(15) to get the destination, since that
-         will produce r15+4, which isn't what we want.  Must use
-         llGetIReg(15) instead. */
-      if (r15guard == IRTemp_INVALID) {
-         /* unconditional */
-      } else {
-         /* conditional */
-         stmt( IRStmt_Exit(
-                  unop(Iop_32to1,
-                       binop(Iop_Xor32,
-                             mkexpr(r15guard), mkU32(1))),
-                  r15kind,
-                  IRConst_U32(guest_R15_curr_instr_notENC + 4)
-         ));
-      }
-      irsb->next     = llGetIReg(15);
-      irsb->jumpkind = r15kind;
-      dres.whatNext  = Dis_StopHere;
+   vassert(dres.len == 4 || dres.len == 2 || dres.len == 20);
+   switch (dres.whatNext) {
+      case Dis_Continue:
+         llPutIReg(15, mkU32(dres.len + (guest_R15_curr_instr_notENC | 1)));
+         break;
+      case Dis_ResteerU:
+      case Dis_ResteerC:
+         llPutIReg(15, mkU32(dres.continueAt));
+         break;
+      case Dis_StopHere:
+         break;
+      default:
+         vassert(0);
    }
-#endif
+
+   DIP("\n");
 
    return dres;
 
@@ -18351,7 +18342,6 @@
    is located in host memory at &guest_code[delta]. */
 
 DisResult disInstr_ARM ( IRSB*        irsb_IN,
-                         Bool         put_IP,
                          Bool         (*resteerOkFn) ( void*, Addr64 ),
                          Bool         resteerCisOk,
                          void*        callback_opaque,
@@ -18380,12 +18370,12 @@
    }
 
    if (isThumb) {
-      dres = disInstr_THUMB_WRK ( put_IP, resteerOkFn,
+      dres = disInstr_THUMB_WRK ( resteerOkFn,
                                   resteerCisOk, callback_opaque,
                                   &guest_code_IN[delta_ENCODED - 1],
                                   archinfo, abiinfo );
    } else {
-      dres = disInstr_ARM_WRK ( put_IP, resteerOkFn,
+      dres = disInstr_ARM_WRK ( resteerOkFn,
                                 resteerCisOk, callback_opaque,
                                 &guest_code_IN[delta_ENCODED],
                                 archinfo, abiinfo );
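
One Thumb-specific detail running through the hunks above: every
value written to r15 from Thumb code has bit 0 set (the
"| 1 /*CPSR.T*/" idiom), since the guest R15T register keeps the
Thumb/ARM state in its low bit.  Even the plain fall-through
epilogue does so:

   llPutIReg(15, mkU32(dres.len + (guest_R15_curr_instr_notENC | 1)));

whereas the ARM-mode epilogue writes the address un-ORed.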
diff --git a/priv/guest_generic_bb_to_IR.c b/priv/guest_generic_bb_to_IR.c
index 32dca8c..6066fe0 100644
--- a/priv/guest_generic_bb_to_IR.c
+++ b/priv/guest_generic_bb_to_IR.c
@@ -140,6 +140,43 @@
    (In fact it's a VgInstrumentClosure.)
 */
 
+/* Regarding IP updating.  dis_instr_fn (that does the guest-specific
+   work of disassembling an individual instruction) must finish the
+   resulting IR with "PUT(guest_IP) = ".  Hence in all cases it must
+   state the next instruction address.
+
+   If the block is to be ended at that point, then this routine
+   (bb_to_IR) will set up the next/jumpkind/offsIP fields so as to
+   make a transfer (of the right kind) to "GET(guest_IP)".  Hence if
+   dis_instr_fn generates incorrect IP updates we will see it
+   immediately (due to jumping to the wrong next guest address).
+
+   However it is also necessary to set this up so it can be optimised
+   nicely.  The IRSB exit is defined to update the guest IP, so that
+   chaining works -- since the chain_me stubs expect the chain-to
+   address to be in the guest state.  Hence what the IRSB next fields
+   will contain initially is (implicitly)
+
+   PUT(guest_IP) [implicitly] = GET(guest_IP) [explicit expr on ::next]
+
+   which looks pretty strange at first.  Eg so unconditional branch
+   to some address 0x123456 looks like this:
+
+   PUT(guest_IP) = 0x123456;  // dis_instr_fn generates this
+   // the exit
+   PUT(guest_IP) [implicitly] = GET(guest_IP); exit-Boring
+
+   after redundant-GET and -PUT removal by iropt, we get what we want:
+
+   // the exit
+   PUT(guest_IP) [implicitly] = 0x123456; exit-Boring
+
+   This makes the IRSB-end case the same as the side-exit case: update
+   IP, then transfer.  There is no redundancy of representation for
+   the destination, and we use the destination specified by
+   dis_instr_fn, so any errors it makes show up sooner.
+*/
+
 IRSB* bb_to_IR ( 
          /*OUT*/VexGuestExtents* vge,
          /*OUT*/UInt*            n_sc_extents,
@@ -155,13 +192,15 @@
          /*IN*/ IRType           guest_word_type,
          /*IN*/ UInt             (*needs_self_check)(void*,VexGuestExtents*),
          /*IN*/ Bool             (*preamble_function)(void*,IRSB*),
-         /*IN*/ Int              offB_TISTART,
-         /*IN*/ Int              offB_TILEN
+         /*IN*/ Int              offB_GUEST_TISTART,
+         /*IN*/ Int              offB_GUEST_TILEN,
+         /*IN*/ Int              offB_GUEST_IP,
+         /*IN*/ Int              szB_GUEST_IP
       )
 {
    Long       delta;
    Int        i, n_instrs, first_stmt_idx;
-   Bool       resteerOK, need_to_put_IP, debug_print;
+   Bool       resteerOK, debug_print;
    DisResult  dres;
    IRStmt*    imark;
    IRStmt*    nop;
@@ -185,6 +224,14 @@
    vassert(vex_control.guest_chase_thresh < vex_control.guest_max_insns);
    vassert(guest_word_type == Ity_I32 || guest_word_type == Ity_I64);
 
+   if (guest_word_type == Ity_I32) {
+      vassert(szB_GUEST_IP == 4);
+      vassert((offB_GUEST_IP % 4) == 0);
+   } else {
+      vassert(szB_GUEST_IP == 8);
+      vassert((offB_GUEST_IP % 8) == 0);
+   }
+
    /* Start a new, empty extent. */
    vge->n_used  = 1;
    vge->base[0] = guest_IP_bbstart;
@@ -297,13 +344,12 @@
          );
       }
 
-      /* for the first insn, the dispatch loop will have set
-         %IP, but for all the others we have to do it ourselves. */
-      need_to_put_IP = toBool(n_instrs > 0);
+      if (debug_print && n_instrs > 0)
+         vex_printf("\n");
 
       /* Finally, actually disassemble an instruction. */
+      vassert(irsb->next == NULL);
       dres = dis_instr_fn ( irsb,
-                            need_to_put_IP,
                             resteerOKfn,
                             toBool(n_cond_resteers_allowed > 0),
                             callback_opaque,
@@ -347,18 +393,22 @@
          }
       }
 
-      /* If dis_instr_fn terminated the BB at this point, check it
-         also filled in the irsb->next field. */
-      if (dres.whatNext == Dis_StopHere) {
-         vassert(irsb->next != NULL);
-         if (debug_print) {
-            vex_printf("              ");
-            vex_printf( "goto {");
-            ppIRJumpKind(irsb->jumpkind);
-            vex_printf( "} ");
-            ppIRExpr( irsb->next );
-            vex_printf( "\n");
-         }
+      /* Individual insn disassembly may not mess with irsb->next.
+         This function is the only place where it can be set. */
+      vassert(irsb->next == NULL);
+      vassert(irsb->jumpkind == Ijk_Boring);
+      vassert(irsb->offsIP == 0);
+
+      /* Individual insn disassembly must finish the IR for each
+         instruction with an assignment to the guest PC. */
+      vassert(first_stmt_idx < irsb->stmts_used);
+      /* it follows that irsb->stmts_used must be > 0 */
+      { IRStmt* st = irsb->stmts[irsb->stmts_used-1];
+        vassert(st);
+        vassert(st->tag == Ist_Put);
+        vassert(st->Ist.Put.offset == offB_GUEST_IP);
+        /* Really we should also check that the type of the Put'd data
+           == guest_word_type, but that's a bit expensive. */
       }
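For example, the smallest per-insn IR that satisfies these checks is an IMark
followed by the guest-IP write (offset symbolic, values illustrative):

   ------ IMark(0x8048000, 1) ------
   PUT(offB_GUEST_IP) = 0x8048001:I32   /* the required final Put */
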
 
       /* Update the VexGuestExtents we are constructing. */
@@ -370,36 +420,38 @@
       vge->len[vge->n_used-1] 
          = toUShort(toUInt( vge->len[vge->n_used-1] + dres.len ));
       n_instrs++;
-      if (debug_print) 
-         vex_printf("\n");
 
       /* Advance delta (inconspicuous but very important :-) */
       delta += (Long)dres.len;
 
       switch (dres.whatNext) {
          case Dis_Continue:
-            vassert(irsb->next == NULL);
+            vassert(dres.continueAt == 0);
+            vassert(dres.jk_StopHere == Ijk_INVALID);
             if (n_instrs < vex_control.guest_max_insns) {
                /* keep going */
             } else {
-               /* We have to stop. */
-               irsb->next 
-                  = IRExpr_Const(
-                       guest_word_type == Ity_I32
-                          ? IRConst_U32(toUInt(guest_IP_bbstart+delta))
-                          : IRConst_U64(guest_IP_bbstart+delta)
-                    );
+               /* We have to stop.  See comment above re irsb field
+                  settings here. */
+               irsb->next = IRExpr_Get(offB_GUEST_IP, guest_word_type);
+               /* irsb->jumpkind must already be Ijk_Boring */
+               irsb->offsIP = offB_GUEST_IP;
                goto done;
             }
             break;
          case Dis_StopHere:
-            vassert(irsb->next != NULL);
+            vassert(dres.continueAt == 0);
+            vassert(dres.jk_StopHere != Ijk_INVALID);
+            /* See comment above re irsb field settings here. */
+            irsb->next = IRExpr_Get(offB_GUEST_IP, guest_word_type);
+            irsb->jumpkind = dres.jk_StopHere;
+            irsb->offsIP = offB_GUEST_IP;
             goto done;
+
          case Dis_ResteerU:
          case Dis_ResteerC:
             /* Check that we actually allowed a resteer .. */
             vassert(resteerOK);
-            vassert(irsb->next == NULL);
             if (dres.whatNext == Dis_ResteerC) {
                vassert(n_cond_resteers_allowed > 0);
                n_cond_resteers_allowed--;
@@ -628,10 +680,10 @@
             = IRStmt_WrTmp(tilen_tmp, IRExpr_Const(len2check_IRConst) );
 
          irsb->stmts[selfcheck_idx + i * 5 + 2]
-            = IRStmt_Put( offB_TISTART, IRExpr_RdTmp(tistart_tmp) );
+            = IRStmt_Put( offB_GUEST_TISTART, IRExpr_RdTmp(tistart_tmp) );
 
          irsb->stmts[selfcheck_idx + i * 5 + 3]
-            = IRStmt_Put( offB_TILEN, IRExpr_RdTmp(tilen_tmp) );
+            = IRStmt_Put( offB_GUEST_TILEN, IRExpr_RdTmp(tilen_tmp) );
 
          /* Generate the entry point descriptors */
          if (abiinfo_both->host_ppc_calls_use_fndescrs) {
@@ -685,11 +737,25 @@
                  /* Where we must restart if there's a failure: at the
                     first extent, regardless of which extent the
                     failure actually happened in. */
-                 guest_IP_bbstart_IRConst
+                 guest_IP_bbstart_IRConst,
+                 offB_GUEST_IP
               );
       } /* for (i = 0; i < vge->n_used; i++) */
    }
 
+   /* irsb->next must now be set, since we've finished the block.
+      Print it if necessary. */
+   vassert(irsb->next != NULL);
+   if (debug_print) {
+      vex_printf("              ");
+      vex_printf( "PUT(%d) = ", irsb->offsIP);
+      ppIRExpr( irsb->next );
+      vex_printf( "; exit-");
+      ppIRJumpKind(irsb->jumpkind);
+      vex_printf( "\n");
+      vex_printf( "\n");
+   }
+
    return irsb;
 }
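With debug_print enabled, the block end is now reported in the same shape as a
side exit, e.g. (offset value illustrative):

              PUT(184) = GET:I64(184); exit-Boring
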
 
diff --git a/priv/guest_generic_bb_to_IR.h b/priv/guest_generic_bb_to_IR.h
index f623443..9c1e740 100644
--- a/priv/guest_generic_bb_to_IR.h
+++ b/priv/guest_generic_bb_to_IR.h
@@ -76,6 +76,13 @@
       enum { Dis_StopHere, Dis_Continue, 
              Dis_ResteerU, Dis_ResteerC } whatNext;
 
+      /* For Dis_StopHere, we need to end the block and create a
+         transfer to whatever the NIA is.  That will presumably have
+         been set by the IR generated for this insn.  So we need to
+         know the jump kind to use.  Should be Ijk_INVALID in the
+         other Dis_ cases. */
+      IRJumpKind jk_StopHere;
+
       /* For Dis_Resteer, this is the guest address we should continue
          at.  Otherwise ignored (should be zero). */
       Addr64 continueAt;
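Taken together, a front end's DisResult follows this lifecycle (a sketch of
what the assertions in bb_to_IR enforce):

   DisResult dres;
   dres.whatNext    = Dis_Continue;    /* per-insn defaults */
   dres.len         = 0;
   dres.continueAt  = 0;
   dres.jk_StopHere = Ijk_INVALID;
   /* ... decode the insn ... */
   /* only on a block-ending insn, after Put'ing the NIA: */
   dres.whatNext    = Dis_StopHere;
   dres.jk_StopHere = Ijk_Call;        /* or Ret, Boring, Sys_syscall, .. */
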
@@ -112,10 +119,6 @@
       /* This is the IRSB to which the resulting IR is to be appended. */
       /*OUT*/ IRSB*        irbb,
 
-      /* Do we need to generate IR to set the guest IP for this insn,
-         or not? */
-      /*IN*/  Bool         put_IP,
-
       /* Return True iff resteering to the given addr is allowed (for
          branches/calls to destinations that are known at JIT-time) */
       /*IN*/  Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
@@ -173,8 +176,10 @@
          /*IN*/ IRType           guest_word_type,
          /*IN*/ UInt             (*needs_self_check)(void*,VexGuestExtents*),
          /*IN*/ Bool             (*preamble_function)(void*,IRSB*),
-         /*IN*/ Int              offB_TISTART,
-         /*IN*/ Int              offB_TILEN
+         /*IN*/ Int              offB_GUEST_TISTART,
+         /*IN*/ Int              offB_GUEST_TILEN,
+         /*IN*/ Int              offB_GUEST_IP,
+         /*IN*/ Int              szB_GUEST_IP
       );
 
 
diff --git a/priv/guest_ppc_defs.h b/priv/guest_ppc_defs.h
index 7c8dc8e..b60766d 100644
--- a/priv/guest_ppc_defs.h
+++ b/priv/guest_ppc_defs.h
@@ -48,7 +48,6 @@
    bb_to_IR.h. */
 extern
 DisResult disInstr_PPC ( IRSB*        irbb,
-                         Bool         put_IP,
                          Bool         (*resteerOkFn) ( void*, Addr64 ),
                          Bool         resteerCisOk,
                          void*        callback_opaque,
diff --git a/priv/guest_ppc_helpers.c b/priv/guest_ppc_helpers.c
index 26ec86f..7944f82 100644
--- a/priv/guest_ppc_helpers.c
+++ b/priv/guest_ppc_helpers.c
@@ -352,6 +352,11 @@
 void LibVEX_GuestPPC32_initialise ( /*OUT*/VexGuestPPC32State* vex_state )
 {
    Int i;
+   vex_state->host_EvC_FAILADDR = 0;
+   vex_state->host_EvC_COUNTER  = 0;
+   vex_state->pad3 = 0;
+   vex_state->pad4 = 0;
+
    vex_state->guest_GPR0  = 0;
    vex_state->guest_GPR1  = 0;
    vex_state->guest_GPR2  = 0;
@@ -385,7 +390,6 @@
    vex_state->guest_GPR30 = 0;
    vex_state->guest_GPR31 = 0;
 
-
    /* Initialise the vector state. */
 #  define VECZERO(_vr) _vr[0]=_vr[1]=_vr[2]=_vr[3] = 0;
 
@@ -484,6 +488,8 @@
 
    vex_state->guest_FPROUND  = PPCrm_NEAREST;
    vex_state->guest_DFPROUND = PPCrm_NEAREST;
+   vex_state->pad1 = 0;
+   vex_state->pad2 = 0;
 
    vex_state->guest_VRSAVE = 0;
 
@@ -503,6 +509,8 @@
 
    vex_state->guest_IP_AT_SYSCALL = 0;
    vex_state->guest_SPRG3_RO = 0;
+
+   vex_state->padding = 0;
 }
 
 
@@ -510,6 +518,9 @@
 void LibVEX_GuestPPC64_initialise ( /*OUT*/VexGuestPPC64State* vex_state )
 {
    Int i;
+   vex_state->host_EvC_FAILADDR = 0;
+   vex_state->host_EvC_COUNTER = 0;
+   vex_state->pad0 = 0;
    vex_state->guest_GPR0  = 0;
    vex_state->guest_GPR1  = 0;
    vex_state->guest_GPR2  = 0;
@@ -641,6 +652,8 @@
 
    vex_state->guest_FPROUND  = PPCrm_NEAREST;
    vex_state->guest_DFPROUND = PPCrm_NEAREST;
+   vex_state->pad1 = 0;
+   vex_state->pad2 = 0;
 
    vex_state->guest_VRSAVE = 0;
 
diff --git a/priv/guest_ppc_toIR.c b/priv/guest_ppc_toIR.c
index 91f568e..cfad56a 100644
--- a/priv/guest_ppc_toIR.c
+++ b/priv/guest_ppc_toIR.c
@@ -1505,7 +1505,7 @@
                   binop(Iop_And64, mkexpr(addr), mkU64(align-1)),
                   mkU64(0)),
             Ijk_SigBUS,
-            IRConst_U64( guest_CIA_curr_instr )
+            IRConst_U64( guest_CIA_curr_instr ), OFFB_CIA
          )
       );
    } else {
@@ -1516,7 +1516,7 @@
                   binop(Iop_And32, mkexpr(addr), mkU32(align-1)),
                   mkU32(0)),
             Ijk_SigBUS,
-            IRConst_U32( guest_CIA_curr_instr )
+            IRConst_U32( guest_CIA_curr_instr ), OFFB_CIA
          )
       );
    }
@@ -2693,7 +2693,7 @@
             IRStmt_Exit(
                binop(Iop_CmpNE32, mkU32(ew), mkU32(EmWarn_NONE)),
                Ijk_EmWarn,
-               mkSzConst( ty, nextInsnAddr()) ));
+               mkSzConst( ty, nextInsnAddr()), OFFB_CIA ));
       }
 
       /* Ignore all other writes */
@@ -4977,7 +4977,7 @@
       /* if (nBytes < (i+1)) goto NIA; */
       stmt( IRStmt_Exit( binop(Iop_CmpLT32U, e_nbytes, mkU32(i+1)),
                          Ijk_Boring, 
-                         mkSzConst( ty, nextInsnAddr()) ));
+                         mkSzConst( ty, nextInsnAddr()), OFFB_CIA ));
       /* when crossing into a new dest register, set it to zero. */
       if ((i % 4) == 0) {
          rD++; if (rD == 32) rD = 0;
@@ -5028,7 +5028,7 @@
       /* if (nBytes < (i+1)) goto NIA; */
       stmt( IRStmt_Exit( binop(Iop_CmpLT32U, e_nbytes, mkU32(i+1)),
                          Ijk_Boring, 
-                         mkSzConst( ty, nextInsnAddr() ) ));
+                         mkSzConst( ty, nextInsnAddr() ), OFFB_CIA ));
       /* check for crossing into a new src register. */
       if ((i % 4) == 0) {
          rS++; if (rS == 32) rS = 0;
@@ -5250,6 +5250,7 @@
 
    /* The default what-next.  Individual cases can override it. */    
    dres->whatNext = Dis_StopHere;
+   vassert(dres->jk_StopHere == Ijk_INVALID);
 
    switch (opc1) {
    case 0x12: // b     (Branch, PPC32 p360)
@@ -5282,8 +5283,8 @@
          dres->whatNext   = Dis_ResteerU;
          dres->continueAt = tgt;
       } else {
-         irsb->jumpkind = flag_LK ? Ijk_Call : Ijk_Boring;
-         irsb->next     = mkSzImm(ty, tgt);
+         dres->jk_StopHere = flag_LK ? Ijk_Call : Ijk_Boring;
+         putGST( PPC_GST_CIA, mkSzImm(ty, tgt) );
       }
       break;
       
@@ -5319,10 +5320,10 @@
       stmt( IRStmt_Exit(
                binop(Iop_CmpNE32, mkexpr(do_branch), mkU32(0)),
                flag_LK ? Ijk_Call : Ijk_Boring,
-               mkSzConst(ty, tgt) ) );
-      
-      irsb->jumpkind = Ijk_Boring;
-      irsb->next     = e_nia;
+               mkSzConst(ty, tgt), OFFB_CIA ) );
+
+      dres->jk_StopHere = Ijk_Boring;
+      putGST( PPC_GST_CIA, e_nia );
       break;
       
    case 0x13:
@@ -5354,15 +5355,15 @@
          stmt( IRStmt_Exit(
                   binop(Iop_CmpEQ32, mkexpr(cond_ok), mkU32(0)),
                   Ijk_Boring,
-                  c_nia ));
+                  c_nia, OFFB_CIA ));
 
          if (flag_LK && vbi->guest_ppc_zap_RZ_at_bl) {
             make_redzone_AbiHint( vbi, lr_old,
                                   "b-ctr-l (indirect call)" );
 	 }
 
-         irsb->jumpkind = flag_LK ? Ijk_Call : Ijk_Boring;
-         irsb->next     = mkexpr(lr_old);
+         dres->jk_StopHere = flag_LK ? Ijk_Call : Ijk_Boring;
+         putGST( PPC_GST_CIA, mkexpr(lr_old) );
          break;
          
       case 0x010: { // bclr (Branch Cond. to Link Register, PPC32 p365) 
@@ -5394,7 +5395,7 @@
          stmt( IRStmt_Exit(
                   binop(Iop_CmpEQ32, mkexpr(do_branch), mkU32(0)),
                   Ijk_Boring,
-                  c_nia ));
+                  c_nia, OFFB_CIA ));
 
          if (vanilla_return && vbi->guest_ppc_zap_RZ_at_blr) {
             make_redzone_AbiHint( vbi, lr_old,
@@ -5404,8 +5405,8 @@
          /* blrl is pretty strange; it's like a return that sets the
             return address of its caller to the insn following this
             one.  Mark it as a return. */
-         irsb->jumpkind = Ijk_Ret;  /* was flag_LK ? Ijk_Call : Ijk_Ret; */
-         irsb->next     = mkexpr(lr_old);
+         dres->jk_StopHere = Ijk_Ret;  /* was flag_LK ? Ijk_Call : Ijk_Ret; */
+         putGST( PPC_GST_CIA, mkexpr(lr_old) );
          break;
       }
       default:
@@ -5561,7 +5562,8 @@
       stmt( IRStmt_Exit( 
                binop(opCMPEQ, const0, const0), 
                Ijk_SigTRAP,
-               mode64 ? IRConst_U64(cia) : IRConst_U32((UInt)cia) 
+               mode64 ? IRConst_U64(cia) : IRConst_U32((UInt)cia),
+               OFFB_CIA
       ));
       return True; /* unconditional trap */
    }
@@ -5604,7 +5606,8 @@
    stmt( IRStmt_Exit( 
             binop(opCMPNE, cond, const0), 
             Ijk_SigTRAP,
-            mode64 ? IRConst_U64(cia) : IRConst_U32((UInt)cia) 
+            mode64 ? IRConst_U64(cia) : IRConst_U32((UInt)cia),
+            OFFB_CIA
    ));
    return False; /* not an unconditional trap */
 }
@@ -5652,9 +5655,9 @@
    if (uncond) {
       /* If the trap shows signs of being unconditional, don't
          continue decoding past it. */
-      irsb->next     = mkSzImm( ty, nextInsnAddr() );
-      irsb->jumpkind = Ijk_Boring;
-      dres->whatNext = Dis_StopHere;
+      putGST( PPC_GST_CIA, mkSzImm( ty, nextInsnAddr() ));
+      dres->jk_StopHere = Ijk_Boring;
+      dres->whatNext    = Dis_StopHere;
    }
 
    return True;
@@ -5706,9 +5709,9 @@
    if (uncond) {
       /* If the trap shows signs of being unconditional, don't
          continue decoding past it. */
-      irsb->next     = mkSzImm( ty, nextInsnAddr() );
-      irsb->jumpkind = Ijk_Boring;
-      dres->whatNext = Dis_StopHere;
+      putGST( PPC_GST_CIA, mkSzImm( ty, nextInsnAddr() ));
+      dres->jk_StopHere = Ijk_Boring;
+      dres->whatNext    = Dis_StopHere;
    }
 
    return True;
@@ -5739,12 +5742,12 @@
    /* It's important that all ArchRegs carry their up-to-date value
       at this point.  So we declare an end-of-block here, which
       forces any TempRegs caching ArchRegs to be flushed. */
-   irsb->next     = abiinfo->guest_ppc_sc_continues_at_LR
-                       ? getGST( PPC_GST_LR )
-                       : mkSzImm( ty, nextInsnAddr() );
-   irsb->jumpkind = Ijk_Sys_syscall;
+   putGST( PPC_GST_CIA, abiinfo->guest_ppc_sc_continues_at_LR
+                        ? getGST( PPC_GST_LR )
+                        : mkSzImm( ty, nextInsnAddr() ));
 
-   dres->whatNext = Dis_StopHere;
+   dres->whatNext    = Dis_StopHere;
+   dres->jk_StopHere = Ijk_Sys_syscall;
    return True;
 }
 
@@ -6722,9 +6725,9 @@
       /* be paranoid ... */
       stmt( IRStmt_MBE(Imbe_Fence) );
 
-      irsb->jumpkind = Ijk_TInval;
-      irsb->next     = mkSzImm(ty, nextInsnAddr());
-      dres->whatNext = Dis_StopHere;
+      putGST( PPC_GST_CIA, mkSzImm(ty, nextInsnAddr()));
+      dres->jk_StopHere = Ijk_TInval;
+      dres->whatNext    = Dis_StopHere;
       break;
    }
 
@@ -13761,7 +13764,6 @@
 
 static   
 DisResult disInstr_PPC_WRK ( 
-             Bool         put_IP,
              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
              Bool         resteerCisOk,
              void*        callback_opaque,
@@ -13805,9 +13807,10 @@
    delta = (Long)mkSzAddr(ty, (ULong)delta64);
 
    /* Set result defaults. */
-   dres.whatNext   = Dis_Continue;
-   dres.len        = 0;
-   dres.continueAt = 0;
+   dres.whatNext    = Dis_Continue;
+   dres.len         = 0;
+   dres.continueAt  = 0;
+   dres.jk_StopHere = Ijk_INVALID;
 
    /* At least this is simple on PPC32: insns are all 4 bytes long, and
       4-aligned.  So just fish the whole thing out of memory right now
@@ -13818,10 +13821,6 @@
 
    DIP("\t0x%llx:  ", (ULong)guest_CIA_curr_instr);
 
-   /* We may be asked to update the guest CIA before going further. */
-   if (put_IP)
-      putGST( PPC_GST_CIA, mkSzImm(ty, guest_CIA_curr_instr) );
-
    /* Spot "Special" instructions (see comment at top of file). */
    {
       UChar* code = (UChar*)(guest_code + delta);
@@ -13850,9 +13849,9 @@
             /* %R3 = client_request ( %R4 ) */
             DIP("r3 = client_request ( %%r4 )\n");
             delta += 20;
-            irsb->next     = mkSzImm( ty, guest_CIA_bbstart + delta );
-            irsb->jumpkind = Ijk_ClientReq;
-            dres.whatNext  = Dis_StopHere;
+            putGST( PPC_GST_CIA, mkSzImm( ty, guest_CIA_bbstart + delta ));
+            dres.jk_StopHere = Ijk_ClientReq;
+            dres.whatNext    = Dis_StopHere;
             goto decode_success;
          }
          else
@@ -13870,9 +13869,9 @@
             DIP("branch-and-link-to-noredir r11\n");
             delta += 20;
             putGST( PPC_GST_LR, mkSzImm(ty, guest_CIA_bbstart + (Long)delta) );
-            irsb->next     = getIReg(11);
-            irsb->jumpkind = Ijk_NoRedir;
-            dres.whatNext  = Dis_StopHere;
+            putGST( PPC_GST_CIA, getIReg(11));
+            dres.jk_StopHere = Ijk_NoRedir;
+            dres.whatNext    = Dis_StopHere;
             goto decode_success;
          }
          else
@@ -14493,9 +14492,9 @@
          Bool ok = dis_int_ldst_str( theInstr, &stopHere );
          if (!ok) goto decode_failure;
          if (stopHere) {
-            irsb->next     = mkSzImm(ty, nextInsnAddr());
-            irsb->jumpkind = Ijk_Boring;
-            dres.whatNext  = Dis_StopHere;
+            putGST( PPC_GST_CIA, mkSzImm(ty, nextInsnAddr()) );
+            dres.jk_StopHere = Ijk_Boring;
+            dres.whatNext    = Dis_StopHere;
          }
          goto decode_success;
       }
@@ -14848,16 +14847,28 @@
       insn, but nevertheless be paranoid and update it again right
       now. */
    putGST( PPC_GST_CIA, mkSzImm(ty, guest_CIA_curr_instr) );
-   irsb->next     = mkSzImm(ty, guest_CIA_curr_instr);
-   irsb->jumpkind = Ijk_NoDecode;
-   dres.whatNext  = Dis_StopHere;
-   dres.len       = 0;
+   dres.whatNext    = Dis_StopHere;
+   dres.jk_StopHere = Ijk_NoDecode;
+   dres.len         = 0;
    return dres;
 
    } /* switch (opc) for the main (primary) opcode switch. */
 
   decode_success:
    /* All decode successes end up here. */
+   switch (dres.whatNext) {
+      case Dis_Continue:
+         putGST( PPC_GST_CIA, mkSzImm(ty, guest_CIA_curr_instr + 4));
+         break;
+      case Dis_ResteerU:
+      case Dis_ResteerC:
+         putGST( PPC_GST_CIA, mkSzImm(ty, dres.continueAt));
+         break;
+      case Dis_StopHere:
+         break;
+      default:
+         vassert(0);
+   }
    DIP("\n");
 
    if (dres.len == 0) {
@@ -14880,7 +14891,6 @@
    is located in host memory at &guest_code[delta]. */
 
 DisResult disInstr_PPC ( IRSB*        irsb_IN,
-                         Bool         put_IP,
                          Bool         (*resteerOkFn) ( void*, Addr64 ),
                          Bool         resteerCisOk,
                          void*        callback_opaque,
@@ -14925,8 +14935,7 @@
    guest_CIA_curr_instr = mkSzAddr(ty, guest_IP);
    guest_CIA_bbstart    = mkSzAddr(ty, guest_IP - delta);
 
-   dres = disInstr_PPC_WRK ( put_IP, 
-                             resteerOkFn, resteerCisOk, callback_opaque,
+   dres = disInstr_PPC_WRK ( resteerOkFn, resteerCisOk, callback_opaque,
                              delta, archinfo, abiinfo );
 
    return dres;
diff --git a/priv/guest_s390_defs.h b/priv/guest_s390_defs.h
index 754ce3d..b7e57ba 100644
--- a/priv/guest_s390_defs.h
+++ b/priv/guest_s390_defs.h
@@ -43,7 +43,6 @@
 /* Convert one s390 insn to IR.  See the type DisOneInstrFn in
    bb_to_IR.h. */
 DisResult disInstr_S390 ( IRSB*        irbb,
-                          Bool         put_IP,
                           Bool         (*resteerOkFn) ( void*, Addr64 ),
                           Bool         resteerCisOk,
                           void*        callback_opaque,
diff --git a/priv/guest_s390_helpers.c b/priv/guest_s390_helpers.c
index 47a0635..167d426 100644
--- a/priv/guest_s390_helpers.c
+++ b/priv/guest_s390_helpers.c
@@ -130,6 +130,8 @@
    state->guest_TILEN = 0;
    state->guest_IP_AT_SYSCALL = 0;
    state->guest_EMWARN = EmWarn_NONE;
+   state->host_EvC_COUNTER = 0;
+   state->host_EvC_FAILADDR = 0;
 
 /*------------------------------------------------------------*/
 /*--- Initialise thunk                                     ---*/
diff --git a/priv/guest_s390_toIR.c b/priv/guest_s390_toIR.c
index d0dc00e..26190f0 100644
--- a/priv/guest_s390_toIR.c
+++ b/priv/guest_s390_toIR.c
@@ -120,6 +120,13 @@
    return IRExpr_RdTmp(tmp);
 }
 
+/* Generate an expression node for an address. */
+static __inline__ IRExpr *
+mkaddr_expr(Addr64 addr)
+{
+   return IRExpr_Const(IRConst_U64(addr));
+}
+
 /* Add a statement that assigns to a temporary */
 static __inline__ void
 assign(IRTemp dst, IRExpr *expr)
@@ -127,6 +134,22 @@
    stmt(IRStmt_WrTmp(dst, expr));
 }
 
+/* Write an address into the guest_IA */
+static __inline__ void
+put_IA(IRExpr *address)
+{
+   stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_IA), address));
+}
+
+/* Add a dummy put to guest_IA to satisfy the assert in bb_to_IR that
+   requires the IR for each insn to end with a put to guest_IA.
+   Mostly used for insns that use the "counter" pseudo guest reg. */
+static __inline__ void
+dummy_put_IA(void)
+{
+   put_IA(IRExpr_Get(S390X_GUEST_OFFSET(guest_IA), Ity_I64));
+}
+
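Typical use, as in the string and translate insns below: when an insn's real
work ends with a write to the counter pseudo-register, a trailing
dummy_put_IA() restores the invariant; iropt later deletes the redundant
GET/PUT pair:

   put_counter_dw0(mkU64(0));   /* insn's real final state write */
   dummy_put_IA();              /* keep bb_to_IR's last-Put check happy */
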
 /* Create a temporary of the given type and assign the expression to it */
 static __inline__ IRTemp
 mktemp(IRType type, IRExpr *expr)
@@ -242,10 +265,10 @@
 static void
 call_function(IRExpr *callee_address)
 {
-   irsb->next = callee_address;
-   irsb->jumpkind = Ijk_Call;
+   put_IA(callee_address);
 
-   dis_res->whatNext = Dis_StopHere;
+   dis_res->whatNext    = Dis_StopHere;
+   dis_res->jk_StopHere = Ijk_Call;
 }
 
 /* Function call with known target. */
@@ -256,9 +279,10 @@
       dis_res->whatNext   = Dis_ResteerU;
       dis_res->continueAt = callee_address;
    } else {
-      irsb->next = mkU64(callee_address);
-      irsb->jumpkind = Ijk_Call;
+      put_IA(mkaddr_expr(callee_address));
+
       dis_res->whatNext = Dis_StopHere;
+      dis_res->jk_StopHere = Ijk_Call;
    }
 }
 
@@ -266,10 +290,10 @@
 static void
 return_from_function(IRExpr *return_address)
 {
-   irsb->next = return_address;
-   irsb->jumpkind = Ijk_Ret;
+   put_IA(return_address);
 
-   dis_res->whatNext = Dis_StopHere;
+   dis_res->whatNext    = Dis_StopHere;
+   dis_res->jk_StopHere = Ijk_Ret;
 }
 
 /* A conditional branch whose target is not known at instrumentation time.
@@ -289,12 +313,13 @@
 {
    vassert(typeOfIRExpr(irsb->tyenv, condition) == Ity_I1);
 
-   stmt(IRStmt_Exit(condition, Ijk_Boring, IRConst_U64(guest_IA_next_instr)));
+   stmt(IRStmt_Exit(condition, Ijk_Boring, IRConst_U64(guest_IA_next_instr),
+                    S390X_GUEST_OFFSET(guest_IA)));
 
-   irsb->next = target;
-   irsb->jumpkind = Ijk_Boring;
+   put_IA(target);
 
-   dis_res->whatNext = Dis_StopHere;
+   dis_res->whatNext    = Dis_StopHere;
+   dis_res->jk_StopHere = Ijk_Boring;
 }
 
 /* A conditional branch whose target is known at instrumentation time. */
@@ -303,8 +328,13 @@
 {
    vassert(typeOfIRExpr(irsb->tyenv, condition) == Ity_I1);
 
-   stmt(IRStmt_Exit(condition, Ijk_Boring, IRConst_U64(target)));
-   dis_res->whatNext = Dis_Continue;
+   stmt(IRStmt_Exit(condition, Ijk_Boring, IRConst_U64(target),
+                    S390X_GUEST_OFFSET(guest_IA)));
+
+   put_IA(mkaddr_expr(guest_IA_next_instr));
+
+   dis_res->whatNext    = Dis_StopHere;
+   dis_res->jk_StopHere = Ijk_Boring;
 }
 
 /* An unconditional branch. Target may or may not be known at instrumentation
@@ -312,23 +342,26 @@
 static void
 always_goto(IRExpr *target)
 {
-   irsb->next = target;
-   irsb->jumpkind = Ijk_Boring;
+   put_IA(target);
 
-   dis_res->whatNext = Dis_StopHere;
+   dis_res->whatNext    = Dis_StopHere;
+   dis_res->jk_StopHere = Ijk_Boring;
 }
 
+
 /* An unconditional branch to a known target. */
 static void
 always_goto_and_chase(Addr64 target)
 {
    if (resteer_fn(resteer_data, target)) {
+      /* Follow into the target */
       dis_res->whatNext   = Dis_ResteerU;
       dis_res->continueAt = target;
    } else {
-      irsb->next = mkU64(target);
-      irsb->jumpkind = Ijk_Boring;
-      dis_res->whatNext = Dis_StopHere;
+      put_IA(mkaddr_expr(target));
+
+      dis_res->whatNext    = Dis_StopHere;
+      dis_res->jk_StopHere = Ijk_Boring;
    }
 }
 
@@ -343,14 +376,13 @@
    stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_IP_AT_SYSCALL),
                    mkU64(guest_IA_curr_instr)));
 
+   put_IA(mkaddr_expr(guest_IA_next_instr));
+
    /* It's important that all ArchRegs carry their up-to-date value
       at this point.  So we declare an end-of-block here, which
       forces any TempRegs caching ArchRegs to be flushed. */
-   irsb->next = mkU64(guest_IA_next_instr);
-
-   irsb->jumpkind = Ijk_Sys_syscall;
-
-   dis_res->whatNext = Dis_StopHere;
+   dis_res->whatNext    = Dis_StopHere;
+   dis_res->jk_StopHere = Ijk_Sys_syscall;
 }
 
 /* Encode the s390 rounding mode as it appears in the m3/m4 fields of certain
@@ -1796,6 +1828,7 @@
           mkU64(0)));
 
    irgen(r1, op2addr);
+   dummy_put_IA();
 
    if (UNLIKELY(vex_traceflags & VEX_TRACE_FE))
       s390_disasm(ENC3(XMNM, GPR, SDXB), xmnm_kind, m3, r1, dh2, dl2, 0, b2);
@@ -5841,6 +5874,7 @@
    if_condition_goto(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0)),
                      guest_IA_next_instr);
    put_gpr_w1(r1, get_gpr_w1(r2));
+   dummy_put_IA();
 
    return "locr";
 }
@@ -5851,6 +5885,7 @@
    if_condition_goto(binop(Iop_CmpEQ32, s390_call_calculate_cond(m3), mkU32(0)),
                      guest_IA_next_instr);
    put_gpr_dw0(r1, get_gpr_dw0(r2));
+   dummy_put_IA();
 
    return "locgr";
 }
@@ -8576,6 +8611,7 @@
    if_condition_goto(binop(Iop_CmpNE64, mkexpr(counter), mkU64(length)),
                      guest_IA_curr_instr);
    put_counter_dw0(mkU64(0));
+   dummy_put_IA();
 
    return "clc";
 }
@@ -8869,8 +8905,8 @@
    stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TISTART),
                    mkU64(guest_IA_curr_instr)));
    stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TILEN), mkU64(4)));
-   stmt(IRStmt_Exit(mkexpr(cond), Ijk_TInval,
-        IRConst_U64(guest_IA_curr_instr)));
+   stmt(IRStmt_Exit(mkexpr(cond), Ijk_TInval, IRConst_U64(guest_IA_curr_instr),
+                    S390X_GUEST_OFFSET(guest_IA)));
 
    ss.bytes = last_execute_target;
    assign(start1, binop(Iop_Add64, mkU64(ss.dec.d1),
@@ -8880,6 +8916,8 @@
    assign(len, unop(lensize == 64 ? Iop_8Uto64 : Iop_8Uto32, binop(Iop_Or8,
           r != 0 ? get_gpr_b7(r): mkU8(0), mkU8(ss.dec.l))));
    irgen(len, start1, start2);
+   dummy_put_IA();
+
    last_execute_target = 0;
 }
 
@@ -8901,10 +8939,12 @@
                       mkU64(guest_IA_curr_instr)));
       stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TILEN), mkU64(4)));
       stmt(IRStmt_Exit(IRExpr_Const(IRConst_U1(True)), Ijk_TInval,
-           IRConst_U64(guest_IA_curr_instr)));
+                       IRConst_U64(guest_IA_curr_instr),
+                       S390X_GUEST_OFFSET(guest_IA)));
       /* we know that this will be invalidated */
-      irsb->next = mkU64(guest_IA_next_instr);
+      put_IA(mkaddr_expr(guest_IA_next_instr));
       dis_res->whatNext = Dis_StopHere;
+      dis_res->jk_StopHere = Ijk_TInval;
       break;
    }
 
@@ -8959,7 +8999,8 @@
       stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TISTART), mkU64(guest_IA_curr_instr)));
       stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_TILEN), mkU64(4)));
       stmt(IRStmt_Exit(mkexpr(cond), Ijk_TInval,
-           IRConst_U64(guest_IA_curr_instr)));
+                       IRConst_U64(guest_IA_curr_instr),
+                       S390X_GUEST_OFFSET(guest_IA)));
 
       /* Now comes the actual translation */
       bytes = (UChar *) &last_execute_target;
@@ -8969,6 +9010,7 @@
          vex_printf("    which was executed by\n");
       /* dont make useless translations in the next execute */
       last_execute_target = 0;
+      dummy_put_IA();
    }
    }
    return "ex";
@@ -9033,10 +9075,12 @@
    put_gpr_dw0(r1, mkexpr(next));
    put_gpr_dw0(r2, binop(Iop_Add64, mkexpr(address), mkU64(1)));
    stmt(IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(counter), mkU64(255)),
-                    Ijk_Boring, IRConst_U64(guest_IA_curr_instr)));
+                    Ijk_Boring, IRConst_U64(guest_IA_curr_instr),
+                    S390X_GUEST_OFFSET(guest_IA)));
    // >= 256 bytes done CC=3
    s390_cc_set(3);
    put_counter_dw0(mkU64(0));
+   dummy_put_IA();
 
    return "srst";
 }
@@ -9099,10 +9143,12 @@
    put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), mkU64(1)));
    put_gpr_dw0(r2, binop(Iop_Add64, get_gpr_dw0(r2), mkU64(1)));
    stmt(IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(counter), mkU64(255)),
-                    Ijk_Boring, IRConst_U64(guest_IA_curr_instr)));
+                    Ijk_Boring, IRConst_U64(guest_IA_curr_instr),
+                    S390X_GUEST_OFFSET(guest_IA)));
    // >= 256 bytes done CC=3
    s390_cc_set(3);
    put_counter_dw0(mkU64(0));
+   dummy_put_IA();
 
    return "clst";
 }
@@ -9297,6 +9343,7 @@
    s390_cc_thunk_put1(S390_CC_OP_BITWISE, mktemp(Ity_I32, get_counter_w1()),
                       False);
    put_counter_dw0(mkU64(0));
+   dummy_put_IA();
 }
 
 static HChar *
@@ -9341,6 +9388,7 @@
    }
 
    s390_cc_thunk_put1(S390_CC_OP_BITWISE, mktemp(Ity_I32, mkU32(0)), False);
+   dummy_put_IA();
 
    if (UNLIKELY(vex_traceflags & VEX_TRACE_FE))
       s390_disasm(ENC3(MNM, UDLB, UDXB), "xc", d, length, b, d, 0, b);
@@ -9378,6 +9426,7 @@
    if_condition_goto(binop(Iop_CmpNE64, mkexpr(counter), mkU64(length)),
                      guest_IA_curr_instr);
    put_counter_dw0(mkU64(0));
+   dummy_put_IA();
 
    return "mvc";
 }
@@ -9558,6 +9607,7 @@
    s390_cc_set(1);
    put_gpr_dw0(r1, binop(Iop_Add64, mkexpr(addr1), mkexpr(counter)));
    put_counter_dw0(mkU64(0));
+   dummy_put_IA();
 
    return "mvst";
 }
@@ -9824,7 +9874,8 @@
    assign(nequal, binop(Iop_CmpNE32, s390_call_calculate_cc(), mkU32(0)));
    put_gpr_w1(r1, mkite(mkexpr(nequal), mkexpr(old_mem), mkexpr(op1)));
    stmt(IRStmt_Exit(mkexpr(nequal), Ijk_Yield,
-        IRConst_U64(guest_IA_next_instr)));
+                    IRConst_U64(guest_IA_next_instr),
+                    S390X_GUEST_OFFSET(guest_IA)));
 }
 
 static HChar *
@@ -9873,7 +9924,8 @@
    assign(nequal, binop(Iop_CmpNE32, s390_call_calculate_cc(), mkU32(0)));
    put_gpr_dw0(r1, mkite(mkexpr(nequal), mkexpr(old_mem), mkexpr(op1)));
    stmt(IRStmt_Exit(mkexpr(nequal), Ijk_Yield,
-        IRConst_U64(guest_IA_next_instr)));
+                    IRConst_U64(guest_IA_next_instr),
+                    S390X_GUEST_OFFSET(guest_IA)));
 
    return "csg";
 }
@@ -11059,6 +11111,7 @@
                      guest_IA_curr_instr);
 
    put_counter_dw0(mkU64(0));
+   dummy_put_IA();
 
    return "tr";
 }
@@ -11118,12 +11171,13 @@
    if (0)
       vex_printf("%%R3 = client_request ( %%R2 )\n");
 
-   irsb->next = mkU64((ULong)(guest_IA_curr_instr
-                              + S390_SPECIAL_OP_PREAMBLE_SIZE
-                              + S390_SPECIAL_OP_SIZE));
-   irsb->jumpkind = Ijk_ClientReq;
+   Addr64 next = guest_IA_curr_instr + S390_SPECIAL_OP_PREAMBLE_SIZE
+                                     + S390_SPECIAL_OP_SIZE;
 
+   dis_res->jk_StopHere = Ijk_ClientReq;
    dis_res->whatNext = Dis_StopHere;
+
+   put_IA(mkaddr_expr(next));
 }
 
 static void
@@ -11138,16 +11192,17 @@
 static void
 s390_irgen_call_noredir(void)
 {
+   Addr64 next = guest_IA_curr_instr + S390_SPECIAL_OP_PREAMBLE_SIZE
+                                     + S390_SPECIAL_OP_SIZE;
+
    /* Continue after special op */
-   put_gpr_dw0(14, mkU64(guest_IA_curr_instr
-                         + S390_SPECIAL_OP_PREAMBLE_SIZE
-                         + S390_SPECIAL_OP_SIZE));
+   put_gpr_dw0(14, mkaddr_expr(next));
 
    /* The address is in REG1, all parameters are in the right (guest) places */
-   irsb->next     = get_gpr_dw0(1);
-   irsb->jumpkind = Ijk_NoRedir;
+   put_IA(get_gpr_dw0(1));
 
    dis_res->whatNext = Dis_StopHere;
+   dis_res->jk_StopHere = Ijk_NoRedir;
 }
 
 /* Force proper alignment for the structures below. */
@@ -13475,11 +13530,10 @@
       }
    }
    /* If next instruction is execute, stop here */
-   if (irsb->next == NULL && dis_res->whatNext == Dis_Continue
-       && bytes[insn_length] == 0x44) {
-      irsb->next = IRExpr_Const(IRConst_U64(guest_IA_next_instr));
+   if (dis_res->whatNext == Dis_Continue && bytes[insn_length] == 0x44) {
+      put_IA(mkaddr_expr(guest_IA_next_instr));
       dis_res->whatNext = Dis_StopHere;
-      dis_res->continueAt = 0;
+      dis_res->jk_StopHere = Ijk_Boring;
    }
 
    if (status == S390_DECODE_OK) return insn_length;  /* OK */
@@ -13518,14 +13572,6 @@
 }
 
 
-/* Generate an IRExpr for an address. */
-static __inline__ IRExpr *
-mkaddr_expr(Addr64 addr)
-{
-   return IRExpr_Const(IRConst_U64(addr));
-}
-
-
 /* Disassemble a single instruction INSN into IR. */
 static DisResult
 disInstr_S390_WRK(UChar *insn)
@@ -13553,6 +13599,7 @@
    dres.whatNext   = Dis_Continue;
    dres.len        = insn_length;
    dres.continueAt = 0;
+   dres.jk_StopHere = Ijk_INVALID;
 
    /* fixs390: consider chasing of conditional jumps */
 
@@ -13561,17 +13608,28 @@
       /* All decode failures end up here. The decoder has already issued an
          error message.
          Tell the dispatcher that this insn cannot be decoded, and so has
-         not been executed, and (is currently) the next to be executed.
-         IA should be up-to-date since it made so at the start of each
-         insn, but nevertheless be paranoid and update it again right
-         now. */
-      stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_IA),
-                      mkaddr_expr(guest_IA_curr_instr)));
+         not been executed, and (is currently) the next to be executed. */
+      put_IA(mkaddr_expr(guest_IA_curr_instr));
 
-      irsb->next = mkaddr_expr(guest_IA_next_instr);
-      irsb->jumpkind = Ijk_NoDecode;
-      dres.whatNext = Dis_StopHere;
-      dres.len = 0;
+      dres.whatNext    = Dis_StopHere;
+      dres.jk_StopHere = Ijk_NoDecode;
+      dres.continueAt  = 0;
+      dres.len         = 0;
+   } else {
+      /* Decode success */
+      switch (dres.whatNext) {
+      case Dis_Continue:
+         put_IA(mkaddr_expr(guest_IA_next_instr));
+         break;
+      case Dis_ResteerU:
+      case Dis_ResteerC:
+         put_IA(mkaddr_expr(dres.continueAt));
+         break;
+      case Dis_StopHere:
+         break;
+      default:
+         vassert(0);
+      }
    }
 
    return dres;
@@ -13587,7 +13645,6 @@
 
 DisResult
 disInstr_S390(IRSB        *irsb_IN,
-              Bool         put_IP __attribute__((unused)),
               Bool       (*resteerOkFn)(void *, Addr64),
               Bool         resteerCisOk,
               void        *callback_opaque,
@@ -13610,10 +13667,6 @@
    resteer_fn = resteerOkFn;
    resteer_data = callback_opaque;
 
-   /* Always update the guest IA. See comment in s390_isel_stmt for Ist_Put. */
-   stmt(IRStmt_Put(S390X_GUEST_OFFSET(guest_IA),
-                   mkaddr_expr(guest_IA_curr_instr)));
-
    return disInstr_S390_WRK(guest_code + delta);
 }
 
diff --git a/priv/guest_x86_defs.h b/priv/guest_x86_defs.h
index 130d84d..e0b1526 100644
--- a/priv/guest_x86_defs.h
+++ b/priv/guest_x86_defs.h
@@ -47,7 +47,6 @@
    bb_to_IR.h. */
 extern
 DisResult disInstr_X86 ( IRSB*        irbb,
-                         Bool         put_IP,
                          Bool         (*resteerOkFn) ( void*, Addr64 ),
                          Bool         resteerCisOk,
                          void*        callback_opaque,
diff --git a/priv/guest_x86_helpers.c b/priv/guest_x86_helpers.c
index d14d08b..9f7a8f5 100644
--- a/priv/guest_x86_helpers.c
+++ b/priv/guest_x86_helpers.c
@@ -2670,6 +2670,9 @@
 /* VISIBLE TO LIBVEX CLIENT */
 void LibVEX_GuestX86_initialise ( /*OUT*/VexGuestX86State* vex_state )
 {
+   vex_state->host_EvC_FAILADDR = 0;
+   vex_state->host_EvC_COUNTER = 0;
+
    vex_state->guest_EAX = 0;
    vex_state->guest_ECX = 0;
    vex_state->guest_EDX = 0;
@@ -2727,8 +2730,6 @@
    vex_state->guest_IP_AT_SYSCALL = 0;
 
    vex_state->padding1 = 0;
-   vex_state->padding2 = 0;
-   vex_state->padding3 = 0;
 }
 
 
diff --git a/priv/guest_x86_toIR.c b/priv/guest_x86_toIR.c
index 4b15c61..8db5b54 100644
--- a/priv/guest_x86_toIR.c
+++ b/priv/guest_x86_toIR.c
@@ -768,7 +768,8 @@
             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                    mkexpr(oldTmp), mkexpr(expTmp) ),
             Ijk_Boring, /*Ijk_NoRedir*/
-            IRConst_U32( restart_point ) 
+            IRConst_U32( restart_point ),
+            OFFB_EIP
          ));
 }
 
@@ -1340,36 +1341,55 @@
 /*--- JMP helpers                                          ---*/
 /*------------------------------------------------------------*/
 
-static void jmp_lit( IRJumpKind kind, Addr32 d32 )
+static void jmp_lit( /*MOD*/DisResult* dres,
+                     IRJumpKind kind, Addr32 d32 )
 {
-   irsb->next     = mkU32(d32);
-   irsb->jumpkind = kind;
+   vassert(dres->whatNext    == Dis_Continue);
+   vassert(dres->len         == 0);
+   vassert(dres->continueAt  == 0);
+   vassert(dres->jk_StopHere == Ijk_INVALID);
+   dres->whatNext    = Dis_StopHere;
+   dres->jk_StopHere = kind;
+   stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
 }
 
-static void jmp_treg( IRJumpKind kind, IRTemp t )
+static void jmp_treg( /*MOD*/DisResult* dres,
+                      IRJumpKind kind, IRTemp t )
 {
-   irsb->next = mkexpr(t);
-   irsb->jumpkind = kind;
+   vassert(dres->whatNext    == Dis_Continue);
+   vassert(dres->len         == 0);
+   vassert(dres->continueAt  == 0);
+   vassert(dres->jk_StopHere == Ijk_INVALID);
+   dres->whatNext    = Dis_StopHere;
+   dres->jk_StopHere = kind;
+   stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
 }
 
 static 
-void jcc_01( X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
+void jcc_01( /*MOD*/DisResult* dres,
+             X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
 {
    Bool        invert;
    X86Condcode condPos;
+   vassert(dres->whatNext    == Dis_Continue);
+   vassert(dres->len         == 0);
+   vassert(dres->continueAt  == 0);
+   vassert(dres->jk_StopHere == Ijk_INVALID);
+   dres->whatNext    = Dis_StopHere;
+   dres->jk_StopHere = Ijk_Boring;
    condPos = positiveIse_X86Condcode ( cond, &invert );
    if (invert) {
       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
                          Ijk_Boring,
-                         IRConst_U32(d32_false) ) );
-      irsb->next     = mkU32(d32_true);
-      irsb->jumpkind = Ijk_Boring;
+                         IRConst_U32(d32_false),
+                         OFFB_EIP ) );
+      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
    } else {
       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
                          Ijk_Boring,
-                         IRConst_U32(d32_true) ) );
-      irsb->next     = mkU32(d32_false);
-      irsb->jumpkind = Ijk_Boring;
+                         IRConst_U32(d32_true),
+                         OFFB_EIP ) );
+      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
    }
 }
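Net effect on a conditional branch: one side exit plus an unconditional
guest-IP write, instead of an assignment to irsb->next.  For a non-inverted
condition the generated IR is, schematically (OFFB_EIP symbolic):

   if (<cond>) { PUT(OFFB_EIP) = d32_true:I32; exit-Boring }
   PUT(OFFB_EIP) = d32_false:I32
   /* bb_to_IR then ends the block with irsb->next = GET:I32(OFFB_EIP) */
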
 
@@ -1450,7 +1470,8 @@
       IRStmt_Exit(
          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
          Ijk_MapFail,
-         IRConst_U32( guest_EIP_curr_instr )
+         IRConst_U32( guest_EIP_curr_instr ),
+         OFFB_EIP
       )
    );
 
@@ -3009,7 +3030,7 @@
 /* Group 5 extended opcodes. */
 static
 UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta, 
-                DisResult* dres, Bool* decode_OK )
+                /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
 {
    Int     len;
    UChar   modrm;
@@ -3054,13 +3075,13 @@
             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
             putIReg(4, R_ESP, mkexpr(t2));
             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
-            jmp_treg(Ijk_Call,t1);
-            dres->whatNext = Dis_StopHere;
+            jmp_treg(dres, Ijk_Call, t1);
+            vassert(dres->whatNext == Dis_StopHere);
             break;
          case 4: /* jmp Ev */
             vassert(sz == 4);
-            jmp_treg(Ijk_Boring,t1);
-            dres->whatNext = Dis_StopHere;
+            jmp_treg(dres, Ijk_Boring, t1);
+            vassert(dres->whatNext == Dis_StopHere);
             break;
          case 6: /* PUSH Ev */
             vassert(sz == 4 || sz == 2);
@@ -3110,13 +3131,13 @@
             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
             putIReg(4, R_ESP, mkexpr(t2));
             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
-            jmp_treg(Ijk_Call,t1);
-            dres->whatNext = Dis_StopHere;
+            jmp_treg(dres, Ijk_Call, t1);
+            vassert(dres->whatNext == Dis_StopHere);
             break;
          case 4: /* JMP Ev */
             vassert(sz == 4);
-            jmp_treg(Ijk_Boring,t1);
-            dres->whatNext = Dis_StopHere;
+            jmp_treg(dres, Ijk_Boring, t1);
+            vassert(dres->whatNext == Dis_StopHere);
             break;
          case 6: /* PUSH Ev */
             vassert(sz == 4 || sz == 2);
@@ -3253,7 +3274,8 @@
    We assume the insn is the last one in the basic block, and so emit a jump
    to the next insn, rather than just falling through. */
 static 
-void dis_REP_op ( X86Condcode cond,
+void dis_REP_op ( /*MOD*/DisResult* dres,
+                  X86Condcode cond,
                   void (*dis_OP)(Int, IRTemp),
                   Int sz, Addr32 eip, Addr32 eip_next, HChar* name )
 {
@@ -3264,7 +3286,7 @@
 
    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
                       Ijk_Boring,
-                      IRConst_U32(eip_next) ) );
+                      IRConst_U32(eip_next), OFFB_EIP ) );
 
    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
 
@@ -3272,12 +3294,14 @@
    dis_OP (sz, t_inc);
 
    if (cond == X86CondAlways) {
-      jmp_lit(Ijk_Boring,eip);
+      jmp_lit(dres, Ijk_Boring, eip);
+      vassert(dres->whatNext == Dis_StopHere);
    } else {
       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
                          Ijk_Boring,
-                         IRConst_U32(eip) ) );
-      jmp_lit(Ijk_Boring,eip_next);
+                         IRConst_U32(eip), OFFB_EIP ) );
+      jmp_lit(dres, Ijk_Boring, eip_next);
+      vassert(dres->whatNext == Dis_StopHere);
    }
    DIP("%s%c\n", name, nameISize(sz));
 }
@@ -3958,7 +3982,8 @@
                   IRStmt_Exit(
                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                      Ijk_EmWarn,
-                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
+                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
+                     OFFB_EIP
                   )
                );
 
@@ -4000,7 +4025,8 @@
                   IRStmt_Exit(
                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                      Ijk_EmWarn,
-                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
+                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
+                     OFFB_EIP
                   )
                );
                break;
@@ -4948,7 +4974,8 @@
                   IRStmt_Exit(
                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                      Ijk_EmWarn,
-                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
+                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
+                     OFFB_EIP
                   )
                );
 
@@ -6811,13 +6838,15 @@
 }
 
 static
-void dis_ret ( UInt d32 )
+void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
 {
-   IRTemp t1 = newTemp(Ity_I32), t2 = newTemp(Ity_I32);
+   IRTemp t1 = newTemp(Ity_I32);
+   IRTemp t2 = newTemp(Ity_I32);
    assign(t1, getIReg(4,R_ESP));
    assign(t2, loadLE(Ity_I32,mkexpr(t1)));
    putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
-   jmp_treg(Ijk_Ret,t2);
+   jmp_treg(dres, Ijk_Ret, t2);
+   vassert(dres->whatNext == Dis_StopHere);
 }
 
 /*------------------------------------------------------------*/
@@ -7523,7 +7552,8 @@
                    binop(Iop_And32, mkexpr(t1), mkU32(1<<18)), 
                    mkU32(0) ),
             Ijk_EmWarn,
-            IRConst_U32( next_insn_EIP )
+            IRConst_U32( next_insn_EIP ),
+            OFFB_EIP
          )
       );
    }
@@ -7700,7 +7730,8 @@
                binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
                mkU32(0)),
          Ijk_SigSEGV,
-         IRConst_U32(guest_EIP_curr_instr)
+         IRConst_U32(guest_EIP_curr_instr),
+         OFFB_EIP
       )
    );
 }
@@ -7854,7 +7885,6 @@
 static
 DisResult disInstr_X86_WRK (
              /*OUT*/Bool* expect_CAS,
-             Bool         put_IP,
              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
              Bool         resteerCisOk,
              void*        callback_opaque,
@@ -7893,9 +7923,10 @@
    Bool pfx_lock = False;
 
    /* Set result defaults. */
-   dres.whatNext   = Dis_Continue;
-   dres.len        = 0;
-   dres.continueAt = 0;
+   dres.whatNext    = Dis_Continue;
+   dres.len         = 0;
+   dres.continueAt  = 0;
+   dres.jk_StopHere = Ijk_INVALID;
 
    *expect_CAS = False;
 
@@ -7904,10 +7935,6 @@
    vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
    DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
 
-   /* We may be asked to update the guest EIP before going further. */
-   if (put_IP)
-      stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr)) );
-
    /* Spot "Special" instructions (see comment at top of file). */
    {
       UChar* code = (UChar*)(guest_code + delta);
@@ -7926,8 +7953,8 @@
             /* %EDX = client_request ( %EAX ) */
             DIP("%%edx = client_request ( %%eax )\n");
             delta += 14;
-            jmp_lit(Ijk_ClientReq, guest_EIP_bbstart+delta);
-            dres.whatNext = Dis_StopHere;
+            jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
+            vassert(dres.whatNext == Dis_StopHere);
             goto decode_success;
          }
          else
@@ -7949,8 +7976,8 @@
             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
             putIReg(4, R_ESP, mkexpr(t2));
             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
-            jmp_treg(Ijk_NoRedir,t1);
-            dres.whatNext = Dis_StopHere;
+            jmp_treg(&dres, Ijk_NoRedir, t1);
+            vassert(dres.whatNext == Dis_StopHere);
             goto decode_success;
          }
          /* We don't know what it is. */
@@ -8537,7 +8564,8 @@
          IRStmt_Exit(
             binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
             Ijk_EmWarn,
-            IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
+            IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
+            OFFB_EIP
          )
       );
       goto decode_success;
@@ -11521,9 +11549,7 @@
 
       stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) );
 
-      irsb->jumpkind = Ijk_TInval;
-      irsb->next     = mkU32(guest_EIP_bbstart+delta);
-      dres.whatNext  = Dis_StopHere;
+      jmp_lit(&dres, Ijk_TInval, (Addr32)(guest_EIP_bbstart+delta));
 
       DIP("clflush %s\n", dis_buf);
       goto decode_success;
@@ -12729,7 +12755,8 @@
       stmt( IRStmt_Exit(
                binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
                Ijk_Boring,
-               IRConst_U32(d32)
+               IRConst_U32(d32),
+               OFFB_EIP
             ));
        DIP("jcxz 0x%x\n", d32);
        goto decode_success;
@@ -12752,13 +12779,11 @@
    case 0xC2: /* RET imm16 */
       d32 = getUDisp16(delta); 
       delta += 2;
-      dis_ret(d32);
-      dres.whatNext = Dis_StopHere;
+      dis_ret(&dres, d32);
       DIP("ret %d\n", (Int)d32);
       break;
    case 0xC3: /* RET */
-      dis_ret(0);
-      dres.whatNext = Dis_StopHere;
+      dis_ret(&dres, 0);
       DIP("ret\n");
       break;
 
@@ -12782,8 +12807,8 @@
       /* set %EFLAGS */
       set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
       /* goto new EIP value */
-      jmp_treg(Ijk_Ret,t2);
-      dres.whatNext = Dis_StopHere;
+      jmp_treg(&dres, Ijk_Ret, t2);
+      vassert(dres.whatNext == Dis_StopHere);
       DIP("iret (very kludgey)\n");
       break;
 
@@ -12815,8 +12840,8 @@
             dres.whatNext   = Dis_ResteerU;
             dres.continueAt = (Addr64)(Addr32)d32;
          } else {
-            jmp_lit(Ijk_Call,d32);
-            dres.whatNext = Dis_StopHere;
+            jmp_lit(&dres, Ijk_Call, d32);
+            vassert(dres.whatNext == Dis_StopHere);
          }
          DIP("call 0x%x\n",d32);
       }
@@ -13060,8 +13085,8 @@
    /* ------------------------ INT ------------------------ */
 
    case 0xCC: /* INT 3 */
-      jmp_lit(Ijk_SigTRAP,((Addr32)guest_EIP_bbstart)+delta);
-      dres.whatNext = Dis_StopHere;
+      jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
+      vassert(dres.whatNext == Dis_StopHere);
       DIP("int $0x3\n");
       break;
 
@@ -13082,8 +13107,8 @@
          This used to handle just 0x40-0x43; Jikes RVM uses a larger
          range (0x3F-0x49), and this allows some slack as well. */
       if (d32 >= 0x3F && d32 <= 0x4F) {
-         jmp_lit(Ijk_SigSEGV,((Addr32)guest_EIP_bbstart)+delta-2);
-         dres.whatNext = Dis_StopHere;
+         jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
+         vassert(dres.whatNext == Dis_StopHere);
          DIP("int $0x%x\n", (Int)d32);
          break;
       }
@@ -13095,24 +13120,24 @@
       if (d32 == 0x80) {
          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
                            mkU32(guest_EIP_curr_instr) ) );
-         jmp_lit(Ijk_Sys_int128,((Addr32)guest_EIP_bbstart)+delta);
-         dres.whatNext = Dis_StopHere;
+         jmp_lit(&dres, Ijk_Sys_int128, ((Addr32)guest_EIP_bbstart)+delta);
+         vassert(dres.whatNext == Dis_StopHere);
          DIP("int $0x80\n");
          break;
       }
       if (d32 == 0x81) {
          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
                            mkU32(guest_EIP_curr_instr) ) );
-         jmp_lit(Ijk_Sys_int129,((Addr32)guest_EIP_bbstart)+delta);
-         dres.whatNext = Dis_StopHere;
+         jmp_lit(&dres, Ijk_Sys_int129, ((Addr32)guest_EIP_bbstart)+delta);
+         vassert(dres.whatNext == Dis_StopHere);
          DIP("int $0x81\n");
          break;
       }
       if (d32 == 0x82) {
          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
                            mkU32(guest_EIP_curr_instr) ) );
-         jmp_lit(Ijk_Sys_int130,((Addr32)guest_EIP_bbstart)+delta);
-         dres.whatNext = Dis_StopHere;
+         jmp_lit(&dres, Ijk_Sys_int130, ((Addr32)guest_EIP_bbstart)+delta);
+         vassert(dres.whatNext == Dis_StopHere);
          DIP("int $0x82\n");
          break;
       }
@@ -13129,8 +13154,8 @@
          dres.whatNext   = Dis_ResteerU;
          dres.continueAt = (Addr64)(Addr32)d32;
       } else {
-         jmp_lit(Ijk_Boring,d32);
-         dres.whatNext = Dis_StopHere;
+         jmp_lit(&dres, Ijk_Boring, d32);
+         vassert(dres.whatNext == Dis_StopHere);
       }
       DIP("jmp-8 0x%x\n", d32);
       break;
@@ -13143,8 +13168,8 @@
          dres.whatNext   = Dis_ResteerU;
          dres.continueAt = (Addr64)(Addr32)d32;
       } else {
-         jmp_lit(Ijk_Boring,d32);
-         dres.whatNext = Dis_StopHere;
+         jmp_lit(&dres, Ijk_Boring, d32);
+         vassert(dres.whatNext == Dis_StopHere);
       }
       DIP("jmp 0x%x\n", d32);
       break;
@@ -13185,7 +13210,8 @@
          stmt( IRStmt_Exit( 
                   mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
                   Ijk_Boring,
-                  IRConst_U32(guest_EIP_bbstart+delta) ) );
+                  IRConst_U32(guest_EIP_bbstart+delta),
+                  OFFB_EIP ) );
          dres.whatNext   = Dis_ResteerC;
          dres.continueAt = (Addr64)(Addr32)d32;
          comment = "(assumed taken)";
@@ -13204,7 +13230,8 @@
          stmt( IRStmt_Exit( 
                   mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
                   Ijk_Boring,
-                  IRConst_U32(d32) ) );
+                  IRConst_U32(d32),
+                  OFFB_EIP ) );
          dres.whatNext   = Dis_ResteerC;
          dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
          comment = "(assumed not taken)";
@@ -13212,9 +13239,9 @@
       else {
          /* Conservative default translation - end the block at this
             point. */
-         jcc_01( (X86Condcode)(opc - 0x70), 
+         jcc_01( &dres, (X86Condcode)(opc - 0x70), 
                  (Addr32)(guest_EIP_bbstart+delta), d32);
-         dres.whatNext = Dis_StopHere;
+         vassert(dres.whatNext == Dis_StopHere);
       }
       DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
       break;
@@ -13227,7 +13254,8 @@
       stmt( IRStmt_Exit(
                binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
             Ijk_Boring,
-            IRConst_U32(d32)
+            IRConst_U32(d32),
+            OFFB_EIP
           ));
       DIP("jecxz 0x%x\n", d32);
       break;
@@ -13268,7 +13296,7 @@
          default:
 	    vassert(0);
       }
-      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32)) );
+      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );
 
       DIP("loop%s 0x%x\n", xtra, d32);
       break;
@@ -13948,33 +13976,32 @@
       abyte = getIByte(delta); delta++;
 
       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
-      dres.whatNext = Dis_StopHere;         
 
       switch (abyte) {
       /* According to the Intel manual, "repne movs" should never occur, but
        * in practice it has happened, so allow for it here... */
       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
       case 0xA5: 
-         dis_REP_op ( X86CondNZ, dis_MOVS, sz, eip_orig,
-                                 guest_EIP_bbstart+delta, "repne movs" );
+         dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
+                             guest_EIP_bbstart+delta, "repne movs" );
          break;
 
       case 0xA6: sz = 1;   /* REPNE CMP<sz> */
       case 0xA7:
-         dis_REP_op ( X86CondNZ, dis_CMPS, sz, eip_orig, 
-                                 guest_EIP_bbstart+delta, "repne cmps" );
+         dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig, 
+                             guest_EIP_bbstart+delta, "repne cmps" );
          break;
 
       case 0xAA: sz = 1;   /* REPNE STOS<sz> */
       case 0xAB:
-         dis_REP_op ( X86CondNZ, dis_STOS, sz, eip_orig, 
-                                 guest_EIP_bbstart+delta, "repne stos" );
+         dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig, 
+                             guest_EIP_bbstart+delta, "repne stos" );
          break;
 
       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
       case 0xAF:
-         dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
-                                 guest_EIP_bbstart+delta, "repne scas" );
+         dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
+                             guest_EIP_bbstart+delta, "repne scas" );
          break;
 
       default:
@@ -13991,37 +14018,36 @@
       abyte = getIByte(delta); delta++;
 
       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
-      dres.whatNext = Dis_StopHere;
 
       switch (abyte) {
       case 0xA4: sz = 1;   /* REP MOVS<sz> */
       case 0xA5:
-         dis_REP_op ( X86CondAlways, dis_MOVS, sz, eip_orig, 
-                                     guest_EIP_bbstart+delta, "rep movs" );
+         dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig, 
+                             guest_EIP_bbstart+delta, "rep movs" );
          break;
 
       case 0xA6: sz = 1;   /* REPE CMP<sz> */
       case 0xA7:
-         dis_REP_op ( X86CondZ, dis_CMPS, sz, eip_orig, 
-                                guest_EIP_bbstart+delta, "repe cmps" );
+         dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig, 
+                             guest_EIP_bbstart+delta, "repe cmps" );
          break;
 
       case 0xAA: sz = 1;   /* REP STOS<sz> */
       case 0xAB:
-         dis_REP_op ( X86CondAlways, dis_STOS, sz, eip_orig, 
-                                     guest_EIP_bbstart+delta, "rep stos" );
+         dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig, 
+                             guest_EIP_bbstart+delta, "rep stos" );
          break;
 
       case 0xAC: sz = 1;   /* REP LODS<sz> */
       case 0xAD:
-         dis_REP_op ( X86CondAlways, dis_LODS, sz, eip_orig, 
-                                     guest_EIP_bbstart+delta, "rep lods" );
+         dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig, 
+                             guest_EIP_bbstart+delta, "rep lods" );
          break;
 
       case 0xAE: sz = 1;   /* REPE SCAS<sz> */
       case 0xAF: 
-         dis_REP_op ( X86CondZ, dis_SCAS, sz, eip_orig, 
-                                guest_EIP_bbstart+delta, "repe scas" );
+         dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig, 
+                             guest_EIP_bbstart+delta, "repe scas" );
          break;
       
       case 0x90:           /* REP NOP (PAUSE) */
@@ -14029,13 +14055,12 @@
          DIP("rep nop (P4 pause)\n");
          /* "observe" the hint.  The Vex client needs to be careful not
             to cause very long delays as a result, though. */
-         jmp_lit(Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
-         dres.whatNext = Dis_StopHere;
+         jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
+         vassert(dres.whatNext == Dis_StopHere);
          break;
 
       case 0xC3:           /* REP RET -- same as normal ret? */
-         dis_ret(0);
-         dres.whatNext = Dis_StopHere;
+         dis_ret(&dres, 0);
          DIP("rep ret\n");
          break;
 
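+      /* Note on the dis_REP_op changes above: the call sites no longer
+         force Dis_StopHere themselves; &dres is threaded through so the
+         callee can end the block itself, along the lines of (sketch --
+         the callee's body is outside this hunk, so details here are
+         illustrative):
+
+            static void dis_REP_op ( DisResult* dres, ...other params... )
+            {
+               // ... emit the guarded string op and counter update ...
+               jmp_lit(dres, Ijk_Boring, eip_next);
+               // now dres->whatNext == Dis_StopHere, as the vasserts
+               // at the other rewritten call sites expect
+            }
+      */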
@@ -14741,7 +14766,8 @@
                      mk_x86g_calculate_condition((X86Condcode)
                                                  (1 ^ (opc - 0x80))),
                      Ijk_Boring,
-                     IRConst_U32(guest_EIP_bbstart+delta) ) );
+                     IRConst_U32(guest_EIP_bbstart+delta),
+                     OFFB_EIP ) );
             dres.whatNext   = Dis_ResteerC;
             dres.continueAt = (Addr64)(Addr32)d32;
             comment = "(assumed taken)";
@@ -14760,7 +14786,8 @@
             stmt( IRStmt_Exit( 
                      mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
                      Ijk_Boring,
-                     IRConst_U32(d32) ) );
+                     IRConst_U32(d32),
+                     OFFB_EIP ) );
             dres.whatNext   = Dis_ResteerC;
             dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
             comment = "(assumed not taken)";
@@ -14768,9 +14795,9 @@
          else {
             /* Conservative default translation - end the block at
                this point. */
-            jcc_01( (X86Condcode)(opc - 0x80), 
+            jcc_01( &dres, (X86Condcode)(opc - 0x80), 
                     (Addr32)(guest_EIP_bbstart+delta), d32);
-            dres.whatNext = Dis_StopHere;
+            vassert(dres.whatNext == Dis_StopHere);
          }
          DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
          break;
@@ -14896,8 +14923,8 @@
             point if the syscall needs to be restarted. */
          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
                            mkU32(guest_EIP_curr_instr) ) );
-         jmp_lit(Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
-         dres.whatNext = Dis_StopHere;
+         jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
+         vassert(dres.whatNext == Dis_StopHere);
          DIP("sysenter");
          break;
 
@@ -15073,8 +15100,8 @@
       insn, but nevertheless be paranoid and update it again right
       now. */
    stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
-   jmp_lit(Ijk_NoDecode, guest_EIP_curr_instr);
-   dres.whatNext = Dis_StopHere;
+   jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
+   vassert(dres.whatNext == Dis_StopHere);
    dres.len = 0;
    /* We also need to say that a CAS is not expected now, regardless
       of what it might have been set to at the start of the function,
@@ -15088,6 +15115,20 @@
 
   decode_success:
    /* All decode successes end up here. */
+   switch (dres.whatNext) {
+      case Dis_Continue:
+         stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
+         break;
+      case Dis_ResteerU:
+      case Dis_ResteerC:
+         stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
+         break;
+      case Dis_StopHere:
+         break;
+      default:
+         vassert(0);
+   }
+
    DIP("\n");
    dres.len = delta - delta_start;
    return dres;
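+   /* Note on the epilogue above: under the chaining scheme every path
+      out of the decoder must leave the continuation address in the
+      guest state -- compare the Put(OFFB_EIP,..) on the NoDecode path
+      and the amRIP operands of the new XDirect/XIndir/XAssisted insns
+      in the backend.  For Dis_Continue that is the single statement
+         stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
+      while Dis_StopHere relies on jmp_lit/jcc_01 having already done
+      the equivalent Put. */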
@@ -15105,7 +15146,6 @@
    is located in host memory at &guest_code[delta]. */
 
 DisResult disInstr_X86 ( IRSB*        irsb_IN,
-                         Bool         put_IP,
                          Bool         (*resteerOkFn) ( void*, Addr64 ),
                          Bool         resteerCisOk,
                          void*        callback_opaque,
@@ -15131,7 +15171,7 @@
 
    x1 = irsb_IN->stmts_used;
    expect_CAS = False;
-   dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
+   dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
                              resteerCisOk,
                              callback_opaque,
                              delta, archinfo, abiinfo );
@@ -15151,7 +15191,7 @@
       /* inconsistency detected.  re-disassemble the instruction so as
          to generate a useful error message; then assert. */
       vex_traceflags |= VEX_TRACE_FE;
-      dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
+      dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
                                 resteerCisOk,
                                 callback_opaque,
                                 delta, archinfo, abiinfo );
diff --git a/priv/host_amd64_defs.c b/priv/host_amd64_defs.c
index 8b97772..479a0c5 100644
--- a/priv/host_amd64_defs.c
+++ b/priv/host_amd64_defs.c
@@ -118,13 +118,6 @@
 HReg hregAMD64_R14 ( void ) { return mkHReg(14, HRcInt64, False); }
 HReg hregAMD64_R15 ( void ) { return mkHReg(15, HRcInt64, False); }
 
-//.. HReg hregAMD64_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
-//.. HReg hregAMD64_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
-//.. HReg hregAMD64_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
-//.. HReg hregAMD64_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
-//.. HReg hregAMD64_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
-//.. HReg hregAMD64_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }
-//.. 
 HReg hregAMD64_XMM0  ( void ) { return mkHReg( 0, HRcVec128, False); }
 HReg hregAMD64_XMM1  ( void ) { return mkHReg( 1, HRcVec128, False); }
 HReg hregAMD64_XMM2  ( void ) { return mkHReg( 2, HRcVec128, False); }
@@ -231,18 +224,6 @@
    return am;
 }
 
-//.. AMD64AMode* dopyAMD64AMode ( AMD64AMode* am ) {
-//..    switch (am->tag) {
-//..       case Xam_IR: 
-//..          return AMD64AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
-//..       case Xam_IRRS: 
-//..          return AMD64AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base, 
-//..                                am->Xam.IRRS.index, am->Xam.IRRS.shift );
-//..       default:
-//..          vpanic("dopyAMD64AMode");
-//..    }
-//.. }
-
 void ppAMD64AMode ( AMD64AMode* am ) {
    switch (am->tag) {
       case Aam_IR: 
@@ -538,10 +519,6 @@
 
 HChar* showA87FpOp ( A87FpOp op ) {
    switch (op) {
-//..       case Xfp_ADD:    return "add";
-//..       case Xfp_SUB:    return "sub";
-//..       case Xfp_MUL:    return "mul";
-//..       case Xfp_DIV:    return "div";
       case Afp_SCALE:  return "scale";
       case Afp_ATAN:   return "atan";
       case Afp_YL2X:   return "yl2x";
@@ -549,9 +526,6 @@
       case Afp_PREM:   return "prem";
       case Afp_PREM1:  return "prem1";
       case Afp_SQRT:   return "sqrt";
-//..       case Xfp_ABS:    return "abs";
-//..       case Xfp_NEG:    return "chs";
-//..       case Xfp_MOV:    return "mov";
       case Afp_SIN:    return "sin";
       case Afp_COS:    return "cos";
       case Afp_TAN:    return "tan";
@@ -717,16 +691,6 @@
    vassert(sz == 4 || sz == 8);
    return i;
 }
-//.. AMD64Instr* AMD64Instr_Sh3232  ( AMD64ShiftOp op, UInt amt, HReg src, HReg dst ) {
-//..    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag            = Xin_Sh3232;
-//..    i->Xin.Sh3232.op  = op;
-//..    i->Xin.Sh3232.amt = amt;
-//..    i->Xin.Sh3232.src = src;
-//..    i->Xin.Sh3232.dst = dst;
-//..    vassert(op == Xsh_SHL || op == Xsh_SHR);
-//..    return i;
-//.. }
 AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
    AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
    i->tag          = Ain_Push;
@@ -742,14 +706,37 @@
    vassert(regparms >= 0 && regparms <= 6);
    return i;
 }
-AMD64Instr* AMD64Instr_Goto ( IRJumpKind jk, AMD64CondCode cond, AMD64RI* dst ) {
-   AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
-   i->tag           = Ain_Goto;
-   i->Ain.Goto.cond = cond;
-   i->Ain.Goto.dst  = dst;
-   i->Ain.Goto.jk   = jk;
+
+AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
+                                 AMD64CondCode cond, Bool toFastEP ) {
+   AMD64Instr* i           = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag                  = Ain_XDirect;
+   i->Ain.XDirect.dstGA    = dstGA;
+   i->Ain.XDirect.amRIP    = amRIP;
+   i->Ain.XDirect.cond     = cond;
+   i->Ain.XDirect.toFastEP = toFastEP;
    return i;
 }
+AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
+                                AMD64CondCode cond ) {
+   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag              = Ain_XIndir;
+   i->Ain.XIndir.dstGA = dstGA;
+   i->Ain.XIndir.amRIP = amRIP;
+   i->Ain.XIndir.cond  = cond;
+   return i;
+}
+AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
+                                   AMD64CondCode cond, IRJumpKind jk ) {
+   AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag                 = Ain_XAssisted;
+   i->Ain.XAssisted.dstGA = dstGA;
+   i->Ain.XAssisted.amRIP = amRIP;
+   i->Ain.XAssisted.cond  = cond;
+   i->Ain.XAssisted.jk    = jk;
+   return i;
+}
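+/* Illustrative only: a block-ending direct transfer built with the new
+   constructors.  The amode offset and guest address are made up, and
+   AMD64AMode_IR / hregAMD64_RBP are assumed from this backend:
+
+      AMD64AMode* amRIP = AMD64AMode_IR(184, hregAMD64_RBP());
+      AMD64Instr* j     = AMD64Instr_XDirect(0x401000ULL, amRIP,
+                                             Acc_ALWAYS, True);
+
+   XIndir is the same shape with the target in a register; XAssisted
+   additionally carries the IRJumpKind for the dispatcher to service. */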
+
 AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, AMD64RM* src, HReg dst ) {
    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    i->tag             = Ain_CMov64;
@@ -863,72 +850,12 @@
    i->Ain.A87StSW.addr = addr;
    return i;
 }
-
-//.. AMD64Instr* AMD64Instr_FpUnary ( AMD64FpOp op, HReg src, HReg dst ) {
-//..    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag             = Xin_FpUnary;
-//..    i->Xin.FpUnary.op  = op;
-//..    i->Xin.FpUnary.src = src;
-//..    i->Xin.FpUnary.dst = dst;
-//..    return i;
-//.. }
-//.. AMD64Instr* AMD64Instr_FpBinary ( AMD64FpOp op, HReg srcL, HReg srcR, HReg dst ) {
-//..    AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag               = Xin_FpBinary;
-//..    i->Xin.FpBinary.op   = op;
-//..    i->Xin.FpBinary.srcL = srcL;
-//..    i->Xin.FpBinary.srcR = srcR;
-//..    i->Xin.FpBinary.dst  = dst;
-//..    return i;
-//.. }
-//.. AMD64Instr* AMD64Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, AMD64AMode* addr ) {
-//..    AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag               = Xin_FpLdSt;
-//..    i->Xin.FpLdSt.isLoad = isLoad;
-//..    i->Xin.FpLdSt.sz     = sz;
-//..    i->Xin.FpLdSt.reg    = reg;
-//..    i->Xin.FpLdSt.addr   = addr;
-//..    vassert(sz == 4 || sz == 8);
-//..    return i;
-//.. }
-//.. AMD64Instr* AMD64Instr_FpLdStI ( Bool isLoad, UChar sz,  
-//..                              HReg reg, AMD64AMode* addr ) {
-//..    AMD64Instr* i           = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag                = Xin_FpLdStI;
-//..    i->Xin.FpLdStI.isLoad = isLoad;
-//..    i->Xin.FpLdStI.sz     = sz;
-//..    i->Xin.FpLdStI.reg    = reg;
-//..    i->Xin.FpLdStI.addr   = addr;
-//..    vassert(sz == 2 || sz == 4 || sz == 8);
-//..    return i;
-//.. }
-//.. AMD64Instr* AMD64Instr_Fp64to32 ( HReg src, HReg dst ) {
-//..    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag              = Xin_Fp64to32;
-//..    i->Xin.Fp64to32.src = src;
-//..    i->Xin.Fp64to32.dst = dst;
-//..    return i;
-//.. }
-//.. AMD64Instr* AMD64Instr_FpCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
-//..    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag             = Xin_FpCMov;
-//..    i->Xin.FpCMov.cond = cond;
-//..    i->Xin.FpCMov.src  = src;
-//..    i->Xin.FpCMov.dst  = dst;
-//..    vassert(cond != Xcc_ALWAYS);
-//..    return i;
-//.. }
 AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    i->tag                = Ain_LdMXCSR;
    i->Ain.LdMXCSR.addr   = addr;
    return i;
 }
-//.. AMD64Instr* AMD64Instr_FpStSW_AX ( void ) {
-//..    AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag      = Xin_FpStSW_AX;
-//..    return i;
-//.. }
 AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    i->tag                = Ain_SseUComIS;
@@ -970,15 +897,6 @@
    i->Ain.SseSDSS.dst    = dst;
    return i;
 }
-
-//.. AMD64Instr* AMD64Instr_SseConst ( UShort con, HReg dst ) {
-//..    AMD64Instr* i            = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag                 = Xin_SseConst;
-//..    i->Xin.SseConst.con    = con;
-//..    i->Xin.SseConst.dst    = dst;
-//..    vassert(hregClass(dst) == HRcVec128);
-//..    return i;
-//.. }
 AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz, 
                                  HReg reg, AMD64AMode* addr ) {
    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
@@ -1062,6 +980,19 @@
    vassert(order >= 0 && order <= 0xFF);
    return i;
 }
+AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
+                                 AMD64AMode* amFailAddr ) {
+   AMD64Instr* i             = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag                    = Ain_EvCheck;
+   i->Ain.EvCheck.amCounter  = amCounter;
+   i->Ain.EvCheck.amFailAddr = amFailAddr;
+   return i;
+}
+AMD64Instr* AMD64Instr_ProfInc ( void ) {
+   AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag        = Ain_ProfInc;
+   return i;
+}
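+/* Illustrative only, assuming the usual insn-selector context (addInstr,
+   env) and the baseblock layout used by the emitter below (counter at
+   offset 8, fail address at offset 0 from the baseblock pointer):
+
+      AMD64AMode* amCounter  = AMD64AMode_IR(8, hregAMD64_RBP());
+      AMD64AMode* amFailAddr = AMD64AMode_IR(0, hregAMD64_RBP());
+      addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
+*/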
 
 void ppAMD64Instr ( AMD64Instr* i, Bool mode64 ) 
 {
@@ -1121,16 +1052,6 @@
                     showAMD64ScalarSz(i->Ain.Div.sz));
          ppAMD64RM(i->Ain.Div.src);
          return;
-//..       case Xin_Sh3232:
-//..          vex_printf("%sdl ", showAMD64ShiftOp(i->Xin.Sh3232.op));
-//..          if (i->Xin.Sh3232.amt == 0)
-//..            vex_printf(" %%cl,"); 
-//..          else 
-//..             vex_printf(" $%d,", i->Xin.Sh3232.amt);
-//..          ppHRegAMD64(i->Xin.Sh3232.src);
-//..          vex_printf(",");
-//..          ppHRegAMD64(i->Xin.Sh3232.dst);
-//..          return;
       case Ain_Push:
          vex_printf("pushq ");
          ppAMD64RMI(i->Ain.Push.src);
@@ -1142,25 +1063,41 @@
                     i->Ain.Call.regparms );
          vex_printf("0x%llx", i->Ain.Call.target);
          break;
-      case Ain_Goto:
-         if (i->Ain.Goto.cond != Acc_ALWAYS) {
-            vex_printf("if (%%rflags.%s) { ", 
-                       showAMD64CondCode(i->Ain.Goto.cond));
-         }
-         if (i->Ain.Goto.jk != Ijk_Boring
-             && i->Ain.Goto.jk != Ijk_Call
-             && i->Ain.Goto.jk != Ijk_Ret) {
-            vex_printf("movl $");
-            ppIRJumpKind(i->Ain.Goto.jk);
-            vex_printf(",%%ebp ; ");
-         }
-         vex_printf("movq ");
-         ppAMD64RI(i->Ain.Goto.dst);
-         vex_printf(",%%rax ; movabsq $dispatcher_addr,%%rdx ; jmp *%%rdx");
-         if (i->Ain.Goto.cond != Acc_ALWAYS) {
-            vex_printf(" }");
-         }
+
+      case Ain_XDirect:
+         vex_printf("(xDirect) ");
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.XDirect.cond));
+         vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
+         vex_printf("movq %%r11,");
+         ppAMD64AMode(i->Ain.XDirect.amRIP);
+         vex_printf("; ");
+         vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
+                    i->Ain.XDirect.toFastEP ? "fast" : "slow");
          return;
+      case Ain_XIndir:
+         vex_printf("(xIndir) ");
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.XIndir.cond));
+         vex_printf("movq ");
+         ppHRegAMD64(i->Ain.XIndir.dstGA);
+         vex_printf(",");
+         ppAMD64AMode(i->Ain.XIndir.amRIP);
+         vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
+         return;
+      case Ain_XAssisted:
+         vex_printf("(xAssisted) ");
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.XAssisted.cond));
+         vex_printf("movq ");
+         ppHRegAMD64(i->Ain.XAssisted.dstGA);
+         vex_printf(",");
+         ppAMD64AMode(i->Ain.XAssisted.amRIP);
+         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
+                    (Int)i->Ain.XAssisted.jk);
+         vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
+         return;
+
       case Ain_CMov64:
          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
          ppAMD64RM(i->Ain.CMov64.src);
@@ -1241,67 +1178,6 @@
          vex_printf("fstsw ");
          ppAMD64AMode(i->Ain.A87StSW.addr);
          break;
-//..       case Xin_FpUnary:
-//..          vex_printf("g%sD ", showAMD64FpOp(i->Xin.FpUnary.op));
-//..          ppHRegAMD64(i->Xin.FpUnary.src);
-//..          vex_printf(",");
-//..          ppHRegAMD64(i->Xin.FpUnary.dst);
-//..          break;
-//..       case Xin_FpBinary:
-//..          vex_printf("g%sD ", showAMD64FpOp(i->Xin.FpBinary.op));
-//..          ppHRegAMD64(i->Xin.FpBinary.srcL);
-//..          vex_printf(",");
-//..          ppHRegAMD64(i->Xin.FpBinary.srcR);
-//..          vex_printf(",");
-//..          ppHRegAMD64(i->Xin.FpBinary.dst);
-//..          break;
-//..       case Xin_FpLdSt:
-//..          if (i->Xin.FpLdSt.isLoad) {
-//..             vex_printf("gld%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
-//..             ppAMD64AMode(i->Xin.FpLdSt.addr);
-//..             vex_printf(", ");
-//..             ppHRegAMD64(i->Xin.FpLdSt.reg);
-//..          } else {
-//..             vex_printf("gst%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
-//..             ppHRegAMD64(i->Xin.FpLdSt.reg);
-//..             vex_printf(", ");
-//..             ppAMD64AMode(i->Xin.FpLdSt.addr);
-//..          }
-//..          return;
-//..       case Xin_FpLdStI:
-//..          if (i->Xin.FpLdStI.isLoad) {
-//..             vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" : 
-//..                                   i->Xin.FpLdStI.sz==4 ? "l" : "w");
-//..             ppAMD64AMode(i->Xin.FpLdStI.addr);
-//..             vex_printf(", ");
-//..             ppHRegAMD64(i->Xin.FpLdStI.reg);
-//..          } else {
-//..             vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" : 
-//..                                   i->Xin.FpLdStI.sz==4 ? "l" : "w");
-//..             ppHRegAMD64(i->Xin.FpLdStI.reg);
-//..             vex_printf(", ");
-//..             ppAMD64AMode(i->Xin.FpLdStI.addr);
-//..          }
-//..          return;
-//..       case Xin_Fp64to32:
-//..          vex_printf("gdtof ");
-//..          ppHRegAMD64(i->Xin.Fp64to32.src);
-//..          vex_printf(",");
-//..          ppHRegAMD64(i->Xin.Fp64to32.dst);
-//..          return;
-//..       case Xin_FpCMov:
-//..          vex_printf("gcmov%s ", showAMD64CondCode(i->Xin.FpCMov.cond));
-//..          ppHRegAMD64(i->Xin.FpCMov.src);
-//..          vex_printf(",");
-//..          ppHRegAMD64(i->Xin.FpCMov.dst);
-//..          return;
-//..       case Xin_FpLdStCW:
-//..          vex_printf(i->Xin.FpLdStCW.isLoad ? "fldcw " : "fstcw ");
-//..          ppAMD64AMode(i->Xin.FpLdStCW.addr);
-//..          return;
-//..       case Xin_FpStSW_AX:
-//..          vex_printf("fstsw %%ax");
-//..          return;
       case Ain_LdMXCSR:
          vex_printf("ldmxcsr ");
          ppAMD64AMode(i->Ain.LdMXCSR.addr);
@@ -1334,10 +1210,6 @@
          vex_printf(",");
          ppHRegAMD64(i->Ain.SseSDSS.dst);
          break;
-//..       case Xin_SseConst:
-//..          vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
-//..          ppHRegAMD64(i->Xin.SseConst.dst);
-//..          break;
       case Ain_SseLdSt:
          switch (i->Ain.SseLdSt.sz) {
             case 4:  vex_printf("movss "); break;
@@ -1403,7 +1275,16 @@
          vex_printf(",");
          ppHRegAMD64(i->Ain.SseShuf.dst);
          return;
-
+      case Ain_EvCheck:
+         vex_printf("(evCheck) decl ");
+         ppAMD64AMode(i->Ain.EvCheck.amCounter);
+         vex_printf("; jns nofail; jmp *");
+         ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
+         vex_printf("; nofail:");
+         return;
+      case Ain_ProfInc:
+         vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
+         return;
       default:
          vpanic("ppAMD64Instr");
    }
@@ -1470,12 +1351,6 @@
          addHRegUse(u, HRmModify, hregAMD64_RAX());
          addHRegUse(u, HRmModify, hregAMD64_RDX());
          return;
-//..       case Xin_Sh3232:
-//..          addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
-//..          addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
-//..          if (i->Xin.Sh3232.amt == 0)
-//..             addHRegUse(u, HRmRead, hregAMD64_ECX());
-//..          return;
       case Ain_Push:
          addRegUsage_AMD64RMI(u, i->Ain.Push.src);
          addHRegUse(u, HRmModify, hregAMD64_RSP());
@@ -1533,16 +1408,25 @@
          /* Upshot of this is that the assembler really must use r11,
             and no other, as a destination temporary. */
          return;
-      case Ain_Goto:
-         addRegUsage_AMD64RI(u, i->Ain.Goto.dst);
-         addHRegUse(u, HRmWrite, hregAMD64_RAX()); /* used for next guest addr */
-         addHRegUse(u, HRmWrite, hregAMD64_RDX()); /* used for dispatcher addr */
-         if (i->Ain.Goto.jk != Ijk_Boring
-             && i->Ain.Goto.jk != Ijk_Call
-             && i->Ain.Goto.jk != Ijk_Ret)
-            /* note, this is irrelevant since rbp is not actually
-               available to the allocator.  But still .. */
-            addHRegUse(u, HRmWrite, hregAMD64_RBP());
+      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
+         conditionally exit the block.  Hence we only need to list (1)
+         the registers that they read, and (2) the registers that they
+         write in the case where the block is not exited.  (2) is
+         empty, hence only (1) is relevant here. */
+      case Ain_XDirect:
+         /* Don't bother to mention the write to %r11, since it is not
+            available to the allocator. */
+         addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
+         return;
+      case Ain_XIndir:
+         /* Ditto re %r11 */
+         addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
+         addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
+         return;
+      case Ain_XAssisted:
+         /* Ditto re %r11 and %rbp (the baseblock ptr) */
+         addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
+         addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
          return;
       case Ain_CMov64:
          addRegUsage_AMD64RM(u, i->Ain.CMov64.src, HRmRead);
@@ -1594,39 +1478,9 @@
       case Ain_A87StSW:
          addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
          return;
-//..       case Xin_FpUnary:
-//..          addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
-//..          addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
-//..          return;
-//..       case Xin_FpBinary:
-//..          addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
-//..          addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
-//..          addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
-//..          return;
-//..       case Xin_FpLdSt:
-//..          addRegUsage_AMD64AMode(u, i->Xin.FpLdSt.addr);
-//..          addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
-//..                        i->Xin.FpLdSt.reg);
-//..          return;
-//..       case Xin_FpLdStI:
-//..          addRegUsage_AMD64AMode(u, i->Xin.FpLdStI.addr);
-//..          addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
-//..                        i->Xin.FpLdStI.reg);
-//..          return;
-//..       case Xin_Fp64to32:
-//..          addHRegUse(u, HRmRead,  i->Xin.Fp64to32.src);
-//..          addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
-//..          return;
-//..       case Xin_FpCMov:
-//..          addHRegUse(u, HRmRead,   i->Xin.FpCMov.src);
-//..          addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
-//..          return;
       case Ain_LdMXCSR:
          addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
          return;
-//..       case Xin_FpStSW_AX:
-//..          addHRegUse(u, HRmWrite, hregAMD64_EAX());
-//..          return;
       case Ain_SseUComIS:
          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
@@ -1653,9 +1507,6 @@
          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
          return;
-//..       case Xin_SseConst:
-//..          addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
-//..          return;
       case Ain_Sse32Fx4:
          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
@@ -1716,6 +1567,15 @@
          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
          addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
          return;
+      case Ain_EvCheck:
+         /* We expect both amodes only to mention %rbp, so this is in
+            fact pointless, since %rbp isn't allocatable, but anyway.. */
+         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
+         addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
+         return;
+      case Ain_ProfInc:
+         addHRegUse(u, HRmWrite, hregAMD64_R11());
+         return;
       default:
          ppAMD64Instr(i, mode64);
          vpanic("getRegUsage_AMD64Instr");
@@ -1766,17 +1626,21 @@
       case Ain_Div:
          mapRegs_AMD64RM(m, i->Ain.Div.src);
          return;
-//..       case Xin_Sh3232:
-//..          mapReg(m, &i->Xin.Sh3232.src);
-//..          mapReg(m, &i->Xin.Sh3232.dst);
-//..          return;
       case Ain_Push:
          mapRegs_AMD64RMI(m, i->Ain.Push.src);
          return;
       case Ain_Call:
          return;
-      case Ain_Goto:
-         mapRegs_AMD64RI(m, i->Ain.Goto.dst);
+      case Ain_XDirect:
+         mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
+         return;
+      case Ain_XIndir:
+         mapReg(m, &i->Ain.XIndir.dstGA);
+         mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
+         return;
+      case Ain_XAssisted:
+         mapReg(m, &i->Ain.XAssisted.dstGA);
+         mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
          return;
       case Ain_CMov64:
          mapRegs_AMD64RM(m, i->Ain.CMov64.src);
@@ -1822,36 +1686,9 @@
       case Ain_A87StSW:
          mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
          return;
-//..       case Xin_FpUnary:
-//..          mapReg(m, &i->Xin.FpUnary.src);
-//..          mapReg(m, &i->Xin.FpUnary.dst);
-//..          return;
-//..       case Xin_FpBinary:
-//..          mapReg(m, &i->Xin.FpBinary.srcL);
-//..          mapReg(m, &i->Xin.FpBinary.srcR);
-//..          mapReg(m, &i->Xin.FpBinary.dst);
-//..          return;
-//..       case Xin_FpLdSt:
-//..          mapRegs_AMD64AMode(m, i->Xin.FpLdSt.addr);
-//..          mapReg(m, &i->Xin.FpLdSt.reg);
-//..          return;
-//..       case Xin_FpLdStI:
-//..          mapRegs_AMD64AMode(m, i->Xin.FpLdStI.addr);
-//..          mapReg(m, &i->Xin.FpLdStI.reg);
-//..          return;
-//..       case Xin_Fp64to32:
-//..          mapReg(m, &i->Xin.Fp64to32.src);
-//..          mapReg(m, &i->Xin.Fp64to32.dst);
-//..          return;
-//..       case Xin_FpCMov:
-//..          mapReg(m, &i->Xin.FpCMov.src);
-//..          mapReg(m, &i->Xin.FpCMov.dst);
-//..          return;
       case Ain_LdMXCSR:
          mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
          return;
-//..       case Xin_FpStSW_AX:
-//..          return;
       case Ain_SseUComIS:
          mapReg(m, &i->Ain.SseUComIS.srcL);
          mapReg(m, &i->Ain.SseUComIS.srcR);
@@ -1869,9 +1706,6 @@
          mapReg(m, &i->Ain.SseSDSS.src);
          mapReg(m, &i->Ain.SseSDSS.dst);
          return;
-//..       case Xin_SseConst:
-//..          mapReg(m, &i->Xin.SseConst.dst);
-//..          return;
       case Ain_SseLdSt:
          mapReg(m, &i->Ain.SseLdSt.reg);
          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
@@ -1908,6 +1742,15 @@
          mapReg(m, &i->Ain.SseShuf.src);
          mapReg(m, &i->Ain.SseShuf.dst);
          return;
+      case Ain_EvCheck:
+         /* We expect both amodes only to mention %rbp, so this is in
+            fact pointless, since %rbp isn't allocatable, but anyway.. */
+         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
+         mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
+         return;
+      case Ain_ProfInc:
+         /* hardwires r11 -- nothing to modify. */
+         return;
       default:
          ppAMD64Instr(i, mode64);
          vpanic("mapRegs_AMD64Instr");
@@ -2252,101 +2095,19 @@
    return p;
 }
 
-//.. /* Emit fstp %st(i), 1 <= i <= 7 */
-//.. static UChar* do_fstp_st ( UChar* p, Int i )
-//.. {
-//..    vassert(1 <= i && i <= 7);
-//..    *p++ = 0xDD;
-//..    *p++ = 0xD8+i;
-//..    return p;
-//.. }
-//.. 
-//.. /* Emit fld %st(i), 0 <= i <= 6 */
-//.. static UChar* do_fld_st ( UChar* p, Int i )
-//.. {
-//..    vassert(0 <= i && i <= 6);
-//..    *p++ = 0xD9;
-//..    *p++ = 0xC0+i;
-//..    return p;
-//.. }
-//.. 
-//.. /* Emit f<op> %st(0) */
-//.. static UChar* do_fop1_st ( UChar* p, AMD64FpOp op )
-//.. {
-//..    switch (op) {
-//..       case Xfp_NEG:    *p++ = 0xD9; *p++ = 0xE0; break;
-//..       case Xfp_ABS:    *p++ = 0xD9; *p++ = 0xE1; break;
-//..       case Xfp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
-//..       case Xfp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
-//..       case Xfp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
-//..       case Xfp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
-//..       case Xfp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
-//..       case Xfp_MOV:    break;
-//..       case Xfp_TAN:    p = do_ffree_st7(p); /* since fptan pushes 1.0 */
-//..                        *p++ = 0xD9; *p++ = 0xF2; /* fptan */
-//..                        *p++ = 0xD9; *p++ = 0xF7; /* fincstp */
-//..                        break;
-//..       default: vpanic("do_fop1_st: unknown op");
-//..    }
-//..    return p;
-//.. }
-//.. 
-//.. /* Emit f<op> %st(i), 1 <= i <= 5 */
-//.. static UChar* do_fop2_st ( UChar* p, AMD64FpOp op, Int i )
-//.. {
-//.. #  define fake(_n) mkHReg((_n), HRcInt32, False)
-//..    Int subopc;
-//..    switch (op) {
-//..       case Xfp_ADD: subopc = 0; break;
-//..       case Xfp_SUB: subopc = 4; break;
-//..       case Xfp_MUL: subopc = 1; break;
-//..       case Xfp_DIV: subopc = 6; break;
-//..       default: vpanic("do_fop2_st: unknown op");
-//..    }
-//..    *p++ = 0xD8;
-//..    p    = doAMode_R(p, fake(subopc), fake(i));
-//..    return p;
-//.. #  undef fake
-//.. }
-//.. 
-//.. /* Push a 32-bit word on the stack.  The word depends on tags[3:0];
-//.. each byte is either 0x00 or 0xFF depending on the corresponding bit in tags[].
-//.. */
-//.. static UChar* push_word_from_tags ( UChar* p, UShort tags )
-//.. {
-//..    UInt w;
-//..    vassert(0 == (tags & ~0xF));
-//..    if (tags == 0) {
-//..       /* pushl $0x00000000 */
-//..       *p++ = 0x6A;
-//..       *p++ = 0x00;
-//..    }
-//..    else 
-//..    /* pushl $0xFFFFFFFF */
-//..    if (tags == 0xF) {
-//..       *p++ = 0x6A;
-//..       *p++ = 0xFF;
-//..    } else {
-//..       vassert(0); /* awaiting test case */
-//..       w = 0;
-//..       if (tags & 1) w |= 0x000000FF;
-//..       if (tags & 2) w |= 0x0000FF00;
-//..       if (tags & 4) w |= 0x00FF0000;
-//..       if (tags & 8) w |= 0xFF000000;
-//..       *p++ = 0x68;
-//..       p = emit32(p, w);
-//..    }
-//..    return p;
-//.. }
-
 /* Emit an instruction into buf and return the number of bytes used.
    Note that buf is not the insn's final place, and therefore it is
-   imperative to emit position-independent code. */
+   imperative to emit position-independent code.  If the emitted
+   instruction was a profiler inc, set *is_profInc to True, else
+   leave it unchanged. */
 
-Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i, 
+Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
+                      UChar* buf, Int nbuf, AMD64Instr* i, 
                       Bool mode64,
-                      void* dispatch_unassisted,
-                      void* dispatch_assisted )
+                      void* disp_cp_chain_me_to_slowEP,
+                      void* disp_cp_chain_me_to_fastEP,
+                      void* disp_cp_xindir,
+                      void* disp_cp_xassisted )
 {
    UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
    UInt   xtra;
@@ -2545,35 +2306,6 @@
                goto bad;
          }
       }
-//..       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
-//..          allowed here. */
-//..       opc = subopc_imm = opc_imma = 0;
-//..       switch (i->Xin.Alu32M.op) {
-//..          case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
-//..          case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
-//..          default: goto bad;
-//..       }
-//..       switch (i->Xin.Alu32M.src->tag) {
-//..          case Xri_Reg:
-//..             *p++ = opc;
-//..             p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
-//..                              i->Xin.Alu32M.dst);
-//..             goto done;
-//..          case Xri_Imm:
-//..             if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
-//..                *p++ = 0x83;
-//..                p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
-//..                *p++ = 0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32;
-//..                goto done;
-//..             } else {
-//..                *p++ = 0x81;
-//..                p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
-//..                p    = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
-//..                goto done;
-//..             }
-//..          default: 
-//..             goto bad;
-//..       }
       break;
 
    case Ain_Sh64:
@@ -2756,21 +2488,6 @@
       }
       break;
 
-//..    case Xin_Sh3232:
-//..       vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
-//..       if (i->Xin.Sh3232.amt == 0) {
-//..          /* shldl/shrdl by %cl */
-//..          *p++ = 0x0F;
-//..          if (i->Xin.Sh3232.op == Xsh_SHL) {
-//..             *p++ = 0xA5;
-//..          } else {
-//..             *p++ = 0xAD;
-//..          }
-//..          p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
-//..          goto done;
-//..       }
-//..       break;
-
    case Ain_Push:
       switch (i->Ain.Push.src->tag) {
          case Armi_Mem: 
@@ -2822,117 +2539,167 @@
       goto done;
    }
 
-   case Ain_Goto: {
-      void* dispatch_to_use = NULL;
-      vassert(dispatch_unassisted != NULL);
-      vassert(dispatch_assisted != NULL);
+   case Ain_XDirect: {
+      /* NB: what goes on here has to be very closely coordinated with the
+         chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
+      /* We're generating chain-me requests here, so we need to be
+         sure this is actually allowed -- no-redir translations can't
+         use chain-me's.  Hence: */
+      vassert(disp_cp_chain_me_to_slowEP != NULL);
+      vassert(disp_cp_chain_me_to_fastEP != NULL);
+
+      HReg r11 = hregAMD64_R11();
 
       /* Use ptmp for backpatching conditional jumps. */
       ptmp = NULL;
 
       /* First off, if this is conditional, create a conditional
          jump over the rest of it. */
-      if (i->Ain.Goto.cond != Acc_ALWAYS) {
+      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
          /* jmp fwds if !condition */
-         *p++ = toUChar(0x70 + (i->Ain.Goto.cond ^ 1));
+         *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
          ptmp = p; /* fill in this bit later */
          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
       }
 
-      /* If a non-boring, set %rbp (the guest state pointer)
-         appropriately.  Since these numbers are all small positive
-         integers, we can get away with "movl $N, %ebp" rather than
-         the longer "movq $N, %rbp".  Also, decide which dispatcher we
-         need to use. */
-      dispatch_to_use = dispatch_assisted;
+      /* Update the guest RIP. */
+      /* movabsq $dstGA, %r11 */
+      *p++ = 0x49;
+      *p++ = 0xBB;
+      p = emit64(p, i->Ain.XDirect.dstGA);
+      /* movq %r11, amRIP */
+      *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
+      *p++ = 0x89;
+      p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
 
-      /* movl $magic_number, %ebp */
-      switch (i->Ain.Goto.jk) {
-         case Ijk_ClientReq: 
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_CLIENTREQ); break;
-         case Ijk_Sys_syscall: 
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SYS_SYSCALL); break;
-         case Ijk_Sys_int32: 
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SYS_INT32); break;
-         case Ijk_Yield: 
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_YIELD); break;
-         case Ijk_EmWarn:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_EMWARN); break;
-         case Ijk_MapFail:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_MAPFAIL); break;
-         case Ijk_NoDecode:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_NODECODE); break;
-         case Ijk_TInval:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_TINVAL); break;
-         case Ijk_NoRedir:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_NOREDIR); break;
-         case Ijk_SigTRAP:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SIGTRAP); break;
-         case Ijk_SigSEGV:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SIGSEGV); break;
-         case Ijk_Ret:
-         case Ijk_Call:
-         case Ijk_Boring:
-            dispatch_to_use = dispatch_unassisted;
-            break;
-         default: 
-            ppIRJumpKind(i->Ain.Goto.jk);
-            vpanic("emit_AMD64Instr.Ain_Goto: unknown jump kind");
-      }
-
-      /* Get the destination address into %rax */
-      if (i->Ain.Goto.dst->tag == Ari_Imm) {
-         /* movl sign-ext($immediate), %rax ; ret */
-         *p++ = 0x48;
-         *p++ = 0xC7;
-         *p++ = 0xC0;
-         p = emit32(p, i->Ain.Goto.dst->Ari.Imm.imm32);
-      } else {
-         vassert(i->Ain.Goto.dst->tag == Ari_Reg);
-         /* movq %reg, %rax ; ret */
-         if (i->Ain.Goto.dst->Ari.Reg.reg != hregAMD64_RAX()) {
-            *p++ = rexAMode_R(i->Ain.Goto.dst->Ari.Reg.reg, hregAMD64_RAX());
-            *p++ = 0x89;
-            p = doAMode_R(p, i->Ain.Goto.dst->Ari.Reg.reg, hregAMD64_RAX());
-         }
-      }
-
-      /* Get the dispatcher address into %rdx.  This has to happen
-         after the load of %rax since %rdx might be carrying the value
-         destined for %rax immediately prior to this Ain_Goto. */
-      vassert(sizeof(ULong) == sizeof(void*));
-
-      if (fitsIn32Bits(Ptr_to_ULong(dispatch_to_use))) {
-         /* movl sign-extend(imm32), %rdx */
-         *p++ = 0x48;
-         *p++ = 0xC7;
-         *p++ = 0xC2;
-         p = emit32(p, (UInt)Ptr_to_ULong(dispatch_to_use));
-      } else {
-         /* movabsq $imm64, %rdx */
-         *p++ = 0x48;
-         *p++ = 0xBA;
-         p = emit64(p, Ptr_to_ULong(dispatch_to_use));
-      }
-      /* jmp *%rdx */
+      /* --- FIRST PATCHABLE BYTE follows --- */
+      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
+         to) backs up the return address, so as to find the address of
+         the first patchable byte.  So: don't change the length of the
+         two instructions below. */
+      /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
+      *p++ = 0x49;
+      *p++ = 0xBB;
+      void* disp_cp_chain_me
+               = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP 
+                                         : disp_cp_chain_me_to_slowEP;
+      p = emit64(p, Ptr_to_ULong(disp_cp_chain_me));
+      /* call *%r11 */
+      *p++ = 0x41;
       *p++ = 0xFF;
-      *p++ = 0xE2;
+      *p++ = 0xD3;
+      /* --- END of PATCHABLE BYTES --- */
 
       /* Fix up the conditional jump, if there was one. */
-      if (i->Ain.Goto.cond != Acc_ALWAYS) {
+      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
          Int delta = p - ptmp;
-         vassert(delta > 0 && delta < 30);
+         vassert(delta > 0 && delta < 40);
+         *ptmp = toUChar(delta-1);
+      }
+      goto done;
+   }
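+   /* For reference, the unconditional XDirect case above lays down:
+
+         49 BB <imm64 = dstGA>     movabsq $dstGA, %r11
+         <rex> 89 <modrm,disp>     movq %r11, amRIP  (length depends
+                                                      on the amode)
+         49 BB <imm64 = chain-me>  movabsq $disp_cp_chain_me_to_xxEP, %r11
+         41 FF D3                  call *%r11
+
+      Only the final 13 bytes (the second movabsq plus the call) are
+      ever rewritten; see chainXDirect_AMD64 below. */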
+
+   case Ain_XIndir: {
+      /* We're generating transfers that could lead indirectly to a
+         chain-me, so we need to be sure this is actually allowed --
+         no-redir translations are not allowed to reach normal
+         translations without going through the scheduler.  That means
+         no XDirects or XIndirs out from no-redir translations.
+         Hence: */
+      vassert(disp_cp_xindir != NULL);
+
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* First off, if this is conditional, create a conditional
+         jump over the rest of it. */
+      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
+         /* jmp fwds if !condition */
+         *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
+         ptmp = p; /* fill in this bit later */
+         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+      }
+
+      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
+      *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
+      *p++ = 0x89;
+      p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
+      /* movabsq $disp_indir, %r11 */
+      *p++ = 0x49;
+      *p++ = 0xBB;
+      p = emit64(p, Ptr_to_ULong(disp_cp_xindir));
+      /* jmp *%r11 */
+      *p++ = 0x41;
+      *p++ = 0xFF;
+      *p++ = 0xE3;
+
+      /* Fix up the conditional jump, if there was one. */
+      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
+         Int delta = p - ptmp;
+         vassert(delta > 0 && delta < 40);
+         *ptmp = toUChar(delta-1);
+      }
+      goto done;
+   }
+
+   case Ain_XAssisted: {
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* First off, if this is conditional, create a conditional
+         jump over the rest of it. */
+      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
+         /* jmp fwds if !condition */
+         *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
+         ptmp = p; /* fill in this bit later */
+         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+      }
+
+      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
+      *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
+      *p++ = 0x89;
+      p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
+      /* movl $magic_number, %ebp.  Since these numbers are all small positive
+         integers, we can get away with "movl $N, %ebp" rather than
+         the longer "movq $N, %rbp". */
+      UInt trcval = 0;
+      switch (i->Ain.XAssisted.jk) {
+         case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
+         case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
+         case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
+         case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
+         case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
+         case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
+         case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
+         case Ijk_TInval:      trcval = VEX_TRC_JMP_TINVAL;      break;
+         case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
+         case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
+         case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
+         case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
+         /* We don't expect to see the following being assisted. */
+         case Ijk_Ret:
+         case Ijk_Call:
+         /* fallthrough */
+         default: 
+            ppIRJumpKind(i->Ain.XAssisted.jk);
+            vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
+      }
+      vassert(trcval != 0);
+      *p++ = 0xBD;
+      p = emit32(p, trcval);
+      /* movabsq $disp_assisted, %r11 */
+      *p++ = 0x49;
+      *p++ = 0xBB;
+      p = emit64(p, Ptr_to_ULong(disp_cp_xassisted));
+      /* jmp *%r11 */
+      *p++ = 0x41;
+      *p++ = 0xFF;
+      *p++ = 0xE3;
+
+      /* Fix up the conditional jump, if there was one. */
+      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
+         Int delta = p - ptmp;
+         vassert(delta > 0 && delta < 40);
          *ptmp = toUChar(delta-1);
       }
       goto done;
@@ -3164,165 +2931,6 @@
       }
       break;
 
-//..    case Xin_FpUnary:
-//..       /* gop %src, %dst
-//..          --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
-//..       */
-//..       p = do_ffree_st7(p);
-//..       p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
-//..       p = do_fop1_st(p, i->Xin.FpUnary.op);
-//..       p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
-//..       goto done;
-//.. 
-//..    case Xin_FpBinary:
-//..       if (i->Xin.FpBinary.op == Xfp_YL2X
-//..           || i->Xin.FpBinary.op == Xfp_YL2XP1) {
-//..          /* Have to do this specially. */
-//..          /* ffree %st7 ; fld %st(srcL) ; 
-//..             ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
-//..          p = do_ffree_st7(p);
-//..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
-//..          p = do_ffree_st7(p);
-//..          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
-//..          *p++ = 0xD9; 
-//..          *p++ = i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9;
-//..          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
-//..          goto done;
-//..       }
-//..       if (i->Xin.FpBinary.op == Xfp_ATAN) {
-//..          /* Have to do this specially. */
-//..          /* ffree %st7 ; fld %st(srcL) ; 
-//..             ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
-//..          p = do_ffree_st7(p);
-//..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
-//..          p = do_ffree_st7(p);
-//..          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
-//..          *p++ = 0xD9; *p++ = 0xF3;
-//..          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
-//..          goto done;
-//..       }
-//..       if (i->Xin.FpBinary.op == Xfp_PREM
-//..           || i->Xin.FpBinary.op == Xfp_PREM1
-//..           || i->Xin.FpBinary.op == Xfp_SCALE) {
-//..          /* Have to do this specially. */
-//..          /* ffree %st7 ; fld %st(srcR) ; 
-//..             ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ; 
-//..             fincstp ; ffree %st7 */
-//..          p = do_ffree_st7(p);
-//..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
-//..          p = do_ffree_st7(p);
-//..          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
-//..          *p++ = 0xD9;
-//..          switch (i->Xin.FpBinary.op) {
-//..             case Xfp_PREM: *p++ = 0xF8; break;
-//..             case Xfp_PREM1: *p++ = 0xF5; break;
-//..             case Xfp_SCALE: *p++ =  0xFD; break;
-//..             default: vpanic("emitAMD64Instr(FpBinary,PREM/PREM1/SCALE)");
-//..          }
-//..          p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
-//..          *p++ = 0xD9; *p++ = 0xF7;
-//..          p = do_ffree_st7(p);
-//..          goto done;
-//..       }
-//..       /* General case */
-//..       /* gop %srcL, %srcR, %dst
-//..          --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
-//..       */
-//..       p = do_ffree_st7(p);
-//..       p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
-//..       p = do_fop2_st(p, i->Xin.FpBinary.op, 
-//..                         1+hregNumber(i->Xin.FpBinary.srcR));
-//..       p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
-//..       goto done;
-//.. 
-//..    case Xin_FpLdSt:
-//..       vassert(i->Xin.FpLdSt.sz == 4 || i->Xin.FpLdSt.sz == 8);
-//..       if (i->Xin.FpLdSt.isLoad) {
-//..          /* Load from memory into %fakeN.  
-//..             --> ffree %st(7) ; fld{s/l} amode ; fstp st(N+1) 
-//..          */
-//..          p = do_ffree_st7(p);
-//..          *p++ = i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD;
-//.. 	 p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
-//..          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
-//..          goto done;
-//..       } else {
-//..          /* Store from %fakeN into memory.
-//..             --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
-//.. 	 */
-//..          p = do_ffree_st7(p);
-//..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
-//..          *p++ = i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD;
-//..          p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
-//..          goto done;
-//..       }
-//..       break;
-//.. 
-//..    case Xin_FpLdStI:
-//..       if (i->Xin.FpLdStI.isLoad) {
-//..          /* Load from memory into %fakeN, converting from an int.  
-//..             --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1) 
-//..          */
-//..          switch (i->Xin.FpLdStI.sz) {
-//..             case 8:  opc = 0xDF; subopc_imm = 5; break;
-//..             case 4:  opc = 0xDB; subopc_imm = 0; break;
-//..             case 2:  vassert(0); opc = 0xDF; subopc_imm = 0; break;
-//..             default: vpanic("emitAMD64Instr(Xin_FpLdStI-load)");
-//..          }
-//..          p = do_ffree_st7(p);
-//..          *p++ = opc;
-//..          p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
-//..          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
-//..          goto done;
-//..       } else {
-//..          /* Store from %fakeN into memory, converting to an int.
-//..             --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
-//.. 	 */
-//..          switch (i->Xin.FpLdStI.sz) {
-//..             case 8:  opc = 0xDF; subopc_imm = 7; break;
-//..             case 4:  opc = 0xDB; subopc_imm = 3; break;
-//..             case 2:  opc = 0xDF; subopc_imm = 3; break;
-//..             default: vpanic("emitAMD64Instr(Xin_FpLdStI-store)");
-//..          }
-//..          p = do_ffree_st7(p);
-//..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
-//..          *p++ = opc;
-//..          p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
-//..          goto done;
-//..       }
-//..       break;
-//.. 
-//..    case Xin_Fp64to32:
-//..       /* ffree %st7 ; fld %st(src) */
-//..       p = do_ffree_st7(p);
-//..       p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
-//..       /* subl $4, %esp */
-//..       *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
-//..       /* fstps (%esp) */
-//..       *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
-//..       /* flds (%esp) */
-//..       *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
-//..       /* addl $4, %esp */
-//..       *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
-//..       /* fstp %st(1+dst) */
-//..       p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
-//..       goto done;
-//.. 
-//..    case Xin_FpCMov:
-//..       /* jmp fwds if !condition */
-//..       *p++ = 0x70 + (i->Xin.FpCMov.cond ^ 1);
-//..       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
-//..       ptmp = p;
-//.. 
-//..       /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
-//..       p = do_ffree_st7(p);
-//..       p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
-//..       p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
-//.. 
-//..       /* Fill in the jump offset. */
-//..       *(ptmp-1) = p - ptmp;
-//..       goto done;
-
    case Ain_LdMXCSR:
       *p++ = clearWBit(rexAMode_M( fake(0), i->Ain.LdMXCSR.addr));
       *p++ = 0x0F;
@@ -3330,12 +2938,6 @@
       p = doAMode_M(p, fake(2)/*subopcode*/, i->Ain.LdMXCSR.addr);
       goto done;
 
-//..    case Xin_FpStSW_AX:
-//..       /* note, this emits fnstsw %ax, not fstsw %ax */
-//..       *p++ = 0xDF;
-//..       *p++ = 0xE0;
-//..       goto done;
-
    case Ain_SseUComIS:
       /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
       /* ucomi[sd] %srcL, %srcR */
@@ -3395,45 +2997,6 @@
                         vreg2ireg(i->Ain.SseSDSS.src) );
       goto done;
 
-//.. 
-//..    case Xin_FpCmp:
-//..       /* gcmp %fL, %fR, %dst
-//..          -> ffree %st7; fpush %fL ; fucomp %(fR+1) ; 
-//..             fnstsw %ax ; movl %eax, %dst 
-//..       */
-//..       /* ffree %st7 */
-//..       p = do_ffree_st7(p);
-//..       /* fpush %fL */
-//..       p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
-//..       /* fucomp %(fR+1) */
-//..       *p++ = 0xDD;
-//..       *p++ = 0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR)));
-//..       /* fnstsw %ax */
-//..       *p++ = 0xDF;
-//..       *p++ = 0xE0;
-//..       /*  movl %eax, %dst */
-//..       *p++ = 0x89;
-//..       p = doAMode_R(p, hregAMD64_EAX(), i->Xin.FpCmp.dst);
-//..       goto done;
-//.. 
-//..    case Xin_SseConst: {
-//..       UShort con = i->Xin.SseConst.con;
-//..       p = push_word_from_tags(p, (con >> 12) & 0xF);
-//..       p = push_word_from_tags(p, (con >> 8) & 0xF);
-//..       p = push_word_from_tags(p, (con >> 4) & 0xF);
-//..       p = push_word_from_tags(p, con & 0xF);
-//..       /* movl (%esp), %xmm-dst */
-//..       *p++ = 0x0F;
-//..       *p++ = 0x10;
-//..       *p++ = 0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst));
-//..       *p++ = 0x24;
-//..       /* addl $16, %esp */
-//..       *p++ = 0x83;
-//..       *p++ = 0xC4;
-//..       *p++ = 0x10;
-//..       goto done;
-//..    }
-
    case Ain_SseLdSt:
       if (i->Ain.SseLdSt.sz == 8) {
          *p++ = 0xF2;
@@ -3505,8 +3068,6 @@
          case Asse_MAXF:   *p++ = 0x5F; break;
          case Asse_MINF:   *p++ = 0x5D; break;
          case Asse_MULF:   *p++ = 0x59; break;
-//..          case Xsse_RCPF:   *p++ = 0x53; break;
-//..          case Xsse_RSQRTF: *p++ = 0x52; break;
          case Asse_SQRTF:  *p++ = 0x51; break;
          case Asse_SUBF:   *p++ = 0x5C; break;
          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
@@ -3563,8 +3124,6 @@
          case Asse_MAXF:   *p++ = 0x5F; break;
          case Asse_MINF:   *p++ = 0x5D; break;
          case Asse_MULF:   *p++ = 0x59; break;
-//..          case Xsse_RCPF:   *p++ = 0x53; break;
-//..          case Xsse_RSQRTF: *p++ = 0x52; break;
          case Asse_SQRTF:  *p++ = 0x51; break;
          case Asse_SUBF:   *p++ = 0x5C; break;
          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
@@ -3680,6 +3239,70 @@
       *p++ = (UChar)(i->Ain.SseShuf.order);
       goto done;
 
+   case Ain_EvCheck: {
+      /* We generate:
+            (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
+            (2 bytes)  jns  nofail     expected taken
+            (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
+            nofail:
+      */
+      /* This is heavily asserted re instruction lengths.  It needs to
+         be.  If we get given unexpected forms of .amCounter or
+         .amFailAddr -- basically, anything that's not of the form
+         uimm7(%rbp) -- the checks below are likely to fail. */
+      /* Note also that after the decl we must be very careful not to
+         read the carry flag, else we get a partial flags stall.
+         js/jns avoids that, though. */
+      UChar* p0 = p;
+      /* ---  decl 8(%rbp) --- */
+      /* Need to compute the REX byte for the decl in order to prove
+         that we don't need it, since this is a 32-bit decl and all
+         registers involved in the amode are < r8.  "fake(1)" because
+         there's no register in this encoding; instead the register
+         field is used as a sub opcode.  The encoding for "decl r/m32"
+         is FF /1, hence the fake(1). */
+      rex = clearWBit(rexAMode_M(fake(1), i->Ain.EvCheck.amCounter));
+      if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
+      *p++ = 0xFF;
+      p = doAMode_M(p, fake(1), i->Ain.EvCheck.amCounter);
+      vassert(p - p0 == 3);
+      /* --- jns nofail --- */
+      *p++ = 0x79;
+      *p++ = 0x03; /* need to check this 0x03 after the next insn */
+      vassert(p - p0 == 5);
+      /* --- jmp* 0(%rbp) --- */
+      /* Once again, verify we don't need REX.  The encoding is FF /4.
+         We don't need REX.W since by default FF /4 in 64-bit mode
+         implies a 64 bit load. */
+      rex = clearWBit(rexAMode_M(fake(4), i->Ain.EvCheck.amFailAddr));
+      if (rex != 0x40) goto bad;
+      *p++ = 0xFF;
+      p = doAMode_M(p, fake(4), i->Ain.EvCheck.amFailAddr);
+      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
+      /* And crosscheck .. */
+      vassert(evCheckSzB_AMD64() == 8);
+      goto done;
+   }
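+
+   /* Not part of this change: a file-scope decoding aid showing the
+      exact bytes the case above emits, assuming the usual amodes
+      8(%rbp) for .amCounter and 0(%rbp) for .amFailAddr. */
+   static const UChar evcheck_amd64_sketch[8] = {
+      0xFF, 0x4D, 0x08, /* decl 8(%rbp) : FF /1, ModRM 01|001|101, disp8 8 */
+      0x79, 0x03,       /* jns  .+3     : skips the failure jump           */
+      0xFF, 0x65, 0x00  /* jmp* 0(%rbp) : FF /4, ModRM 01|100|101, disp8 0 */
+   };
+   /* sizeof(evcheck_amd64_sketch) == 8 == evCheckSzB_AMD64() */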
+
+   case Ain_ProfInc: {
+      /* We generate   movabsq $0, %r11
+                       incq (%r11)
+         in the expectation that a later call to LibVEX_patchProfCtr
+         will be used to fill in the immediate field once the right
+         value is known.
+         49 BB 00 00 00 00 00 00 00 00
+         49 FF 03
+      */
+      *p++ = 0x49; *p++ = 0xBB;
+      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
+      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
+      *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
+      /* Tell the caller .. */
+      vassert(!(*is_profInc));
+      *is_profInc = True;
+      goto done;
+   }
+
    default: 
       goto bad;
    }
@@ -3696,6 +3319,200 @@
 #  undef fake
 }
 
+
+/* How big is an event check?  See case for Ain_EvCheck in
+   emit_AMD64Instr just above.  That crosschecks what this returns, so
+   we can tell if we're inconsistent. */
+Int evCheckSzB_AMD64 ( void )
+{
+   return 8;
+}
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   emitInstr case for XDirect, above. */
+VexInvalRange chainXDirect_AMD64 ( void* place_to_chain,
+                                   void* disp_cp_chain_me_EXPECTED,
+                                   void* place_to_jump_to )
+{
+   /* What we're expecting to see is:
+        movabsq $disp_cp_chain_me_EXPECTED, %r11
+        call *%r11
+      viz
+        49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
+        41 FF D3
+   */
+   UChar* p = (UChar*)place_to_chain;
+   vassert(p[0] == 0x49);
+   vassert(p[1] == 0xBB);
+   vassert(*(ULong*)(&p[2]) == Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
+   vassert(p[10] == 0x41);
+   vassert(p[11] == 0xFF);
+   vassert(p[12] == 0xD3);
+   /* And what we want to change it to is either:
+        (general case):
+          movabsq $place_to_jump_to, %r11
+          jmpq *%r11
+        viz
+          49 BB <8 bytes value == place_to_jump_to>
+          41 FF E3
+        So it's the same length (convenient, huh) and we don't
+        need to change all the bits.
+      ---OR---
+        in the case where the displacement falls within 32 bits
+          jmpq disp32   where disp32 is relative to the next insn
+          ud2; ud2; ud2; ud2
+        viz
+          E9 <4 bytes == disp32>
+          0F 0B 0F 0B 0F 0B 0F 0B 
+
+      In both cases the replacement has the same length as the original.
+      To remain sane & verifiable,
+      (1) limit the displacement for the short form to 
+          (say) +/- one billion, so as to avoid wraparound
+          off-by-ones
+      (2) even if the short form is applicable, once every (say)
+          1024 times use the long form anyway, so as to maintain
+          verifiability
+   */
+   /* This is the delta we need to put into a JMP d32 insn.  It's
+      relative to the start of the next insn, hence the -5.  */
+   Long delta   = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
+   Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
+
+   static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
+   if (shortOK) {
+      shortCTR++; // thread safety bleh
+      if (0 == (shortCTR & 0x3FF)) {
+         shortOK = False;
+         if (0)
+            vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
+                       "using long jmp\n", shortCTR);
+      }
+   }
+
+   /* And make the modifications. */
+   if (shortOK) {
+      p[0]  = 0xE9;
+      p[1]  = (delta >> 0) & 0xFF;
+      p[2]  = (delta >> 8) & 0xFF;
+      p[3]  = (delta >> 16) & 0xFF;
+      p[4]  = (delta >> 24) & 0xFF;
+      p[5]  = 0x0F; p[6]  = 0x0B;
+      p[7]  = 0x0F; p[8]  = 0x0B;
+      p[9]  = 0x0F; p[10] = 0x0B;
+      p[11] = 0x0F; p[12] = 0x0B;
+      /* sanity check on the delta -- top 32 are all 0 or all 1 */
+      delta >>= 32;
+      vassert(delta == 0LL || delta == -1LL);
+   } else {
+      /* Minimal modifications from the starting sequence. */   
+      *(ULong*)(&p[2]) = Ptr_to_ULong(place_to_jump_to);
+      p[12] = 0xE3;
+   }
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
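+
+/* Not part of this change: a sketch of the disp32 arithmetic for the
+   short form above.  The E9 jmp's displacement is relative to the end
+   of the 5-byte instruction, which is why 5 is subtracted. */
+static Long jmpd32_delta_sketch ( UChar* place_to_chain,
+                                  UChar* place_to_jump_to )
+{
+   /* At run time the CPU computes place_to_chain + 5 + disp32, so
+      disp32 must be place_to_jump_to - (place_to_chain + 5). */
+   return (Long)(place_to_jump_to - place_to_chain) - 5;
+}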
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   emitInstr case for XDirect, above. */
+VexInvalRange unchainXDirect_AMD64 ( void* place_to_unchain,
+                                     void* place_to_jump_to_EXPECTED,
+                                     void* disp_cp_chain_me )
+{
+   /* What we're expecting to see is either:
+        (general case)
+          movabsq $place_to_jump_to_EXPECTED, %r11
+          jmpq *%r11
+        viz
+          49 BB <8 bytes value == place_to_jump_to_EXPECTED>
+          41 FF E3
+      ---OR---
+        in the case where the displacement falls within 32 bits
+          jmpq d32
+          ud2; ud2; ud2; ud2
+        viz
+          E9 <4 bytes == disp32>
+          0F 0B 0F 0B 0F 0B 0F 0B
+   */
+   UChar* p     = (UChar*)place_to_unchain;
+   Bool   valid = False;
+   if (p[0] == 0x49 && p[1] == 0xBB
+       && *(ULong*)(&p[2]) == Ptr_to_ULong(place_to_jump_to_EXPECTED)
+       && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
+      /* it's the long form */
+      valid = True;
+   }
+   else
+   if (p[0] == 0xE9 
+       && p[5]  == 0x0F && p[6]  == 0x0B
+       && p[7]  == 0x0F && p[8]  == 0x0B
+       && p[9]  == 0x0F && p[10] == 0x0B
+       && p[11] == 0x0F && p[12] == 0x0B) {
+      /* It's the short form.  Check the offset is right. */
+      Int  s32 = *(Int*)(&p[1]);
+      Long s64 = (Long)s32;
+      if ((UChar*)p + 5 + s64 == (UChar*)place_to_jump_to_EXPECTED) {
+         valid = True;
+         if (0)
+            vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
+      }
+   }
+   vassert(valid);
+   /* And what we want to change it to is:
+        movabsq $disp_cp_chain_me, %r11
+        call *%r11
+      viz
+        49 BB <8 bytes value == disp_cp_chain_me>
+        41 FF D3
+      So it's the same length (convenient, huh).
+   */
+   p[0] = 0x49;
+   p[1] = 0xBB;
+   *(ULong*)(&p[2]) = Ptr_to_ULong(disp_cp_chain_me);
+   p[10] = 0x41;
+   p[11] = 0xFF;
+   p[12] = 0xD3;
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
+
+
+/* Patch the counter address into a profile inc point, as previously
+   created by the Ain_ProfInc case for emit_AMD64Instr. */
+VexInvalRange patchProfInc_AMD64 ( void*  place_to_patch,
+                                   ULong* location_of_counter )
+{
+   vassert(sizeof(ULong*) == 8);
+   UChar* p = (UChar*)place_to_patch;
+   vassert(p[0] == 0x49);
+   vassert(p[1] == 0xBB);
+   vassert(p[2] == 0x00);
+   vassert(p[3] == 0x00);
+   vassert(p[4] == 0x00);
+   vassert(p[5] == 0x00);
+   vassert(p[6] == 0x00);
+   vassert(p[7] == 0x00);
+   vassert(p[8] == 0x00);
+   vassert(p[9] == 0x00);
+   vassert(p[10] == 0x49);
+   vassert(p[11] == 0xFF);
+   vassert(p[12] == 0x03);
+   ULong imm64 = (ULong)Ptr_to_ULong(location_of_counter);
+   p[2] = imm64 & 0xFF; imm64 >>= 8;
+   p[3] = imm64 & 0xFF; imm64 >>= 8;
+   p[4] = imm64 & 0xFF; imm64 >>= 8;
+   p[5] = imm64 & 0xFF; imm64 >>= 8;
+   p[6] = imm64 & 0xFF; imm64 >>= 8;
+   p[7] = imm64 & 0xFF; imm64 >>= 8;
+   p[8] = imm64 & 0xFF; imm64 >>= 8;
+   p[9] = imm64 & 0xFF; imm64 >>= 8;
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
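+
+/* Not part of this change: hypothetical caller-side glue (the name
+   finish_profinc_sketch and its calling protocol are invented; only
+   patchProfInc_AMD64 is real).  Ain_ProfInc emits a zero immediate;
+   once the counter's address is known, it is patched in place. */
+static void finish_profinc_sketch ( UChar* profinc_point, ULong* counter )
+{
+   /* profinc_point must address the 13-byte  movabsq $0, %r11 ;
+      incq (%r11)  pair; the vasserts above enforce that. */
+   VexInvalRange vir = patchProfInc_AMD64(profinc_point, counter);
+   /* On amd64 no icache invalidation is needed, hence vir = {0,0}. */
+   (void)vir;
+}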
+
+
 /*---------------------------------------------------------------*/
 /*--- end                                   host_amd64_defs.c ---*/
 /*---------------------------------------------------------------*/
diff --git a/priv/host_amd64_defs.h b/priv/host_amd64_defs.h
index 4e7ae05..bc63bd2 100644
--- a/priv/host_amd64_defs.h
+++ b/priv/host_amd64_defs.h
@@ -363,10 +363,11 @@
       Ain_Alu32R,      /* 32-bit add/sub/and/or/xor/cmp, dst=REG (a la Alu64R) */
       Ain_MulL,        /* widening multiply */
       Ain_Div,         /* div and mod */
-//..       Xin_Sh3232,    /* shldl or shrdl */
       Ain_Push,        /* push 64-bit value on stack */
       Ain_Call,        /* call to address in register */
-      Ain_Goto,        /* conditional/unconditional jmp to dst */
+      Ain_XDirect,     /* direct transfer to GA */
+      Ain_XIndir,      /* indirect transfer to GA */
+      Ain_XAssisted,   /* assisted transfer to GA */
       Ain_CMov64,      /* conditional move */
       Ain_MovxLQ,      /* reg-reg move, zx-ing/sx-ing top half */
       Ain_LoadEX,      /* mov{s,z}{b,w,l}q from mem to reg */
@@ -377,28 +378,17 @@
       Ain_ACAS,        /* 8/16/32/64-bit lock;cmpxchg */
       Ain_DACAS,       /* lock;cmpxchg8b/16b (doubleword ACAS, 2 x
                           32-bit or 2 x 64-bit only) */
-
       Ain_A87Free,     /* free up x87 registers */
       Ain_A87PushPop,  /* x87 loads/stores */
       Ain_A87FpOp,     /* x87 operations */
       Ain_A87LdCW,     /* load x87 control word */
       Ain_A87StSW,     /* store x87 status word */
-//.. 
-//..       Xin_FpUnary,   /* FP fake unary op */
-//..       Xin_FpBinary,  /* FP fake binary op */
-//..       Xin_FpLdSt,    /* FP fake load/store */
-//..       Xin_FpLdStI,   /* FP fake load/store, converting to/from Int */
-//..       Xin_Fp64to32,  /* FP round IEEE754 double to IEEE754 single */
-//..       Xin_FpCMov,    /* FP fake floating point conditional move */
       Ain_LdMXCSR,     /* load %mxcsr */
-//..       Xin_FpStSW_AX, /* fstsw %ax */
       Ain_SseUComIS,   /* ucomisd/ucomiss, then get %rflags into int
                           register */
       Ain_SseSI2SF,    /* scalar 32/64 int to 32/64 float conversion */
       Ain_SseSF2SI,    /* scalar 32/64 float to 32/64 int conversion */
       Ain_SseSDSS,     /* scalar float32 to/from float64 */
-//.. 
-//..       Xin_SseConst,  /* Generate restricted SSE literal */
       Ain_SseLdSt,     /* SSE load/store 32/64/128 bits, no alignment
                           constraints, upper 96/64/0 bits arbitrary */
       Ain_SseLdzLO,    /* SSE load low 32/64 bits, zero remainder of reg */
@@ -408,7 +398,9 @@
       Ain_Sse64FLo,    /* SSE binary, 64F in lowest lane only */
       Ain_SseReRg,     /* SSE binary general reg-reg, Re, Rg */
       Ain_SseCMov,     /* SSE conditional move */
-      Ain_SseShuf      /* SSE2 shuffle (pshufd) */
+      Ain_SseShuf,     /* SSE2 shuffle (pshufd) */
+      Ain_EvCheck,     /* Event check */
+      Ain_ProfInc      /* 64-bit profile counter increment */
    }
    AMD64InstrTag;
 
@@ -470,13 +462,6 @@
             Int      sz; /* 4 or 8 only */
             AMD64RM* src;
          } Div;
-//..          /* shld/shrd.  op may only be Xsh_SHL or Xsh_SHR */
-//..          struct {
-//..             X86ShiftOp op;
-//..             UInt       amt;   /* shift amount, or 0 means %cl */
-//..             HReg       src;
-//..             HReg       dst;
-//..          } Sh3232;
          struct {
             AMD64RMI* src;
          } Push;
@@ -487,13 +472,29 @@
             Addr64        target;
             Int           regparms; /* 0 .. 6 */
          } Call;
-         /* Pseudo-insn.  Goto dst, on given condition (which could be
-            Acc_ALWAYS). */
+         /* Update the guest RIP value, then exit requesting to chain
+            to it.  May be conditional. */
          struct {
+            Addr64        dstGA;    /* next guest address */
+            AMD64AMode*   amRIP;    /* amode in guest state for RIP */
+            AMD64CondCode cond;     /* can be Acc_ALWAYS */
+            Bool          toFastEP; /* chain to the slow or fast point? */
+         } XDirect;
+         /* Boring transfer to a guest address not known at JIT time.
+            Not chainable.  May be conditional. */
+         struct {
+            HReg          dstGA;
+            AMD64AMode*   amRIP;
+            AMD64CondCode cond; /* can be Acc_ALWAYS */
+         } XIndir;
+         /* Assisted transfer to a guest address, most general case.
+            Not chainable.  May be conditional. */
+         struct {
+            HReg          dstGA;
+            AMD64AMode*   amRIP;
+            AMD64CondCode cond; /* can be Acc_ALWAYS */
             IRJumpKind    jk;
-            AMD64CondCode cond;
-            AMD64RI*      dst;
-         } Goto;
+         } XAssisted;
          /* Mov src to dst on the given condition, which may not
             be the bogus Acc_ALWAYS. */
          struct {
@@ -588,11 +589,6 @@
             AMD64AMode* addr;
          }
          LdMXCSR;
-//..          /* fstsw %ax */
-//..          struct {
-//..             /* no fields */
-//..          }
-//..          FpStSW_AX;
          /* ucomisd/ucomiss, then get %rflags into int register */
          struct {
             UChar   sz;   /* 4 or 8 only */
@@ -620,12 +616,6 @@
             HReg src;
             HReg dst;
          } SseSDSS;
-//.. 
-//..          /* Simplistic SSE[123] */
-//..          struct {
-//..             UShort  con;
-//..             HReg    dst;
-//..          } SseConst;
          struct {
             Bool        isLoad;
             UChar       sz; /* 4, 8 or 16 only */
@@ -674,6 +664,15 @@
             HReg   src;
             HReg   dst;
          } SseShuf;
+         struct {
+            AMD64AMode* amCounter;
+            AMD64AMode* amFailAddr;
+         } EvCheck;
+         struct {
+            /* No fields.  The address of the counter to inc is
+               installed later, post-translation, by patching it in,
+               as it is not known at translation time. */
+         } ProfInc;
 
       } Ain;
    }
@@ -689,10 +688,14 @@
 extern AMD64Instr* AMD64Instr_Test64     ( UInt imm32, HReg dst );
 extern AMD64Instr* AMD64Instr_MulL       ( Bool syned, AMD64RM* );
 extern AMD64Instr* AMD64Instr_Div        ( Bool syned, Int sz, AMD64RM* );
-//.. extern AMD64Instr* AMD64Instr_Sh3232    ( AMD64ShiftOp, UInt amt, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_Push       ( AMD64RMI* );
 extern AMD64Instr* AMD64Instr_Call       ( AMD64CondCode, Addr64, Int );
-extern AMD64Instr* AMD64Instr_Goto       ( IRJumpKind, AMD64CondCode cond, AMD64RI* dst );
+extern AMD64Instr* AMD64Instr_XDirect    ( Addr64 dstGA, AMD64AMode* amRIP,
+                                           AMD64CondCode cond, Bool toFastEP );
+extern AMD64Instr* AMD64Instr_XIndir     ( HReg dstGA, AMD64AMode* amRIP,
+                                           AMD64CondCode cond );
+extern AMD64Instr* AMD64Instr_XAssisted  ( HReg dstGA, AMD64AMode* amRIP,
+                                           AMD64CondCode cond, IRJumpKind jk );
 extern AMD64Instr* AMD64Instr_CMov64     ( AMD64CondCode, AMD64RM* src, HReg dst );
 extern AMD64Instr* AMD64Instr_MovxLQ     ( Bool syned, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_LoadEX     ( UChar szSmall, Bool syned,
@@ -709,21 +712,11 @@
 extern AMD64Instr* AMD64Instr_A87FpOp    ( A87FpOp op );
 extern AMD64Instr* AMD64Instr_A87LdCW    ( AMD64AMode* addr );
 extern AMD64Instr* AMD64Instr_A87StSW    ( AMD64AMode* addr );
-//.. 
-//.. extern AMD64Instr* AMD64Instr_FpUnary   ( AMD64FpOp op, HReg src, HReg dst );
-//.. extern AMD64Instr* AMD64Instr_FpBinary  ( AMD64FpOp op, HReg srcL, HReg srcR, HReg dst );
-//.. extern AMD64Instr* AMD64Instr_FpLdSt    ( Bool isLoad, UChar sz, HReg reg, AMD64AMode* );
-//.. extern AMD64Instr* AMD64Instr_FpLdStI   ( Bool isLoad, UChar sz, HReg reg, AMD64AMode* );
-//.. extern AMD64Instr* AMD64Instr_Fp64to32  ( HReg src, HReg dst );
-//.. extern AMD64Instr* AMD64Instr_FpCMov    ( AMD64CondCode, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_LdMXCSR    ( AMD64AMode* );
-//.. extern AMD64Instr* AMD64Instr_FpStSW_AX ( void );
 extern AMD64Instr* AMD64Instr_SseUComIS  ( Int sz, HReg srcL, HReg srcR, HReg dst );
 extern AMD64Instr* AMD64Instr_SseSI2SF   ( Int szS, Int szD, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_SseSF2SI   ( Int szS, Int szD, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_SseSDSS    ( Bool from64, HReg src, HReg dst );
-//.. 
-//.. extern AMD64Instr* AMD64Instr_SseConst  ( UShort con, HReg dst );
 extern AMD64Instr* AMD64Instr_SseLdSt    ( Bool isLoad, Int sz, HReg, AMD64AMode* );
 extern AMD64Instr* AMD64Instr_SseLdzLO   ( Int sz, HReg, AMD64AMode* );
 extern AMD64Instr* AMD64Instr_Sse32Fx4   ( AMD64SseOp, HReg, HReg );
@@ -733,6 +726,9 @@
 extern AMD64Instr* AMD64Instr_SseReRg    ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_SseCMov    ( AMD64CondCode, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_SseShuf    ( Int order, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_EvCheck    ( AMD64AMode* amCounter,
+                                           AMD64AMode* amFailAddr );
+extern AMD64Instr* AMD64Instr_ProfInc    ( void );
 
 
 extern void ppAMD64Instr ( AMD64Instr*, Bool );
@@ -742,10 +738,13 @@
 extern void         getRegUsage_AMD64Instr ( HRegUsage*, AMD64Instr*, Bool );
 extern void         mapRegs_AMD64Instr     ( HRegRemap*, AMD64Instr*, Bool );
 extern Bool         isMove_AMD64Instr      ( AMD64Instr*, HReg*, HReg* );
-extern Int          emit_AMD64Instr        ( UChar* buf, Int nbuf, AMD64Instr*, 
-                                             Bool,
-                                             void* dispatch_unassisted,
-                                             void* dispatch_assisted );
+extern Int          emit_AMD64Instr        ( /*MB_MOD*/Bool* is_profInc,
+                                             UChar* buf, Int nbuf, AMD64Instr* i, 
+                                             Bool mode64,
+                                             void* disp_cp_chain_me_to_slowEP,
+                                             void* disp_cp_chain_me_to_fastEP,
+                                             void* disp_cp_xindir,
+                                             void* disp_cp_xassisted );
 
 extern void genSpill_AMD64  ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
                               HReg rreg, Int offset, Bool );
@@ -753,9 +752,36 @@
                               HReg rreg, Int offset, Bool );
 
 extern void         getAllocableRegs_AMD64 ( Int*, HReg** );
-extern HInstrArray* iselSB_AMD64           ( IRSB*, VexArch,
-                                                    VexArchInfo*,
-                                                    VexAbiInfo* );
+extern HInstrArray* iselSB_AMD64           ( IRSB*, 
+                                             VexArch,
+                                             VexArchInfo*,
+                                             VexAbiInfo*,
+                                             Int offs_Host_EvC_Counter,
+                                             Int offs_Host_EvC_FailAddr,
+                                             Bool chainingAllowed,
+                                             Bool addProfInc,
+                                             Addr64 max_ga );
+
+/* How big is an event check?  This is kind of a kludge because it
+   depends on the offsets of host_EvC_FAILADDR and host_EvC_COUNTER,
+   and so assumes that they are both < 128, and so can use the short
+   offset encoding.  This is all checked with assertions, so in the
+   worst case we will merely assert at startup. */
+extern Int evCheckSzB_AMD64 ( void );
+
+/* Perform a chaining and unchaining of an XDirect jump. */
+extern VexInvalRange chainXDirect_AMD64 ( void* place_to_chain,
+                                          void* disp_cp_chain_me_EXPECTED,
+                                          void* place_to_jump_to );
+
+extern VexInvalRange unchainXDirect_AMD64 ( void* place_to_unchain,
+                                            void* place_to_jump_to_EXPECTED,
+                                            void* disp_cp_chain_me );
+
+/* Patch the counter location into an existing ProfInc point. */
+extern VexInvalRange patchProfInc_AMD64 ( void*  place_to_patch,
+                                          ULong* location_of_counter );
+
 
 #endif /* ndef __VEX_HOST_AMD64_DEFS_H */
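 
 /* Not part of this change: a sketch of the intended round-trip
    invariant for chainXDirect_AMD64 / unchainXDirect_AMD64 --
    chaining and then unchaining must restore the original chain-me
    sequence byte for byte.  (Invented name.) */
 static void check_chain_roundtrip_sketch ( void* place,
                                            void* disp_cp_chain_me,
                                            void* dst )
 {
    /* place holds:  movabsq $disp_cp_chain_me, %r11 ; call *%r11 */
    chainXDirect_AMD64(place, disp_cp_chain_me, dst);
    /* now:  movabsq $dst, %r11 ; jmpq *%r11   (or jmp d32 ; 4x ud2) */
    unchainXDirect_AMD64(place, dst, disp_cp_chain_me);
    /* back to the original; the vasserts in unchainXDirect_AMD64
       check the intermediate state */
 }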
 
diff --git a/priv/host_amd64_isel.c b/priv/host_amd64_isel.c
index bcd213f..a365a5a 100644
--- a/priv/host_amd64_isel.c
+++ b/priv/host_amd64_isel.c
@@ -112,12 +112,24 @@
              64-bit virtual HReg, which holds the high half
              of the value.
 
+   - The host subarchitecture we are selecting insns for.  
+     This is set at the start and does not change.
+
    - The code array, that is, the insns selected so far.
 
    - A counter, for generating new virtual registers.
 
-   - The host subarchitecture we are selecting insns for.  
-     This is set at the start and does not change.
+   - A Bool for indicating whether we may generate chain-me
+     instructions for control flow transfers, or whether we must use
+     XAssisted.
+
+   - The maximum guest address of any guest insn in this block.
+     Actually, the address of the highest-addressed byte from any insn
+     in this block.  Set at the start; does not change.  This is
+     used for detecting jumps which are definitely forward-edges from
+     this block, and therefore can be made (chained) to the fast entry
+     point of the destination, thereby avoiding the destination's
+     event check.
 
    Note, this is all host-independent.  (JRS 20050201: well, kinda
    ... not completely.  Compare with ISelEnv for X86.)
@@ -125,17 +137,21 @@
 
 typedef
    struct {
+      /* Constants -- set at the start; they do not change. */
       IRTypeEnv*   type_env;
 
       HReg*        vregmap;
       HReg*        vregmapHI;
       Int          n_vregmap;
 
-      HInstrArray* code;
-
-      Int          vreg_ctr;
-
       UInt         hwcaps;
+
+      Bool         chainingAllowed;
+      Addr64       max_ga;
+
+      /* These are modified as we go along. */
+      HInstrArray* code;
+      Int          vreg_ctr;
    }
    ISelEnv;
 
@@ -4131,14 +4147,47 @@
 
    /* --------- EXIT --------- */
    case Ist_Exit: {
-      AMD64RI*      dst;
-      AMD64CondCode cc;
       if (stmt->Ist.Exit.dst->tag != Ico_U64)
          vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
-      dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
-      cc  = iselCondCode(env,stmt->Ist.Exit.guard);
-      addInstr(env, AMD64Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
-      return;
+
+      AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
+      AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
+                                          hregAMD64_RBP());
+
+      /* Case: boring transfer to known address */
+      if (stmt->Ist.Exit.jk == Ijk_Boring) {
+         if (env->chainingAllowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool toFastEP
+               = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
+            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
+            addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
+                                             amRIP, cc, toFastEP));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
+         }
+         return;
+      }
+
+      /* Case: assisted transfer to arbitrary address */
+      switch (stmt->Ist.Exit.jk) {
+         case Ijk_SigSEGV: case Ijk_TInval: case Ijk_EmWarn: {
+            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
+            return;
+         }
+         default:
+            break;
+      }
+
+      /* Do we ever expect to see any other kind? */
+      goto stmt_fail;
    }
 
    default: break;
@@ -4153,18 +4202,83 @@
 /*--- ISEL: Basic block terminators (Nexts)             ---*/
 /*---------------------------------------------------------*/
 
-static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
+static void iselNext ( ISelEnv* env,
+                       IRExpr* next, IRJumpKind jk, Int offsIP )
 {
-   AMD64RI* ri;
    if (vex_traceflags & VEX_TRACE_VCODE) {
-      vex_printf("\n-- goto {");
+      vex_printf( "\n-- PUT(%d) = ", offsIP);
+      ppIRExpr( next );
+      vex_printf( "; exit-");
       ppIRJumpKind(jk);
-      vex_printf("} ");
-      ppIRExpr(next);
-      vex_printf("\n");
+      vex_printf( "\n");
    }
-   ri = iselIntExpr_RI(env, next);
-   addInstr(env, AMD64Instr_Goto(jk, Acc_ALWAYS,ri));
+
+   /* Case: boring transfer to known address */
+   if (next->tag == Iex_Const) {
+      IRConst* cdst = next->Iex.Const.con;
+      vassert(cdst->tag == Ico_U64);
+      if (jk == Ijk_Boring || jk == Ijk_Call) {
+         /* Boring transfer to known address */
+         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
+         if (env->chainingAllowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool toFastEP
+               = ((Addr64)cdst->Ico.U64) > env->max_ga;
+            if (0) vex_printf("%s", toFastEP ? "X" : ".");
+            addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64, 
+                                             amRIP, Acc_ALWAYS, 
+                                             toFastEP));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is
+               allowable. */
+            HReg r = iselIntExpr_R(env, next);
+            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
+                                               Ijk_Boring));
+         }
+         return;
+      }
+   }
+
+   /* Case: call/return (==boring) transfer to any address */
+   switch (jk) {
+      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
+         HReg        r     = iselIntExpr_R(env, next);
+         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
+         if (env->chainingAllowed) {
+            addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
+         } else {
+            addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
+                                               Ijk_Boring));
+         }
+         return;
+      }
+      default:
+         break;
+   }
+
+   /* Case: some other kind of transfer to any address */
+   switch (jk) {
+      case Ijk_Sys_syscall: case Ijk_ClientReq: case Ijk_NoRedir:
+      case Ijk_Yield: case Ijk_SigTRAP: case Ijk_TInval: {
+         HReg        r     = iselIntExpr_R(env, next);
+         AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
+         addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
+         return;
+      }
+      default:
+         break;
+   }
+
+   vex_printf( "\n-- PUT(%d) = ", offsIP);
+   ppIRExpr( next );
+   vex_printf( "; exit-");
+   ppIRJumpKind(jk);
+   vex_printf( "\n");
+   vassert(0); // are we expecting any other kind?
 }
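 
 /* Not part of this change: a minimal sketch of the forwards-edge
    test used in the two XDirect cases above.  dst > max_ga means the
    target lies wholly beyond this block, so the exit cannot be a
    back-edge into it; only back-edges can close a loop that never
    passes an event check, so such exits may chain to the fast
    (un-event-checked) entry point.  (Invented name.) */
 static Bool exitToFastEP_sketch ( Addr64 dst, Addr64 max_ga )
 {
    /* e.g. max_ga = 0x402F: dst = 0x4030 -> fast EP,
                             dst = 0x4000 (a loop head) -> slow EP */
    return dst > max_ga;
 }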
 
 
@@ -4174,14 +4288,21 @@
 
 /* Translate an entire SB to amd64 code. */
 
-HInstrArray* iselSB_AMD64 ( IRSB* bb, VexArch      arch_host,
-                                      VexArchInfo* archinfo_host,
-                                      VexAbiInfo*  vbi/*UNUSED*/ )
+HInstrArray* iselSB_AMD64 ( IRSB* bb,
+                            VexArch      arch_host,
+                            VexArchInfo* archinfo_host,
+                            VexAbiInfo*  vbi/*UNUSED*/,
+                            Int offs_Host_EvC_Counter,
+                            Int offs_Host_EvC_FailAddr,
+                            Bool chainingAllowed,
+                            Bool addProfInc,
+                            Addr64 max_ga )
 {
-   Int      i, j;
-   HReg     hreg, hregHI;
-   ISelEnv* env;
-   UInt     hwcaps_host = archinfo_host->hwcaps;
+   Int        i, j;
+   HReg       hreg, hregHI;
+   ISelEnv*   env;
+   UInt       hwcaps_host = archinfo_host->hwcaps;
+   AMD64AMode *amCounter, *amFailAddr;
 
    /* sanity ... */
    vassert(arch_host == VexArchAMD64);
@@ -4207,7 +4328,9 @@
    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
 
    /* and finally ... */
-   env->hwcaps = hwcaps_host;
+   env->chainingAllowed = chainingAllowed;
+   env->hwcaps          = hwcaps_host;
+   env->max_ga          = max_ga;
 
    /* For each IR temporary, allocate a suitably-kinded virtual
       register. */
@@ -4233,12 +4356,25 @@
    }
    env->vreg_ctr = j;
 
+   /* The very first instruction must be an event check. */
+   amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
+   amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
+   addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
+
+   /* Possibly a block counter increment (for profiling).  At this
+      point we don't know the address of the counter, so just pretend
+      it is zero.  It will have to be patched later, but before this
+      translation is used, by a call to LibVEX_patchProfCtr. */
+   if (addProfInc) {
+      addInstr(env, AMD64Instr_ProfInc());
+   }
+
    /* Ok, finally we can iterate over the statements. */
    for (i = 0; i < bb->stmts_used; i++)
       if (bb->stmts[i])
-         iselStmt(env,bb->stmts[i]);
+         iselStmt(env, bb->stmts[i]);
 
-   iselNext(env,bb->next,bb->jumpkind);
+   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
 
    /* record the number of vregs we used. */
    env->code->n_vregs = env->vreg_ctr;
diff --git a/priv/host_arm_defs.c b/priv/host_arm_defs.c
index 2f0ebf0..755699e 100644
--- a/priv/host_arm_defs.c
+++ b/priv/host_arm_defs.c
@@ -1170,13 +1170,33 @@
    i->ARMin.LdSt8U.amode  = amode;
    return i;
 }
-//extern ARMInstr* ARMInstr_Ld8S   ( HReg, ARMAMode2* );
-ARMInstr* ARMInstr_Goto ( IRJumpKind jk, ARMCondCode cond, HReg gnext ) {
-   ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
-   i->tag              = ARMin_Goto;
-   i->ARMin.Goto.jk    = jk;
-   i->ARMin.Goto.cond  = cond;
-   i->ARMin.Goto.gnext = gnext;
+ARMInstr* ARMInstr_XDirect ( Addr32 dstGA, ARMAMode1* amR15T,
+                             ARMCondCode cond, Bool toFastEP ) {
+   ARMInstr* i               = LibVEX_Alloc(sizeof(ARMInstr));
+   i->tag                    = ARMin_XDirect;
+   i->ARMin.XDirect.dstGA    = dstGA;
+   i->ARMin.XDirect.amR15T   = amR15T;
+   i->ARMin.XDirect.cond     = cond;
+   i->ARMin.XDirect.toFastEP = toFastEP;
+   return i;
+}
+ARMInstr* ARMInstr_XIndir ( HReg dstGA, ARMAMode1* amR15T,
+                            ARMCondCode cond ) {
+   ARMInstr* i            = LibVEX_Alloc(sizeof(ARMInstr));
+   i->tag                 = ARMin_XIndir;
+   i->ARMin.XIndir.dstGA  = dstGA;
+   i->ARMin.XIndir.amR15T = amR15T;
+   i->ARMin.XIndir.cond   = cond;
+   return i;
+}
+ARMInstr* ARMInstr_XAssisted ( HReg dstGA, ARMAMode1* amR15T,
+                               ARMCondCode cond, IRJumpKind jk ) {
+   ARMInstr* i               = LibVEX_Alloc(sizeof(ARMInstr));
+   i->tag                    = ARMin_XAssisted;
+   i->ARMin.XAssisted.dstGA  = dstGA;
+   i->ARMin.XAssisted.amR15T = amR15T;
+   i->ARMin.XAssisted.cond   = cond;
+   i->ARMin.XAssisted.jk     = jk;
    return i;
 }
 ARMInstr* ARMInstr_CMov ( ARMCondCode cond, HReg dst, ARMRI84* src ) {
@@ -1479,6 +1499,21 @@
    return i;
 }
 
+ARMInstr* ARMInstr_EvCheck ( ARMAMode1* amCounter,
+                             ARMAMode1* amFailAddr ) {
+   ARMInstr* i                 = LibVEX_Alloc(sizeof(ARMInstr));
+   i->tag                      = ARMin_EvCheck;
+   i->ARMin.EvCheck.amCounter  = amCounter;
+   i->ARMin.EvCheck.amFailAddr = amFailAddr;
+   return i;
+}
+
+ARMInstr* ARMInstr_ProfInc ( void ) {
+   ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+   i->tag      = ARMin_ProfInc;
+   return i;
+}
+
 /* ... */
 
 void ppARMInstr ( ARMInstr* i ) {
@@ -1564,28 +1599,47 @@
          return;
       case ARMin_Ld8S:
          goto unhandled;
-      case ARMin_Goto:
-         if (i->ARMin.Goto.cond != ARMcc_AL) {
-            vex_printf("if (%%cpsr.%s) { ",
-                       showARMCondCode(i->ARMin.Goto.cond));
-         } else {
-            vex_printf("if (1) { ");
-         }
-         if (i->ARMin.Goto.jk != Ijk_Boring
-             && i->ARMin.Goto.jk != Ijk_Call
-             && i->ARMin.Goto.jk != Ijk_Ret) {
-            vex_printf("mov r8, $");
-            ppIRJumpKind(i->ARMin.Goto.jk);
-            vex_printf(" ; ");
-         }
-         vex_printf("mov r0, ");
-         ppHRegARM(i->ARMin.Goto.gnext);
-         vex_printf(" ; bx r14");
-         if (i->ARMin.Goto.cond != ARMcc_AL) {
-            vex_printf(" }");
-         } else {
-            vex_printf(" }");
-         }
+      case ARMin_XDirect:
+         vex_printf("(xDirect) ");
+         vex_printf("if (%%cpsr.%s) { ",
+                    showARMCondCode(i->ARMin.XDirect.cond));
+         vex_printf("movw r12,0x%x; ",
+                    (UInt)(i->ARMin.XDirect.dstGA & 0xFFFF));
+         vex_printf("movt r12,0x%x; ",
+                    (UInt)((i->ARMin.XDirect.dstGA >> 16) & 0xFFFF));
+         vex_printf("str r12,");
+         ppARMAMode1(i->ARMin.XDirect.amR15T);
+         vex_printf("; movw r12,LO16($disp_cp_chain_me_to_%sEP); ",
+                    i->ARMin.XDirect.toFastEP ? "fast" : "slow");
+         vex_printf("movt r12,HI16($disp_cp_chain_me_to_%sEP); ",
+                    i->ARMin.XDirect.toFastEP ? "fast" : "slow");
+         vex_printf("blx r12 }");
+         return;
+      case ARMin_XIndir:
+         vex_printf("(xIndir) ");
+         vex_printf("if (%%cpsr.%s) { ",
+                    showARMCondCode(i->ARMin.XIndir.cond));
+         vex_printf("str ");
+         ppHRegARM(i->ARMin.XIndir.dstGA);
+         vex_printf(",");
+         ppARMAMode1(i->ARMin.XIndir.amR15T);
+         vex_printf("; movw r12,LO16($disp_cp_xindir); ");
+         vex_printf("movt r12,HI16($disp_cp_xindir); ");
+         vex_printf("blx r12 }");
+         return;
+      case ARMin_XAssisted:
+         vex_printf("(xAssisted) ");
+         vex_printf("if (%%cpsr.%s) { ",
+                    showARMCondCode(i->ARMin.XAssisted.cond));
+         vex_printf("str ");
+         ppHRegARM(i->ARMin.XAssisted.dstGA);
+         vex_printf(",");
+         ppARMAMode1(i->ARMin.XAssisted.amR15T);
+         vex_printf("movw r8,$IRJumpKind_to_TRCVAL(%d); ",
+                    (Int)i->ARMin.XAssisted.jk);
+         vex_printf("movw r12,LO16($disp_cp_xassisted); ");
+         vex_printf("movt r12,HI16($disp_cp_xassisted); ");
+         vex_printf("blx r12 }");
          return;
       case ARMin_CMov:
          vex_printf("mov%s ", showARMCondCode(i->ARMin.CMov.cond));
@@ -1761,8 +1815,7 @@
          }
          return;
       case ARMin_MFence:
-         vex_printf("mfence (mcr 15,0,r0,c7,c10,4; 15,0,r0,c7,c10,5; "
-                    "15,0,r0,c7,c5,4)");
+         vex_printf("(mfence) dsb sy; dmb sy; isb");
          return;
       case ARMin_CLREX:
          vex_printf("clrex");
@@ -1878,6 +1931,25 @@
          vex_printf(", ");
          vex_printf("%d", i->ARMin.Add32.imm32);
          return;
+      case ARMin_EvCheck:
+         vex_printf("(evCheck) ldr r12,");
+         ppARMAMode1(i->ARMin.EvCheck.amCounter);
+         vex_printf("; subs r12,r12,$1; str r12,");
+         ppARMAMode1(i->ARMin.EvCheck.amCounter);
+         vex_printf("; bpl nofail; ldr r12,");
+         ppARMAMode1(i->ARMin.EvCheck.amFailAddr);
+         vex_printf("; bx r12; nofail:");
+         return;
+      case ARMin_ProfInc:
+         vex_printf("(profInc) movw r12,LO16($NotKnownYet); "
+                    "movw r12,HI16($NotKnownYet); "
+                    "ldr r11,[r12]; "
+                    "adds r11,r11,$1; "
+                    "str r11,[r12]; "
+                    "ldr r11,[r12+4]; "
+                    "adc r11,r11,$0; "
+                    "str r11,[r12+4]");
+         return;
       default:
       unhandled:
          vex_printf("ppARMInstr: unhandled case (tag %d)", (Int)i->tag);
@@ -1945,18 +2017,21 @@
          return;
       case ARMin_Ld8S:
          goto unhandled;
-      case ARMin_Goto:
-         /* reads the reg holding the next guest addr */
-         addHRegUse(u, HRmRead, i->ARMin.Goto.gnext);
-         /* writes it to the standard integer return register */
-         addHRegUse(u, HRmWrite, hregARM_R0());
-         /* possibly messes with the baseblock pointer */
-         if (i->ARMin.Goto.jk != Ijk_Boring
-             && i->ARMin.Goto.jk != Ijk_Call
-             && i->ARMin.Goto.jk != Ijk_Ret)
-            /* note, this is irrelevant since r8 is not actually
-               available to the allocator.  But still .. */
-            addHRegUse(u, HRmWrite, hregARM_R8());
+      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
+         conditionally exit the block.  Hence we only need to list (1)
+         the registers that they read, and (2) the registers that they
+         write in the case where the block is not exited.  (2) is
+         empty, hence only (1) is relevant here. */
+      case ARMin_XDirect:
+         addRegUsage_ARMAMode1(u, i->ARMin.XDirect.amR15T);
+         return;
+      case ARMin_XIndir:
+         addHRegUse(u, HRmRead, i->ARMin.XIndir.dstGA);
+         addRegUsage_ARMAMode1(u, i->ARMin.XIndir.amR15T);
+         return;
+      case ARMin_XAssisted:
+         addHRegUse(u, HRmRead, i->ARMin.XAssisted.dstGA);
+         addRegUsage_ARMAMode1(u, i->ARMin.XAssisted.amR15T);
          return;
       case ARMin_CMov:
          addHRegUse(u, HRmWrite, i->ARMin.CMov.dst);
@@ -2159,6 +2234,18 @@
          addHRegUse(u, HRmWrite, i->ARMin.Add32.rD);
          addHRegUse(u, HRmRead, i->ARMin.Add32.rN);
          return;
+      case ARMin_EvCheck:
+         /* We expect both amodes only to mention r8, so this is in
+            fact pointless, since r8 isn't allocatable, but
+            anyway.. */
+         addRegUsage_ARMAMode1(u, i->ARMin.EvCheck.amCounter);
+         addRegUsage_ARMAMode1(u, i->ARMin.EvCheck.amFailAddr);
+         addHRegUse(u, HRmWrite, hregARM_R12()); /* also unavail to RA */
+         return;
+      case ARMin_ProfInc:
+         addHRegUse(u, HRmWrite, hregARM_R12());
+         addHRegUse(u, HRmWrite, hregARM_R11());
+         return;
       unhandled:
       default:
          ppARMInstr(i);
@@ -2210,8 +2297,18 @@
          return;
       case ARMin_Ld8S:
          goto unhandled;
-      case ARMin_Goto:
-         i->ARMin.Goto.gnext = lookupHRegRemap(m, i->ARMin.Goto.gnext);
+      case ARMin_XDirect:
+         mapRegs_ARMAMode1(m, i->ARMin.XDirect.amR15T);
+         return;
+      case ARMin_XIndir:
+         i->ARMin.XIndir.dstGA
+            = lookupHRegRemap(m, i->ARMin.XIndir.dstGA);
+         mapRegs_ARMAMode1(m, i->ARMin.XIndir.amR15T);
+         return;
+      case ARMin_XAssisted:
+         i->ARMin.XAssisted.dstGA
+            = lookupHRegRemap(m, i->ARMin.XAssisted.dstGA);
+         mapRegs_ARMAMode1(m, i->ARMin.XAssisted.amR15T);
          return;
       case ARMin_CMov:
          i->ARMin.CMov.dst = lookupHRegRemap(m, i->ARMin.CMov.dst);
@@ -2329,6 +2426,17 @@
       case ARMin_Add32:
          i->ARMin.Add32.rD = lookupHRegRemap(m, i->ARMin.Add32.rD);
          i->ARMin.Add32.rN = lookupHRegRemap(m, i->ARMin.Add32.rN);
+         return;
+      case ARMin_EvCheck:
+         /* We expect both amodes only to mention r8, so this is in
+            fact pointless, since r8 isn't allocatable, but
+            anyway.. */
+         mapRegs_ARMAMode1(m, i->ARMin.EvCheck.amCounter);
+         mapRegs_ARMAMode1(m, i->ARMin.EvCheck.amFailAddr);
+         return;
+      case ARMin_ProfInc:
+         /* hardwires r11 and r12 -- nothing to modify. */
+         return;
       unhandled:
       default:
          ppARMInstr(i);
@@ -2586,6 +2694,9 @@
     (((zzx3) & 0xF) << 12) | (((zzx2) & 0xF) <<  8) |  \
     (((zzx1) & 0xF) <<  4) | (((zzx0) & 0xF) <<  0))
 
+#define XX______(zzx7,zzx6) \
+   ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24))
+
 /* Generate a skeletal insn that involves an RI84 shifter operand.
    Returns a word which is all zeroes apart from bits 25 and 11..0,
    since it is those that encode the shifter operand (at least to the
@@ -2704,10 +2815,92 @@
    return p;
 }
 
+/* Get an immediate into a register, using only that register, and
+   generating exactly 2 instructions, regardless of the value of the
+   immediate. This is used when generating sections of code that need
+   to be patched later, so as to guarantee a specific size. */
+static UInt* imm32_to_iregNo_EXACTLY2 ( UInt* p, Int rD, UInt imm32 )
+{
+   if (VEX_ARM_ARCHLEVEL(arm_hwcaps) > 6) {
+      /* Generate movw rD, #low16 ;  movt rD, #high16. */
+      UInt lo16 = imm32 & 0xFFFF;
+      UInt hi16 = (imm32 >> 16) & 0xFFFF;
+      UInt instr;
+      instr = XXXXXXXX(0xE, 0x3, 0x0, (lo16 >> 12) & 0xF, rD,
+                       (lo16 >> 8) & 0xF, (lo16 >> 4) & 0xF,
+                       lo16 & 0xF);
+      *p++ = instr;
+      instr = XXXXXXXX(0xE, 0x3, 0x4, (hi16 >> 12) & 0xF, rD,
+                       (hi16 >> 8) & 0xF, (hi16 >> 4) & 0xF,
+                       hi16 & 0xF);
+      *p++ = instr;
+   } else {
+      vassert(0); /* lose */
+   }
+   return p;
+}
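+
+/* Not part of this change: a self-check sketch for one concrete
+   value (requires ARMv7+ arm_hwcaps, since the helper asserts
+   otherwise; invented name). */
+static void selftest_EXACTLY2_sketch ( void )
+{
+   UInt  buf[2];
+   UInt* q = imm32_to_iregNo_EXACTLY2(buf, /*r*/12, 0x65556555);
+   vassert(q == buf + 2);
+   vassert(buf[0] == 0xE306C555); /* movw r12, #0x6555 */
+   vassert(buf[1] == 0xE346C555); /* movt r12, #0x6555 */
+}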
 
-Int emit_ARMInstr ( UChar* buf, Int nbuf, ARMInstr* i,
+/* Check whether p points at a 2-insn sequence cooked up by
+   imm32_to_iregNo_EXACTLY2(). */
+static Bool is_imm32_to_iregNo_EXACTLY2 ( UInt* p, Int rD, UInt imm32 )
+{
+   if (VEX_ARM_ARCHLEVEL(arm_hwcaps) > 6) {
+      /* Check for movw rD, #low16 ;  movt rD, #high16. */
+      UInt lo16 = imm32 & 0xFFFF;
+      UInt hi16 = (imm32 >> 16) & 0xFFFF;
+      UInt i0, i1;
+      i0 = XXXXXXXX(0xE, 0x3, 0x0, (lo16 >> 12) & 0xF, rD,
+                    (lo16 >> 8) & 0xF, (lo16 >> 4) & 0xF,
+                    lo16 & 0xF);
+      i1 = XXXXXXXX(0xE, 0x3, 0x4, (hi16 >> 12) & 0xF, rD,
+                    (hi16 >> 8) & 0xF, (hi16 >> 4) & 0xF,
+                    hi16 & 0xF);
+      return p[0] == i0 && p[1] == i1;
+   } else {
+      vassert(0); /* lose */
+   }
+}
+
+
+static UInt* do_load_or_store32 ( UInt* p,
+                                  Bool isLoad, UInt rD, ARMAMode1* am )
+{
+   vassert(rD <= 12);
+   vassert(am->tag == ARMam1_RI); // RR case is not handled
+   UInt bB = 0;
+   UInt bL = isLoad ? 1 : 0;
+   Int  simm12;
+   UInt instr, bP;
+   if (am->ARMam1.RI.simm13 < 0) {
+      bP = 0;
+      simm12 = -am->ARMam1.RI.simm13;
+   } else {
+      bP = 1;
+      simm12 = am->ARMam1.RI.simm13;
+   }
+   vassert(simm12 >= 0 && simm12 <= 4095);
+   instr = XXXXX___(X1110,X0101,BITS4(bP,bB,0,bL),
+                    iregNo(am->ARMam1.RI.reg),
+                    rD);
+   instr |= simm12;
+   *p++ = instr;
+   return p;
+}
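+
+/* Not part of this change: a self-check sketch for one concrete
+   amode, r8 + #4 -- the host_EvC_COUNTER slot that ARMin_EvCheck
+   uses below.  (Invented name.) */
+static void selftest_load_store32_sketch ( void )
+{
+   UInt  buf[1];
+   UInt* q = do_load_or_store32(buf, True/*isLoad*/, /*r*/12,
+                                ARMAMode1_RI(hregARM_R8(), 4));
+   vassert(q == buf + 1);
+   vassert(buf[0] == 0xE598C004); /* ldr r12, [r8, #+4] */
+}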
+
+
+/* Emit an instruction into buf and return the number of bytes used.
+   Note that buf is not the insn's final place, and therefore it is
+   imperative to emit position-independent code.  If the emitted
+   instruction was a profiler inc, set *is_profInc to True, else
+   leave it unchanged. */
+
+Int emit_ARMInstr ( /*MB_MOD*/Bool* is_profInc,
+                    UChar* buf, Int nbuf, ARMInstr* i, 
                     Bool mode64,
-                    void* dispatch_unassisted, void* dispatch_assisted ) 
+                    void* disp_cp_chain_me_to_slowEP,
+                    void* disp_cp_chain_me_to_fastEP,
+                    void* disp_cp_xindir,
+                    void* disp_cp_xassisted )
 {
    UInt* p = (UInt*)buf;
    vassert(nbuf >= 32);
@@ -2894,59 +3087,177 @@
       }
       case ARMin_Ld8S:
          goto bad;
-      case ARMin_Goto: {
-         UInt        instr;
-         IRJumpKind  jk    = i->ARMin.Goto.jk;
-         ARMCondCode cond  = i->ARMin.Goto.cond;
-         UInt        rnext = iregNo(i->ARMin.Goto.gnext);
-         Int         trc   = -1;
-         /* since we branch to lr(r13) to get back to dispatch: */
-         vassert(dispatch_unassisted == NULL);
-         vassert(dispatch_assisted == NULL);
-         switch (jk) {
-            case Ijk_Ret: case Ijk_Call: case Ijk_Boring:
-               break; /* no need to set GST in these common cases */
-            case Ijk_ClientReq:
-               trc = VEX_TRC_JMP_CLIENTREQ; break;
-            case Ijk_Sys_int128:
-            case Ijk_Sys_int129:
-            case Ijk_Sys_int130:
-            case Ijk_Yield:
-            case Ijk_EmWarn:
-            case Ijk_MapFail:
-               goto unhandled_jk;
-            case Ijk_NoDecode:
-               trc = VEX_TRC_JMP_NODECODE; break;
-            case Ijk_TInval:
-               trc = VEX_TRC_JMP_TINVAL; break;
-            case Ijk_NoRedir:
-               trc = VEX_TRC_JMP_NOREDIR; break;
-            case Ijk_Sys_sysenter:
-            case Ijk_SigTRAP:
-            case Ijk_SigSEGV:
-               goto unhandled_jk;
-            case Ijk_Sys_syscall:
-               trc = VEX_TRC_JMP_SYS_SYSCALL; break;
-            unhandled_jk:
-            default:
-               goto bad;
+
+      case ARMin_XDirect: {
+         /* NB: what goes on here has to be very closely coordinated
+            with the chainXDirect_ARM and unchainXDirect_ARM below. */
+         /* We're generating chain-me requests here, so we need to be
+            sure this is actually allowed -- no-redir translations
+            can't use chain-me's.  Hence: */
+         vassert(disp_cp_chain_me_to_slowEP != NULL);
+         vassert(disp_cp_chain_me_to_fastEP != NULL);
+
+         /* Use ptmp for backpatching conditional jumps. */
+         UInt* ptmp = NULL;
+
+         /* First off, if this is conditional, create a conditional
+            jump over the rest of it.  Or at least, leave a space for
+            it that we will shortly fill in. */
+         if (i->ARMin.XDirect.cond != ARMcc_AL) {
+            vassert(i->ARMin.XDirect.cond != ARMcc_NV);
+            ptmp = p;
+            *p++ = 0;
          }
-         if (trc != -1) {
-            // mov{cond} r8, #trc
-            vassert(trc >= 0 && trc <= 255);
-            instr = (cond << 28) | 0x03A08000 | (0xFF & (UInt)trc);
-            *p++ = instr;
+
+         /* Update the guest R15T. */
+         /* movw r12, lo16(dstGA) */
+         /* movt r12, hi16(dstGA) */
+         /* str r12, amR15T */
+         p = imm32_to_iregNo(p, /*r*/12, i->ARMin.XDirect.dstGA);
+         p = do_load_or_store32(p, False/*!isLoad*/,
+                                /*r*/12, i->ARMin.XDirect.amR15T);
+
+         /* --- FIRST PATCHABLE BYTE follows --- */
+         /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're
+            calling to) backs up the return address, so as to find the
+            address of the first patchable byte.  So: don't change the
+            number of instructions (3) below. */
+         /* movw r12, lo16(VG_(disp_cp_chain_me_to_{slowEP,fastEP})) */
+         /* movt r12, hi16(VG_(disp_cp_chain_me_to_{slowEP,fastEP})) */
+         /* blx  r12  (A1) */
+         void* disp_cp_chain_me
+                  = i->ARMin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP 
+                                              : disp_cp_chain_me_to_slowEP;
+         p = imm32_to_iregNo_EXACTLY2(p, /*r*/12,
+                                      (UInt)Ptr_to_ULong(disp_cp_chain_me));
+         *p++ = 0xE12FFF3C;
+         /* --- END of PATCHABLE BYTES --- */
+
+         /* Fix up the conditional jump, if there was one. */
+         if (i->ARMin.XDirect.cond != ARMcc_AL) {
+            Int delta = (UChar*)p - (UChar*)ptmp; /* must be signed */
+            vassert(delta > 0 && delta < 40);
+            vassert((delta & 3) == 0);
+            UInt notCond = 1 ^ (UInt)i->ARMin.XDirect.cond;
+            vassert(notCond <= 13); /* Neither AL nor NV */
+            delta = (delta >> 2) - 2;
+            *ptmp = XX______(notCond, X1010) | (delta & 0xFFFFFF);
          }
-         // mov{cond} r0, rnext
-         if (rnext != 0) {
-            instr = (cond << 28) | 0x01A00000 | rnext;
-            *p++ = instr;
-         }
-         // bx{cond} r14
-         instr =(cond << 28) | 0x012FFF1E;
-         *p++ = instr;
          goto done;
       }
+
+      case ARMin_XIndir: {
+         /* We're generating transfers that could lead indirectly to a
+            chain-me, so we need to be sure this is actually allowed
+            -- no-redir translations are not allowed to reach normal
+            translations without going through the scheduler.  That
+            means no XDirects or XIndirs out from no-redir
+            translations.  Hence: */
+         vassert(disp_cp_xindir != NULL);
+
+         /* Use ptmp for backpatching conditional jumps. */
+         UInt* ptmp = NULL;
+
+         /* First off, if this is conditional, create a conditional
+            jump over the rest of it.  Or at least, leave a space for
+            it that we will shortly fill in. */
+         if (i->ARMin.XIndir.cond != ARMcc_AL) {
+            vassert(i->ARMin.XIndir.cond != ARMcc_NV);
+            ptmp = p;
+            *p++ = 0;
+         }
+
+         /* Update the guest R15T. */
+         /* str r-dstGA, amR15T */
+         p = do_load_or_store32(p, False/*!isLoad*/,
+                                iregNo(i->ARMin.XIndir.dstGA),
+                                i->ARMin.XIndir.amR15T);
+
+         /* movw r12, lo16(VG_(disp_cp_xindir)) */
+         /* movt r12, hi16(VG_(disp_cp_xindir)) */
+         /* bx   r12  (A1) */
+         p = imm32_to_iregNo(p, /*r*/12,
+                             (UInt)Ptr_to_ULong(disp_cp_xindir));
+         *p++ = 0xE12FFF1C;
+
+         /* Fix up the conditional jump, if there was one. */
+         if (i->ARMin.XIndir.cond != ARMcc_AL) {
+            Int delta = (UChar*)p - (UChar*)ptmp; /* must be signed */
+            vassert(delta > 0 && delta < 40);
+            vassert((delta & 3) == 0);
+            UInt notCond = 1 ^ (UInt)i->ARMin.XIndir.cond;
+            vassert(notCond <= 13); /* Neither AL nor NV */
+            delta = (delta >> 2) - 2;
+            *ptmp = XX______(notCond, X1010) | (delta & 0xFFFFFF);
+         }
+         goto done;
+      }
+
+      case ARMin_XAssisted: {
+         /* Use ptmp for backpatching conditional jumps. */
+         UInt* ptmp = NULL;
+
+         /* First off, if this is conditional, create a conditional
+            jump over the rest of it.  Or at least, leave a space for
+            it that we will shortly fill in. */
+         if (i->ARMin.XAssisted.cond != ARMcc_AL) {
+            vassert(i->ARMin.XAssisted.cond != ARMcc_NV);
+            ptmp = p;
+            *p++ = 0;
+         }
+
+         /* Update the guest R15T. */
+         /* str r-dstGA, amR15T */
+         p = do_load_or_store32(p, False/*!isLoad*/,
+                                iregNo(i->ARMin.XAssisted.dstGA),
+                                i->ARMin.XAssisted.amR15T);
+
+         /* movw r8,  $magic_number */
+         UInt trcval = 0;
+         switch (i->ARMin.XAssisted.jk) {
+            case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
+            case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
+            //case Ijk_Sys_int128:  trcval = VEX_TRC_JMP_SYS_INT128;  break;
+            //case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
+            //case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
+            //case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
+            case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
+            //case Ijk_TInval:      trcval = VEX_TRC_JMP_TINVAL;      break;
+            case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
+            //case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
+            //case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
+            case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
+            /* We don't expect to see the following being assisted. */
+            //case Ijk_Ret:
+            //case Ijk_Call:
+            /* fallthrough */
+            default: 
+               ppIRJumpKind(i->ARMin.XAssisted.jk);
+               vpanic("emit_ARMInstr.ARMin_XAssisted: unexpected jump kind");
+         }
+         vassert(trcval != 0);
+         p = imm32_to_iregNo(p, /*r*/8, trcval);
+
+         /* movw r12, lo16(VG_(disp_cp_xassisted)) */
+         /* movt r12, hi16(VG_(disp_cp_xassisted)) */
+         /* bx   r12  (A1) */
+         p = imm32_to_iregNo(p, /*r*/12,
+                             (UInt)Ptr_to_ULong(disp_cp_xassisted));
+         *p++ = 0xE12FFF1C;
+
+         /* Fix up the conditional jump, if there was one. */
+         if (i->ARMin.XAssisted.cond != ARMcc_AL) {
+            Int delta = (UChar*)p - (UChar*)ptmp; /* must be signed */
+            vassert(delta > 0 && delta < 40);
+            vassert((delta & 3) == 0);
+            UInt notCond = 1 ^ (UInt)i->ARMin.XAssisted.cond;
+            vassert(notCond <= 13); /* Neither AL nor NV */
+            delta = (delta >> 2) - 2;
+            *ptmp = XX______(notCond, X1010) | (delta & 0xFFFFFF);
+         }
+         goto done;
+      }
+
       case ARMin_CMov: {
          UInt instr  = skeletal_RI84(i->ARMin.CMov.src);
          UInt subopc = X1101; /* MOV */
@@ -3293,9 +3604,15 @@
          goto bad; // FPSCR -> iReg case currently ATC
       }
       case ARMin_MFence: {
-         *p++ = 0xEE070F9A; /* mcr 15,0,r0,c7,c10,4 (DSB) */
-         *p++ = 0xEE070FBA; /* mcr 15,0,r0,c7,c10,5 (DMB) */
-         *p++ = 0xEE070F95; /* mcr 15,0,r0,c7,c5,4  (ISB) */
+         // It's not clear (to me) how these relate to the ARMv7
+         // versions, so let's just use the v7 versions as they
+         // are at least well documented.
+         //*p++ = 0xEE070F9A; /* mcr 15,0,r0,c7,c10,4 (DSB) */
+         //*p++ = 0xEE070FBA; /* mcr 15,0,r0,c7,c10,5 (DMB) */
+         //*p++ = 0xEE070F95; /* mcr 15,0,r0,c7,c5,4  (ISB) */
+         *p++ = 0xF57FF04F; /* DSB sy */
+         *p++ = 0xF57FF05F; /* DMB sy */
+         *p++ = 0xF57FF06F; /* ISB */
          goto done;
       }
       case ARMin_CLREX: {
@@ -4099,6 +4416,62 @@
          *p++ = insn;
          goto done;
       }
+
+      case ARMin_EvCheck: {
+         /* We generate:
+               ldr  r12, [r8 + #4]   4 == offsetof(host_EvC_COUNTER)
+               subs r12, r12, #1  (A1)
+               str  r12, [r8 + #4]   4 == offsetof(host_EvC_COUNTER)
+               bpl  nofail
+               ldr  r12, [r8 + #0]   0 == offsetof(host_EvC_FAILADDR)
+               bx   r12
+              nofail:
+         */
+         UInt* p0 = p;
+         p = do_load_or_store32(p, True/*isLoad*/, /*r*/12,
+                                i->ARMin.EvCheck.amCounter);
+         *p++ = 0xE25CC001; /* subs r12, r12, #1 */
+         p = do_load_or_store32(p, False/*!isLoad*/, /*r*/12,
+                                i->ARMin.EvCheck.amCounter);
+         *p++ = 0x5A000001; /* bpl nofail */
+         p = do_load_or_store32(p, True/*isLoad*/, /*r*/12,
+                                i->ARMin.EvCheck.amFailAddr);
+         *p++ = 0xE12FFF1C; /* bx r12 */
+         /* nofail: */
+
+         /* Crosscheck */
+         vassert(evCheckSzB_ARM() == (UChar*)p - (UChar*)p0);
+         goto done;
+      }
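+
+      /* Not part of this change: a file-scope decoding aid listing
+         the six words the case above emits, assuming amCounter is
+         r8 + #4 and amFailAddr is r8 + #0. */
+      static const UInt evcheck_arm_sketch[6] = {
+         0xE598C004, /* ldr  r12, [r8, #+4]  -- load counter   */
+         0xE25CC001, /* subs r12, r12, #1                      */
+         0xE588C004, /* str  r12, [r8, #+4]  -- store it back  */
+         0x5A000001, /* bpl  nofail          -- expected taken */
+         0xE598C000, /* ldr  r12, [r8, #+0]  -- load fail addr */
+         0xE12FFF1C  /* bx   r12                               */
+      };
+      /* sizeof(evcheck_arm_sketch) == 24 == evCheckSzB_ARM() */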
+
+      case ARMin_ProfInc: {
+         /* We generate:
+              (ctrP is unknown now, so use 0x65556555 in the
+              expectation that a later call to LibVEX_patchProfCtr
+              will be used to fill in the immediate fields once the
+              right value is known.)
+            movw r12, lo16(0x65556555)
+            movt r12, hi16(0x65556555)
+            ldr  r11, [r12]
+            adds r11, r11, #1
+            str  r11, [r12]
+            ldr  r11, [r12+4]
+            adc  r11, r11, #0
+            str  r11, [r12+4]
+         */
+         p = imm32_to_iregNo_EXACTLY2(p, /*r*/12, 0x65556555);
+         *p++ = 0xE59CB000;
+         *p++ = 0xE29BB001;
+         *p++ = 0xE58CB000;
+         *p++ = 0xE59CB004;
+         *p++ = 0xE2ABB000;
+         *p++ = 0xE58CB004;
+         /* Tell the caller .. */
+         vassert(!(*is_profInc));
+         *is_profInc = True;
+         goto done;
+      }
+
       /* ... */
       default: 
          goto bad;
@@ -4114,6 +4487,109 @@
    return ((UChar*)p) - &buf[0];
 }
 
+
+/* How big is an event check?  See case for ARMin_EvCheck in
+   emit_ARMInstr just above.  That crosschecks what this returns, so
+   we can tell if we're inconsistent. */
+Int evCheckSzB_ARM ( void )
+{
+   return 24;
+}
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   emitInstr case for XDirect, above. */
+VexInvalRange chainXDirect_ARM ( void* place_to_chain,
+                                 void* disp_cp_chain_me_EXPECTED,
+                                 void* place_to_jump_to )
+{
+   /* What we're expecting to see is:
+        movw r12, lo16(disp_cp_chain_me_to_EXPECTED)
+        movt r12, hi16(disp_cp_chain_me_to_EXPECTED)
+        blx  r12
+      viz
+        <8 bytes generated by imm32_to_iregNo_EXACTLY2>
+        E1 2F FF 3C
+   */
+   UInt* p = (UInt*)place_to_chain;
+   vassert(0 == (3 & (HWord)p));
+   vassert(is_imm32_to_iregNo_EXACTLY2(
+              p, /*r*/12, (UInt)Ptr_to_ULong(disp_cp_chain_me_EXPECTED)));
+   vassert(p[2] == 0xE12FFF3C);
+   /* And what we want to change it to is:
+        movw r12, lo16(place_to_jump_to)
+        movt r12, hi16(place_to_jump_to)
+        bx   r12
+      viz
+        <8 bytes generated by imm32_to_iregNo_EXACTLY2>
+        E1 2F FF 1C
+      The replacement has the same length as the original.
+   */
+   (void)imm32_to_iregNo_EXACTLY2(
+            p, /*r*/12, (UInt)Ptr_to_ULong(place_to_jump_to));
+   p[2] = 0xE12FFF1C;
+   VexInvalRange vir = {(HWord)p, 12};
+   return vir;
+}
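
Chaining and unchaining rewrite the same 12 bytes, and only ever toggle the final word between BLX (call the chain-me stub, which recovers the patch address from the saved return address) and BX (jump straight to the chained-to block). The two encodings differ in a single bit:

   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
      uint32_t blx_r12 = 0xE12FFF3C;  /* unchained: call disp_cp_chain_me */
      uint32_t bx_r12  = 0xE12FFF1C;  /* chained: jump to the target      */
      /* Only bit 5, the BX-vs-BLX selector, changes when (un)chaining: */
      printf("difference = 0x%08X\n", blx_r12 ^ bx_r12);  /* 0x00000020 */
      return 0;
   }
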
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   emitInstr case for XDirect, above. */
+VexInvalRange unchainXDirect_ARM ( void* place_to_unchain,
+                                   void* place_to_jump_to_EXPECTED,
+                                   void* disp_cp_chain_me )
+{
+   /* What we're expecting to see is:
+        movw r12, lo16(place_to_jump_to_EXPECTED)
+        movt r12, hi16(place_to_jump_to_EXPECTED)
+        bx   r12
+      viz
+        <8 bytes generated by imm32_to_iregNo_EXACTLY2>
+        E1 2F FF 1C
+   */
+   UInt* p = (UInt*)place_to_unchain;
+   vassert(0 == (3 & (HWord)p));
+   vassert(is_imm32_to_iregNo_EXACTLY2(
+              p, /*r*/12, (UInt)Ptr_to_ULong(place_to_jump_to_EXPECTED)));
+   vassert(p[2] == 0xE12FFF1C);
+   /* And what we want to change it to is:
+        movw r12, lo16(disp_cp_chain_me)
+        movt r12, hi16(disp_cp_chain_me)
+        blx  r12
+      viz
+        <8 bytes generated by imm32_to_iregNo_EXACTLY2>
+        E1 2F FF 3C
+   */
+   (void)imm32_to_iregNo_EXACTLY2(
+            p, /*r*/12, (UInt)Ptr_to_ULong(disp_cp_chain_me));
+   p[2] = 0xE12FFF3C;
+   VexInvalRange vir = {(HWord)p, 12};
+   return vir;
+}
+
+
+/* Patch the counter address into a profile inc point, as previously
+   created by the ARMin_ProfInc case for emit_ARMInstr. */
+VexInvalRange patchProfInc_ARM ( void*  place_to_patch,
+                                 ULong* location_of_counter )
+{
+   vassert(sizeof(ULong*) == 4);
+   UInt* p = (UInt*)place_to_patch;
+   vassert(0 == (3 & (HWord)p));
+   vassert(is_imm32_to_iregNo_EXACTLY2(p, /*r*/12, 0x65556555));
+   vassert(p[2] == 0xE59CB000);
+   vassert(p[3] == 0xE29BB001);
+   vassert(p[4] == 0xE58CB000);
+   vassert(p[5] == 0xE59CB004);
+   vassert(p[6] == 0xE2ABB000);
+   vassert(p[7] == 0xE58CB004);
+   imm32_to_iregNo_EXACTLY2(p, /*r*/12, 
+                            (UInt)Ptr_to_ULong(location_of_counter));
+   VexInvalRange vir = {(HWord)p, 8};
+   return vir;
+}
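
The invalidation range is only 8 bytes because just the movw/movt pair is rewritten; the six load/add/store words after it never change. For reference, a sketch of the A1 movw/movt encodings in that window (the helper names are illustrative, not part of VEX; the real emission is done by imm32_to_iregNo_EXACTLY2):

   #include <stdio.h>
   #include <stdint.h>

   /* A1 "movw r12,#imm16" and "movt r12,#imm16". */
   static uint32_t movw_r12(uint32_t imm16)
   { return 0xE3000000u | ((imm16 >> 12) << 16) | (12u << 12) | (imm16 & 0xFFFu); }

   static uint32_t movt_r12(uint32_t imm16)
   { return 0xE3400000u | ((imm16 >> 12) << 16) | (12u << 12) | (imm16 & 0xFFFu); }

   int main(void)
   {
      uint32_t ctr_addr = 0x12345678u;   /* stand-in counter address */
      printf("movw r12: %08X\n", movw_r12(ctr_addr & 0xFFFFu));
      printf("movt r12: %08X\n", movt_r12(ctr_addr >> 16));
      /* The 0x65556555 placeholder encodes as 0xE306C555 / 0xE346C555. */
      return 0;
   }
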
+
+
 #undef BITS4
 #undef X0000
 #undef X0001
@@ -4136,6 +4612,7 @@
 #undef XXX___XX
 #undef XXXXX__X
 #undef XXXXXXXX
+#undef XX______
 
 /*---------------------------------------------------------------*/
 /*--- end                                     host_arm_defs.c ---*/
diff --git a/priv/host_arm_defs.h b/priv/host_arm_defs.h
index 0dea3f5..7eb4f3e 100644
--- a/priv/host_arm_defs.h
+++ b/priv/host_arm_defs.h
@@ -564,7 +564,9 @@
       ARMin_LdSt16,
       ARMin_LdSt8U,
       ARMin_Ld8S,
-      ARMin_Goto,
+      ARMin_XDirect,     /* direct transfer to GA */
+      ARMin_XIndir,      /* indirect transfer to GA */
+      ARMin_XAssisted,   /* assisted transfer to GA */
       ARMin_CMov,
       ARMin_Call,
       ARMin_Mul,
@@ -604,9 +606,10 @@
          allocator demands them to consist of no more than two instructions.
          We will split this instruction into 2 or 3 ARM instructions on the
          emiting phase.
-
          NOTE: source and destination registers should be different! */
-      ARMin_Add32
+      ARMin_Add32,
+      ARMin_EvCheck,     /* Event check */
+      ARMin_ProfInc      /* 64-bit profile counter increment */
    }
    ARMInstrTag;
 
@@ -676,13 +679,30 @@
             HReg       rD;
             ARMAMode2* amode;
          } Ld8S;
-         /* Pseudo-insn.  Go to guest address gnext, on given
-            condition, which could be ARMcc_AL. */
+         /* Update the guest R15T value, then exit requesting to chain
+            to it.  May be conditional.  Urr, use of Addr32 implicitly
+            assumes that wordsize(guest) == wordsize(host). */
          struct {
+            Addr32      dstGA;    /* next guest address */
+            ARMAMode1*  amR15T;   /* amode in guest state for R15T */
+            ARMCondCode cond;     /* can be ARMcc_AL */
+            Bool        toFastEP; /* chain to the slow or fast point? */
+         } XDirect;
+         /* Boring transfer to a guest address not known at JIT time.
+            Not chainable.  May be conditional. */
+         struct {
+            HReg        dstGA;
+            ARMAMode1*  amR15T;
+            ARMCondCode cond; /* can be ARMcc_AL */
+         } XIndir;
+         /* Assisted transfer to a guest address, most general case.
+            Not chainable.  May be conditional. */
+         struct {
+            HReg        dstGA;
+            ARMAMode1*  amR15T;
+            ARMCondCode cond; /* can be ARMcc_AL */
             IRJumpKind  jk;
-            ARMCondCode cond;
-            HReg        gnext;
-         } Goto;
+         } XAssisted;
          /* Mov src to dst on the given condition, which may not
             be ARMcc_AL. */
          struct {
@@ -905,6 +925,15 @@
             HReg rN;
             UInt imm32;
          } Add32;
+         struct {
+            ARMAMode1* amCounter;
+            ARMAMode1* amFailAddr;
+         } EvCheck;
+         struct {
+            /* No fields.  The address of the counter to inc is
+               installed later, post-translation, by patching it in,
+               as it is not known at translation time. */
+         } ProfInc;
       } ARMin;
    }
    ARMInstr;
@@ -921,7 +950,12 @@
                                      HReg, ARMAMode2* );
 extern ARMInstr* ARMInstr_LdSt8U   ( Bool isLoad, HReg, ARMAMode1* );
 extern ARMInstr* ARMInstr_Ld8S     ( HReg, ARMAMode2* );
-extern ARMInstr* ARMInstr_Goto     ( IRJumpKind, ARMCondCode, HReg gnext );
+extern ARMInstr* ARMInstr_XDirect  ( Addr32 dstGA, ARMAMode1* amR15T,
+                                     ARMCondCode cond, Bool toFastEP );
+extern ARMInstr* ARMInstr_XIndir   ( HReg dstGA, ARMAMode1* amR15T,
+                                     ARMCondCode cond );
+extern ARMInstr* ARMInstr_XAssisted ( HReg dstGA, ARMAMode1* amR15T,
+                                      ARMCondCode cond, IRJumpKind jk );
 extern ARMInstr* ARMInstr_CMov     ( ARMCondCode, HReg dst, ARMRI84* src );
 extern ARMInstr* ARMInstr_Call     ( ARMCondCode, HWord, Int nArgRegs );
 extern ARMInstr* ARMInstr_Mul      ( ARMMulOp op );
@@ -957,6 +991,9 @@
 extern ARMInstr* ARMInstr_NeonImm  ( HReg, ARMNImm* );
 extern ARMInstr* ARMInstr_NCMovQ   ( ARMCondCode, HReg, HReg );
 extern ARMInstr* ARMInstr_Add32    ( HReg rD, HReg rN, UInt imm32 );
+extern ARMInstr* ARMInstr_EvCheck  ( ARMAMode1* amCounter,
+                                     ARMAMode1* amFailAddr );
+extern ARMInstr* ARMInstr_ProfInc  ( void );
 
 extern void ppARMInstr ( ARMInstr* );
 
@@ -966,10 +1003,13 @@
 extern void getRegUsage_ARMInstr ( HRegUsage*, ARMInstr*, Bool );
 extern void mapRegs_ARMInstr     ( HRegRemap*, ARMInstr*, Bool );
 extern Bool isMove_ARMInstr      ( ARMInstr*, HReg*, HReg* );
-extern Int  emit_ARMInstr        ( UChar* buf, Int nbuf, ARMInstr*, 
-                                   Bool,
-                                   void* dispatch_unassisted,
-                                   void* dispatch_assisted );
+extern Int  emit_ARMInstr        ( /*MB_MOD*/Bool* is_profInc,
+                                   UChar* buf, Int nbuf, ARMInstr* i, 
+                                   Bool mode64,
+                                   void* disp_cp_chain_me_to_slowEP,
+                                   void* disp_cp_chain_me_to_fastEP,
+                                   void* disp_cp_xindir,
+                                   void* disp_cp_xassisted );
 
 extern void genSpill_ARM  ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
                             HReg rreg, Int offset, Bool );
@@ -977,8 +1017,34 @@
                             HReg rreg, Int offset, Bool );
 
 extern void getAllocableRegs_ARM ( Int*, HReg** );
-extern HInstrArray* iselSB_ARM   ( IRSB*, VexArch,
-                                   VexArchInfo*, VexAbiInfo* );
+extern HInstrArray* iselSB_ARM   ( IRSB*, 
+                                   VexArch,
+                                   VexArchInfo*,
+                                   VexAbiInfo*,
+                                   Int offs_Host_EvC_Counter,
+                                   Int offs_Host_EvC_FailAddr,
+                                   Bool chainingAllowed,
+                                   Bool addProfInc,
+                                   Addr64 max_ga );
+
+/* How big is an event check?  This is kind of a kludge because it
+   depends on the offsets of host_EvC_FAILADDR and
+   host_EvC_COUNTER. */
+extern Int evCheckSzB_ARM ( void );
+
+/* Perform a chaining and unchaining of an XDirect jump. */
+extern VexInvalRange chainXDirect_ARM ( void* place_to_chain,
+                                        void* disp_cp_chain_me_EXPECTED,
+                                        void* place_to_jump_to );
+
+extern VexInvalRange unchainXDirect_ARM ( void* place_to_unchain,
+                                          void* place_to_jump_to_EXPECTED,
+                                          void* disp_cp_chain_me );
+
+/* Patch the counter location into an existing ProfInc point. */
+extern VexInvalRange patchProfInc_ARM ( void*  place_to_patch,
+                                        ULong* location_of_counter );
+
 
 #endif /* ndef __VEX_HOST_ARM_DEFS_H */
 
diff --git a/priv/host_arm_isel.c b/priv/host_arm_isel.c
index e695567..7ddd077 100644
--- a/priv/host_arm_isel.c
+++ b/priv/host_arm_isel.c
@@ -84,9 +84,6 @@
              32-bit virtual HReg, which holds the high half
              of the value.
 
-   - The name of the vreg in which we stash a copy of the link reg, so
-     helper functions don't kill it.
-
    - The code array, that is, the insns selected so far.
 
    - A counter, for generating new virtual registers.
@@ -94,23 +91,38 @@
    - The host hardware capabilities word.  This is set at the start
      and does not change.
 
-   Note, this is all host-independent.  */
+   - A Bool for indicating whether we may generate chain-me
+     instructions for control flow transfers, or whether we must use
+     XAssisted.
+
+   - The maximum guest address of any guest insn in this block.
+     Actually, the address of the highest-addressed byte from any insn
+     in this block.  Is set at the start and does not change.  This is
+     used for detecting jumps which are definitely forward-edges from
+     this block, and therefore can be made (chained) to the fast entry
+     point of the destination, thereby avoiding the destination's
+     event check.
+
+   Note, this is all (well, mostly) host-independent.
+*/
 
 typedef
    struct {
+      /* Constants -- set at the start and do not change. */
       IRTypeEnv*   type_env;
 
       HReg*        vregmap;
       HReg*        vregmapHI;
       Int          n_vregmap;
 
-      HReg         savedLR;
-
-      HInstrArray* code;
-
-      Int          vreg_ctr;
-
       UInt         hwcaps;
+
+      Bool         chainingAllowed;
+      Addr64       max_ga;
+
+      /* These are modified as we go along. */
+      HInstrArray* code;
+      Int          vreg_ctr;
    }
    ISelEnv;
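
The max_ga test described in the comment above reduces to one unsigned comparison. As a sketch:

   #include <assert.h>

   typedef unsigned long long Addr64;

   /* A direct jump strictly beyond the last byte of this block cannot
      loop back into it, so it may chain to the destination's fast
      entry point and skip that block's event check. */
   static int can_target_fast_ep(Addr64 dstGA, Addr64 max_ga)
   {
      return dstGA > max_ga;
   }

   int main(void)
   {
      Addr64 max_ga = 0x1000BULL;  /* highest-addressed byte in block */
      assert( can_target_fast_ep(0x1000CULL, max_ga)); /* forward edge  */
      assert(!can_target_fast_ep(0x10000ULL, max_ga)); /* possible loop */
      return 0;
   }
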
 
@@ -1514,7 +1526,7 @@
          }
          case Iop_64to8: {
             HReg rHi, rLo;
-            if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+            if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
                HReg tHi = newVRegI(env);
                HReg tLo = newVRegI(env);
                HReg tmp = iselNeon64Expr(env, e->Iex.Unop.arg);
@@ -1819,7 +1831,7 @@
 
    /* read 64-bit IRTemp */
    if (e->tag == Iex_RdTmp) {
-      if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+      if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
          HReg tHi = newVRegI(env);
          HReg tLo = newVRegI(env);
          HReg tmp = iselNeon64Expr(env, e);
@@ -2028,7 +2040,7 @@
    /* It is convenient sometimes to call iselInt64Expr even when we
       have NEON support (e.g. in do_helper_call we need 64-bit
       arguments as 2 x 32 regs). */
-   if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+   if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
       HReg tHi = newVRegI(env);
       HReg tLo = newVRegI(env);
       HReg tmp = iselNeon64Expr(env, e);
@@ -5339,7 +5351,7 @@
    if (e->tag == Iex_Unop) {
       switch (e->Iex.Unop.op) {
          case Iop_ReinterpI64asF64: {
-            if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+            if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
                return iselNeon64Expr(env, e->Iex.Unop.arg);
             } else {
                HReg srcHi, srcLo;
@@ -5631,7 +5643,7 @@
          return;
       }
       if (tyd == Ity_I64) {
-         if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+         if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
             HReg dD = iselNeon64Expr(env, stmt->Ist.Store.data);
             ARMAModeN* am = iselIntExpr_AModeN(env, stmt->Ist.Store.addr);
             addInstr(env, ARMInstr_NLdStD(False, dD, am));
@@ -5680,7 +5692,7 @@
            return;
        }
        if (tyd == Ity_I64) {
-          if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+          if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
              HReg addr = newVRegI(env);
              HReg qD = iselNeon64Expr(env, stmt->Ist.Put.data);
              addInstr(env, ARMInstr_Add32(addr, hregARM_R8(),
@@ -5765,7 +5777,7 @@
          return;
       }
       if (ty == Ity_I64) {
-         if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+         if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
             HReg src = iselNeon64Expr(env, stmt->Ist.WrTmp.data);
             HReg dst = lookupIRTemp(env, tmp);
             addInstr(env, ARMInstr_NUnary(ARMneon_COPY, dst, src, 4, False));
@@ -5824,7 +5836,7 @@
       retty = typeOfIRTemp(env->type_env, d->tmp);
 
       if (retty == Ity_I64) {
-         if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+         if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
             HReg tmp = lookupIRTemp(env, d->tmp);
             addInstr(env, ARMInstr_VXferD(True, tmp, hregARM_R1(),
                                                      hregARM_R0()));
@@ -5878,7 +5890,7 @@
                move it into a result register pair.  On a NEON capable
                CPU, the result register will be a 64 bit NEON
                register, so we must move it there instead. */
-            if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+            if (env->hwcaps & VEX_HWCAPS_ARM_NEON) {
                HReg dst = lookupIRTemp(env, res);
                addInstr(env, ARMInstr_VXferD(True, dst, hregARM_R3(),
                                                         hregARM_R2()));
@@ -5964,15 +5976,53 @@
 
    /* --------- EXIT --------- */
    case Ist_Exit: {
-      HReg        gnext;
-      ARMCondCode cc;
       if (stmt->Ist.Exit.dst->tag != Ico_U32)
          vpanic("isel_arm: Ist_Exit: dst is not a 32-bit value");
-      gnext = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
-      cc    = iselCondCode(env, stmt->Ist.Exit.guard);
-      addInstr(env, mk_iMOVds_RR(hregARM_R14(), env->savedLR));
-      addInstr(env, ARMInstr_Goto(stmt->Ist.Exit.jk, cc, gnext));
-      return;
+
+      ARMCondCode cc     = iselCondCode(env, stmt->Ist.Exit.guard);
+      ARMAMode1*  amR15T = ARMAMode1_RI(hregARM_R8(),
+                                        stmt->Ist.Exit.offsIP);
+
+      /* Case: boring transfer to known address */
+      if (stmt->Ist.Exit.jk == Ijk_Boring
+          || stmt->Ist.Exit.jk == Ijk_Call
+          || stmt->Ist.Exit.jk == Ijk_Ret) {
+         if (env->chainingAllowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool toFastEP
+               = ((Addr64)stmt->Ist.Exit.dst->Ico.U32) > env->max_ga;
+            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
+            addInstr(env, ARMInstr_XDirect(stmt->Ist.Exit.dst->Ico.U32,
+                                           amR15T, cc, toFastEP));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, ARMInstr_XAssisted(r, amR15T, cc, Ijk_Boring));
+         }
+         return;
+      }
+
+      /* Case: assisted transfer to arbitrary address */
+      switch (stmt->Ist.Exit.jk) {
+         //case Ijk_MapFail:
+         //case Ijk_SigSEGV: case Ijk_TInval: case Ijk_EmWarn:
+         case Ijk_NoDecode:
+         {
+            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, ARMInstr_XAssisted(r, amR15T, cc,
+                                             stmt->Ist.Exit.jk));
+            return;
+         }
+         default:
+            break;
+      }
+
+      /* Do we ever expect to see any other kind? */
+      goto stmt_fail;
    }
 
    default: break;
@@ -5987,19 +6037,85 @@
 /*--- ISEL: Basic block terminators (Nexts)             ---*/
 /*---------------------------------------------------------*/
 
-static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
+static void iselNext ( ISelEnv* env,
+                       IRExpr* next, IRJumpKind jk, Int offsIP )
 {
-   HReg rDst;
    if (vex_traceflags & VEX_TRACE_VCODE) {
-      vex_printf("\n-- goto {");
+      vex_printf( "\n-- PUT(%d) = ", offsIP);
+      ppIRExpr( next );
+      vex_printf( "; exit-");
       ppIRJumpKind(jk);
-      vex_printf("} ");
-      ppIRExpr(next);
-      vex_printf("\n");
+      vex_printf( "\n");
    }
-   rDst = iselIntExpr_R(env, next);
-   addInstr(env, mk_iMOVds_RR(hregARM_R14(), env->savedLR));
-   addInstr(env, ARMInstr_Goto(jk, ARMcc_AL, rDst));
+
+   /* Case: boring transfer to known address */
+   if (next->tag == Iex_Const) {
+      IRConst* cdst = next->Iex.Const.con;
+      vassert(cdst->tag == Ico_U32);
+      if (jk == Ijk_Boring || jk == Ijk_Call) {
+         /* Boring transfer to known address */
+         ARMAMode1* amR15T = ARMAMode1_RI(hregARM_R8(), offsIP);
+         if (env->chainingAllowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool toFastEP
+               = ((Addr64)cdst->Ico.U32) > env->max_ga;
+            if (0) vex_printf("%s", toFastEP ? "X" : ".");
+            addInstr(env, ARMInstr_XDirect(cdst->Ico.U32,
+                                           amR15T, ARMcc_AL, 
+                                           toFastEP));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg r = iselIntExpr_R(env, next);
+            addInstr(env, ARMInstr_XAssisted(r, amR15T, ARMcc_AL,
+                                             Ijk_Boring));
+         }
+         return;
+      }
+   }
+
+   /* Case: call/return (==boring) transfer to any address */
+   switch (jk) {
+      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
+         HReg       r      = iselIntExpr_R(env, next);
+         ARMAMode1* amR15T = ARMAMode1_RI(hregARM_R8(), offsIP);
+         if (env->chainingAllowed) {
+            addInstr(env, ARMInstr_XIndir(r, amR15T, ARMcc_AL));
+         } else {
+            addInstr(env, ARMInstr_XAssisted(r, amR15T, ARMcc_AL,
+                                                Ijk_Boring));
+         }
+         return;
+      }
+      default:
+         break;
+   }
+
+   /* Case: some other kind of transfer to any address */
+   switch (jk) {
+      case Ijk_Sys_syscall: case Ijk_ClientReq: case Ijk_NoDecode:
+      case Ijk_NoRedir:
+      //case Ijk_Sys_int128: 
+      //case Ijk_Yield: case Ijk_SigTRAP:
+      {
+         HReg       r      = iselIntExpr_R(env, next);
+         ARMAMode1* amR15T = ARMAMode1_RI(hregARM_R8(), offsIP);
+         addInstr(env, ARMInstr_XAssisted(r, amR15T, ARMcc_AL, jk));
+         return;
+      }
+      default:
+         break;
+   }
+
+   vex_printf( "\n-- PUT(%d) = ", offsIP);
+   ppIRExpr( next );
+   vex_printf( "; exit-");
+   ppIRJumpKind(jk);
+   vex_printf( "\n");
+   vassert(0); // are we expecting any other kind?
 }
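
Taken together with the Ist_Exit case above, the selection logic reduces to a small decision tree, roughly (a sketch; the enum and helper are illustrative only):

   #include <stdio.h>

   typedef enum { XDirect, XIndir, XAssisted } Xfer;

   /* is_const: destination is an Ico_U32 constant known at JIT time;
      boring: Ijk_Boring/Ijk_Call/Ijk_Ret.  Everything else (syscalls,
      client requests, no-redir, ...) must be assisted. */
   static Xfer pick_xfer(int is_const, int boring, int chaining_allowed)
   {
      if (is_const && boring && chaining_allowed) return XDirect;
      if (boring && chaining_allowed)             return XIndir;
      return XAssisted;
   }

   int main(void)
   {
      printf("%d %d %d\n",
             pick_xfer(1, 1, 1),    /* 0: XDirect   */
             pick_xfer(0, 1, 1),    /* 1: XIndir    */
             pick_xfer(1, 1, 0));   /* 2: XAssisted */
      return 0;
   }
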
 
 
@@ -6009,21 +6125,27 @@
 
 /* Translate an entire SB to arm code. */
 
-HInstrArray* iselSB_ARM ( IRSB* bb, VexArch      arch_host,
-                                    VexArchInfo* archinfo_host,
-                                    VexAbiInfo*  vbi/*UNUSED*/ )
+HInstrArray* iselSB_ARM ( IRSB* bb,
+                          VexArch      arch_host,
+                          VexArchInfo* archinfo_host,
+                          VexAbiInfo*  vbi/*UNUSED*/,
+                          Int offs_Host_EvC_Counter,
+                          Int offs_Host_EvC_FailAddr,
+                          Bool chainingAllowed,
+                          Bool addProfInc,
+                          Addr64 max_ga )
 {
-   Int      i, j;
-   HReg     hreg, hregHI;
-   ISelEnv* env;
-   UInt     hwcaps_host = archinfo_host->hwcaps;
-   static UInt counter = 0;
+   Int       i, j;
+   HReg      hreg, hregHI;
+   ISelEnv*  env;
+   UInt      hwcaps_host = archinfo_host->hwcaps;
+   ARMAMode1 *amCounter, *amFailAddr;
 
    /* sanity ... */
    vassert(arch_host == VexArchARM);
 
    /* hwcaps should not change from one ISEL call to another. */
-   arm_hwcaps = hwcaps_host;
+   arm_hwcaps = hwcaps_host; // JRS 2012 Mar 31: FIXME (RM)
 
    /* Make up an initial environment to use. */
    env = LibVEX_Alloc(sizeof(ISelEnv));
@@ -6041,6 +6163,11 @@
    env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
 
+   /* and finally ... */
+   env->chainingAllowed = chainingAllowed;
+   env->hwcaps          = hwcaps_host;
+   env->max_ga          = max_ga;
+
    /* For each IR temporary, allocate a suitably-kinded virtual
       register. */
    j = 0;
@@ -6052,7 +6179,7 @@
          case Ity_I16:
          case Ity_I32:  hreg   = mkHReg(j++, HRcInt32, True); break;
          case Ity_I64:
-            if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+            if (hwcaps_host & VEX_HWCAPS_ARM_NEON) {
                hreg = mkHReg(j++, HRcFlt64, True);
             } else {
                hregHI = mkHReg(j++, HRcInt32, True);
@@ -6070,21 +6197,27 @@
    }
    env->vreg_ctr = j;
 
-   /* Keep a copy of the link reg, since any call to a helper function
-      will trash it, and we can't get back to the dispatcher once that
-      happens. */
-   env->savedLR = newVRegI(env);
-   addInstr(env, mk_iMOVds_RR(env->savedLR, hregARM_R14()));
+   /* The very first instruction must be an event check. */
+   amCounter  = ARMAMode1_RI(hregARM_R8(), offs_Host_EvC_Counter);
+   amFailAddr = ARMAMode1_RI(hregARM_R8(), offs_Host_EvC_FailAddr);
+   addInstr(env, ARMInstr_EvCheck(amCounter, amFailAddr));
+
+   /* Possibly a block counter increment (for profiling).  At this
+      point we don't know the address of the counter, so just pretend
+      it is zero.  It will have to be patched later, but before this
+      translation is used, by a call to LibVEX_patchProfCtr. */
+   if (addProfInc) {
+      addInstr(env, ARMInstr_ProfInc());
+   }
 
    /* Ok, finally we can iterate over the statements. */
    for (i = 0; i < bb->stmts_used; i++)
-      iselStmt(env,bb->stmts[i]);
+      iselStmt(env, bb->stmts[i]);
 
-   iselNext(env,bb->next,bb->jumpkind);
+   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
 
    /* record the number of vregs we used. */
    env->code->n_vregs = env->vreg_ctr;
-   counter++;
    return env->code;
 }
 
diff --git a/priv/host_ppc_defs.c b/priv/host_ppc_defs.c
index 9974b7b..f8ff79c 100644
--- a/priv/host_ppc_defs.c
+++ b/priv/host_ppc_defs.c
@@ -845,13 +845,33 @@
    vassert(0 == (argiregs & ~mask));
    return i;
 }
-PPCInstr* PPCInstr_Goto ( IRJumpKind jk, 
-                          PPCCondCode cond, PPCRI* dst ) {
-   PPCInstr* i      = LibVEX_Alloc(sizeof(PPCInstr));
-   i->tag           = Pin_Goto;
-   i->Pin.Goto.cond = cond;
-   i->Pin.Goto.dst  = dst;
-   i->Pin.Goto.jk   = jk;
+PPCInstr* PPCInstr_XDirect ( Addr64 dstGA, PPCAMode* amCIA,
+                             PPCCondCode cond, Bool toFastEP ) {
+   PPCInstr* i             = LibVEX_Alloc(sizeof(PPCInstr));
+   i->tag                  = Pin_XDirect;
+   i->Pin.XDirect.dstGA    = dstGA;
+   i->Pin.XDirect.amCIA    = amCIA;
+   i->Pin.XDirect.cond     = cond;
+   i->Pin.XDirect.toFastEP = toFastEP;
+   return i;
+}
+PPCInstr* PPCInstr_XIndir ( HReg dstGA, PPCAMode* amCIA,
+                            PPCCondCode cond ) {
+   PPCInstr* i         = LibVEX_Alloc(sizeof(PPCInstr));
+   i->tag              = Pin_XIndir;
+   i->Pin.XIndir.dstGA = dstGA;
+   i->Pin.XIndir.amCIA = amCIA;
+   i->Pin.XIndir.cond  = cond;
+   return i;
+}
+PPCInstr* PPCInstr_XAssisted ( HReg dstGA, PPCAMode* amCIA,
+                               PPCCondCode cond, IRJumpKind jk ) {
+   PPCInstr* i            = LibVEX_Alloc(sizeof(PPCInstr));
+   i->tag                 = Pin_XAssisted;
+   i->Pin.XAssisted.dstGA = dstGA;
+   i->Pin.XAssisted.amCIA = amCIA;
+   i->Pin.XAssisted.cond  = cond;
+   i->Pin.XAssisted.jk    = jk;
    return i;
 }
 PPCInstr* PPCInstr_CMov  ( PPCCondCode cond, 
@@ -1057,7 +1077,6 @@
    i->Pin.DfpD128toD64.dst    = dst;
    return i;
 }
-
 PPCInstr* PPCInstr_DfpI64StoD128 ( PPCFpOp op, HReg dst_hi,
                                    HReg dst_lo, HReg src ) {
    PPCInstr* i                 = LibVEX_Alloc(sizeof(PPCInstr));
@@ -1068,6 +1087,20 @@
    i->Pin.DfpI64StoD128.dst_lo = dst_lo;
    return i;
 }
+PPCInstr* PPCInstr_EvCheck ( PPCAMode* amCounter,
+                             PPCAMode* amFailAddr ) {
+   PPCInstr* i               = LibVEX_Alloc(sizeof(PPCInstr));
+   i->tag                    = Pin_EvCheck;
+   i->Pin.EvCheck.amCounter  = amCounter;
+   i->Pin.EvCheck.amFailAddr = amFailAddr;
+   return i;
+}
+PPCInstr* PPCInstr_ProfInc ( void ) {
+   PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+   i->tag      = Pin_ProfInc;
+   return i;
+}
+
 
 /*
 Valid combo | fromI | int32 | syned | flt64 |
@@ -1432,26 +1465,53 @@
       vex_printf("] }");
       break;
    }
-   case Pin_Goto:
-      vex_printf("goto: ");
-      if (i->Pin.Goto.cond.test != Pct_ALWAYS) {
-         vex_printf("if (%s) ", showPPCCondCode(i->Pin.Goto.cond));
-      }
-      vex_printf("{ ");
-      if (i->Pin.Goto.jk != Ijk_Boring
-          && i->Pin.Goto.jk != Ijk_Call
-          && i->Pin.Goto.jk != Ijk_Ret) {
-         vex_printf("li %%r31,$");
-         ppIRJumpKind(i->Pin.Goto.jk);
-         vex_printf(" ; ");
-      }
-      if (i->Pin.Goto.dst->tag == Pri_Imm) {
-         ppLoadImm(hregPPC_GPR3(mode64), i->Pin.Goto.dst->Pri.Imm,
-                   mode64);
+   case Pin_XDirect:
+      vex_printf("(xDirect) ");
+      vex_printf("if (%s) { ",
+                 showPPCCondCode(i->Pin.XDirect.cond));
+      if (mode64) {
+         vex_printf("imm64 r30,0x%llx; ", i->Pin.XDirect.dstGA);
+         vex_printf("std r30,");
       } else {
-         ppMovReg(hregPPC_GPR3(mode64), i->Pin.Goto.dst->Pri.Reg);
+         vex_printf("imm32 r30,0x%llx; ", i->Pin.XDirect.dstGA);
+         vex_printf("stw r30,");
       }
-      vex_printf(" ; blr }");
+      ppPPCAMode(i->Pin.XDirect.amCIA);
+      vex_printf("; ");
+      if (mode64) {
+         vex_printf("imm64-fixed5 r30,$disp_cp_chain_me_to_%sEP; ",
+                    i->Pin.XDirect.toFastEP ? "fast" : "slow");
+      } else {
+         vex_printf("imm32-fixed2 r30,$disp_cp_chain_me_to_%sEP; ",
+                    i->Pin.XDirect.toFastEP ? "fast" : "slow");
+      }
+      vex_printf("mtctr r30; bctrl }");
+      return;
+   case Pin_XIndir:
+      vex_printf("(xIndir) ");
+      vex_printf("if (%s) { ",
+                 showPPCCondCode(i->Pin.XIndir.cond));
+      vex_printf("%s ", mode64 ? "std" : "stw");
+      ppHRegPPC(i->Pin.XIndir.dstGA);
+      vex_printf(",");
+      ppPPCAMode(i->Pin.XIndir.amCIA);
+      vex_printf("; ");
+      vex_printf("imm%s r30,$disp_cp_xindir; ", mode64 ? "64" : "32");
+      vex_printf("mtctr r30; bctr }");
+      return;
+   case Pin_XAssisted:
+      vex_printf("(xAssisted) ");
+      vex_printf("if (%s) { ",
+                 showPPCCondCode(i->Pin.XAssisted.cond));
+      vex_printf("%s ", mode64 ? "std" : "stw");
+      ppHRegPPC(i->Pin.XAssisted.dstGA);
+      vex_printf(",");
+      ppPPCAMode(i->Pin.XAssisted.amCIA);
+      vex_printf("; ");
+      vex_printf("li r31,$IRJumpKind_to_TRCVAL(%d); ",                            
+                 (Int)i->Pin.XAssisted.jk);
+      vex_printf("imm%s r30,$disp_cp_xindir; ", mode64 ? "64" : "32");
+      vex_printf("mtctr r30; bctr }");
       return;
    case Pin_CMov:
       vex_printf("cmov (%s) ", showPPCCondCode(i->Pin.CMov.cond));
@@ -1875,6 +1935,30 @@
       vex_printf(",");
       return;
 
+   case Pin_EvCheck:
+      /* Note that the counter dec is 32 bit even in 64-bit mode. */
+      vex_printf("(evCheck) ");
+      vex_printf("lwz r30,");
+      ppPPCAMode(i->Pin.EvCheck.amCounter);
+      vex_printf("; addic. r30,r30,-1; ");
+      vex_printf("stw r30,");
+      ppPPCAMode(i->Pin.EvCheck.amCounter);
+      vex_printf("; bge nofail; lwz r30,");
+      ppPPCAMode(i->Pin.EvCheck.amFailAddr);
+      vex_printf("; mtctr r30; bctr; nofail:");
+      return;
+
+   case Pin_ProfInc:
+      if (mode64) {
+         vex_printf("(profInc) imm64-fixed5 r30,$NotKnownYet; ");
+         vex_printf("ld r29,(r30); addi r29,r29,1; std r29,(r30)");
+      } else {
+         vex_printf("(profInc) imm32-fixed2 r30,$NotKnownYet; ");
+         vex_printf("lwz r29,4(r30); addic. r29,r29,1; stw r29,4(r30)");
+         vex_printf("lwz r29,0(r30); addze r29,r29; stw r29,0(r30)");
+      }
+      break;
+
    default:
       vex_printf("\nppPPCInstr: No such tag(%d)\n", (Int)i->tag);
       vpanic("ppPPCInstr");
@@ -1973,17 +2057,21 @@
          and no other, as a destination temporary. */
       return;
    }
-   case Pin_Goto:
-      addRegUsage_PPCRI(u, i->Pin.Goto.dst);
-      /* GPR3 holds destination address from Pin_Goto */
-      addHRegUse(u, HRmWrite, hregPPC_GPR3(mode64));
-      if (i->Pin.Goto.jk != Ijk_Boring
-          && i->Pin.Goto.jk != Ijk_Call
-          && i->Pin.Goto.jk != Ijk_Ret)
-            /* note, this is irrelevant since the guest state pointer
-               register is not actually available to the allocator.
-               But still .. */
-         addHRegUse(u, HRmWrite, GuestStatePtr(mode64));
+   /* XDirect/XIndir/XAssisted are also a bit subtle.  They
+      conditionally exit the block.  Hence we only need to list (1)
+      the registers that they read, and (2) the registers that they
+      write in the case where the block is not exited.  (2) is empty,
+      hence only (1) is relevant here. */
+   case Pin_XDirect:
+      addRegUsage_PPCAMode(u, i->Pin.XDirect.amCIA);
+      return;
+   case Pin_XIndir:
+      addHRegUse(u, HRmRead, i->Pin.XIndir.dstGA);
+      addRegUsage_PPCAMode(u, i->Pin.XIndir.amCIA);
+      return;
+   case Pin_XAssisted:
+      addHRegUse(u, HRmRead, i->Pin.XAssisted.dstGA);
+      addRegUsage_PPCAMode(u, i->Pin.XAssisted.amCIA);
       return;
    case Pin_CMov:
       addRegUsage_PPCRI(u,  i->Pin.CMov.src);
@@ -2185,7 +2273,18 @@
       addHRegUse(u, HRmWrite, i->Pin.DfpI64StoD128.dst_hi);
       addHRegUse(u, HRmWrite, i->Pin.DfpI64StoD128.dst_lo);
       return;
-
+   case Pin_EvCheck:
+      /* We expect both amodes only to mention the GSP (r31), so this
+         is in fact pointless, since GSP isn't allocatable, but
+         anyway.. */
+      addRegUsage_PPCAMode(u, i->Pin.EvCheck.amCounter);
+      addRegUsage_PPCAMode(u, i->Pin.EvCheck.amFailAddr);
+      addHRegUse(u, HRmWrite, hregPPC_GPR30(mode64)); /* also unavail to RA */
+      return;
+   case Pin_ProfInc:
+      addHRegUse(u, HRmWrite, hregPPC_GPR29(mode64));
+      addHRegUse(u, HRmWrite, hregPPC_GPR30(mode64));
+      return;
    default:
       ppPPCInstr(i, mode64);
       vpanic("getRegUsage_PPCInstr");
@@ -2239,8 +2338,16 @@
       return;
    case Pin_Call:
       return;
-   case Pin_Goto:
-      mapRegs_PPCRI(m, i->Pin.Goto.dst);
+   case Pin_XDirect:
+      mapRegs_PPCAMode(m, i->Pin.XDirect.amCIA);
+      return;
+   case Pin_XIndir:
+      mapReg(m, &i->Pin.XIndir.dstGA);
+      mapRegs_PPCAMode(m, i->Pin.XIndir.amCIA);
+      return;
+   case Pin_XAssisted:
+      mapReg(m, &i->Pin.XAssisted.dstGA);
+      mapRegs_PPCAMode(m, i->Pin.XAssisted.amCIA);
       return;
    case Pin_CMov:
       mapRegs_PPCRI(m, i->Pin.CMov.src);
@@ -2424,7 +2531,16 @@
       mapReg(m, &i->Pin.DfpI64StoD128.dst_hi);
       mapReg(m, &i->Pin.DfpI64StoD128.dst_lo);
       return;
-
+   case Pin_EvCheck:
+      /* We expect both amodes only to mention the GSP (r31), so this
+         is in fact pointless, since GSP isn't allocatable, but
+         anyway.. */
+      mapRegs_PPCAMode(m, i->Pin.EvCheck.amCounter);
+      mapRegs_PPCAMode(m, i->Pin.EvCheck.amFailAddr);
+      return;
+   case Pin_ProfInc:
+      /* hardwires r29 and r30 -- nothing to modify. */
+      return;
    default:
       ppPPCInstr(i, mode64);
       vpanic("mapRegs_PPCInstr");
@@ -2558,7 +2674,7 @@
    return n;
 }
 
-/* Emit 32bit instruction big-endianly */
+/* Emit an instruction big-endianly */
 static UChar* emit32 ( UChar* p, UInt w32 )
 {
    *p++ = toUChar((w32 >> 24) & 0x000000FF);
@@ -2568,6 +2684,17 @@
    return p;
 }
 
+/* Fetch an instruction big-endianly */
+static UInt fetch32 ( UChar* p )
+{
+   UInt w32 = 0;
+   w32 |= ((0xFF & (UInt)p[0]) << 24);
+   w32 |= ((0xFF & (UInt)p[1]) << 16);
+   w32 |= ((0xFF & (UInt)p[2]) <<  8);
+   w32 |= ((0xFF & (UInt)p[3]) <<  0);
+   return w32;
+}
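
fetch32 is the exact inverse of emit32, which is what lets the chain/unchain sanity checks further down compare instruction words regardless of host endianness or load alignment. A round-trip sketch using local copies of the two helpers:

   #include <assert.h>

   typedef unsigned char UChar;
   typedef unsigned int  UInt;

   static UChar* emit32 ( UChar* p, UInt w32 )
   {
      *p++ = (UChar)((w32 >> 24) & 0xFF);
      *p++ = (UChar)((w32 >> 16) & 0xFF);
      *p++ = (UChar)((w32 >>  8) & 0xFF);
      *p++ = (UChar)((w32 >>  0) & 0xFF);
      return p;
   }

   static UInt fetch32 ( UChar* p )
   {
      return ((UInt)p[0] << 24) | ((UInt)p[1] << 16)
             | ((UInt)p[2] << 8) | (UInt)p[3];
   }

   int main(void)
   {
      UChar buf[4];
      UInt mtctr_r30 = 0x7FC903A6u;      /* as used by the chain checks */
      emit32(buf, mtctr_r30);
      assert(fetch32(buf) == mtctr_r30);
      assert(buf[0] == 0x7F && buf[3] == 0xA6);  /* big-endian layout */
      return 0;
   }
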
+
 /* The following mkForm[...] functions refer to ppc instruction forms
    as per PPC32 p576
  */
@@ -2866,6 +2993,210 @@
    return p;
 }
 
+/* A simplified version of mkLoadImm that always generates 2 or 5
+   instructions (in 32- and 64-bit mode respectively) even if it could
+   generate fewer.  This is needed for generating fixed-size patchable
+   sequences. */
+static UChar* mkLoadImm_EXACTLY2or5 ( UChar* p,
+                                      UInt r_dst, ULong imm, Bool mode64 )
+{
+   vassert(r_dst < 0x20);
+
+   if (!mode64) {
+      /* In 32-bit mode, make sure the top 32 bits of imm are a sign
+         extension of the bottom 32 bits.  (Probably unnecessary.) */
+      UInt u32 = (UInt)imm;
+      Int  s32 = (Int)u32;
+      Long s64 = (Long)s32;
+      imm = (ULong)s64;
+   }
+
+   if (!mode64) {
+      // addis r_dst,r0,(imm>>16) => lis r_dst, (imm>>16)
+      p = mkFormD(p, 15, r_dst, 0, (imm>>16) & 0xFFFF);
+      // ori r_dst, r_dst, (imm & 0xFFFF)
+      p = mkFormD(p, 24, r_dst, r_dst, imm & 0xFFFF);
+
+   } else {
+      // full 64bit immediate load: 5 (five!) insns.
+
+      // load high word
+      // lis r_dst, (imm>>48) & 0xFFFF
+      p = mkFormD(p, 15, r_dst, 0, (imm>>48) & 0xFFFF);
+
+      // ori r_dst, r_dst, (imm>>32) & 0xFFFF
+      p = mkFormD(p, 24, r_dst, r_dst, (imm>>32) & 0xFFFF);
+         
+      // shift r_dst low word to high word => rldicr
+      p = mkFormMD(p, 30, r_dst, r_dst, 32, 31, 1);
+
+      // load low word
+      // oris r_dst, r_dst, (imm>>16) & 0xFFFF
+      p = mkFormD(p, 25, r_dst, r_dst, (imm>>16) & 0xFFFF);
+
+      // ori r_dst, r_dst, (imm) & 0xFFFF
+      p = mkFormD(p, 24, r_dst, r_dst, imm & 0xFFFF);
+   }
+   return p;
+}
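
The 64-bit case assembles the immediate 16 bits at a time: build the top 32 bits in the low half of the register, shift them into place with rldicr (which also clears any sign-extension bits lis introduced), then OR in the bottom 32 bits. The same arithmetic modelled in plain C:

   #include <stdint.h>
   #include <assert.h>

   /* lis; ori; rldicr sh=32,me=31; oris; ori */
   static uint64_t load_imm_5insn(uint64_t imm)
   {
      uint64_t r = (uint64_t)(int64_t)(int32_t)(uint32_t)
                   (((imm >> 48) & 0xFFFFu) << 16);  /* lis (sign-extends) */
      r |= (imm >> 32) & 0xFFFFu;                    /* ori                */
      r  = (r & 0xFFFFFFFFu) << 32;                  /* rldicr: lo -> hi   */
      r |= ((imm >> 16) & 0xFFFFu) << 16;            /* oris               */
      r |= imm & 0xFFFFu;                            /* ori                */
      return r;
   }

   int main(void)
   {
      assert(load_imm_5insn(0x123456789ABCDEF0ull) == 0x123456789ABCDEF0ull);
      assert(load_imm_5insn(0xFFFFFFFFFFFFFFFFull) == 0xFFFFFFFFFFFFFFFFull);
      return 0;
   }
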
+
+/* Checks whether the sequence of bytes at p was indeed created
+   by mkLoadImm_EXACTLY2or5 with the given parameters. */
+static Bool isLoadImm_EXACTLY2or5 ( UChar* p_to_check,
+                                    UInt r_dst, ULong imm, Bool mode64 )
+{
+   vassert(r_dst < 0x20);
+
+   if (!mode64) {
+      /* In 32-bit mode, make sure the top 32 bits of imm are a sign
+         extension of the bottom 32 bits.  (Probably unnecessary.) */
+      UInt u32 = (UInt)imm;
+      Int  s32 = (Int)u32;
+      Long s64 = (Long)s32;
+      imm = (ULong)s64;
+   }
+
+   if (!mode64) {
+      UInt   expect[2] = { 0, 0 };
+      UChar* p         = (UChar*)&expect[0];
+      // addis r_dst,r0,(imm>>16) => lis r_dst, (imm>>16)
+      p = mkFormD(p, 15, r_dst, 0, (imm>>16) & 0xFFFF);
+      // ori r_dst, r_dst, (imm & 0xFFFF)
+      p = mkFormD(p, 24, r_dst, r_dst, imm & 0xFFFF);
+      vassert(p == (UChar*)&expect[2]);
+
+      return fetch32(p_to_check + 0) == expect[0]
+             && fetch32(p_to_check + 4) == expect[1];
+
+   } else {
+      UInt   expect[5] = { 0, 0, 0, 0, 0 };
+      UChar* p         = (UChar*)&expect[0];
+      // full 64bit immediate load: 5 (five!) insns.
+
+      // load high word
+      // lis r_dst, (imm>>48) & 0xFFFF
+      p = mkFormD(p, 15, r_dst, 0, (imm>>48) & 0xFFFF);
+
+      // ori r_dst, r_dst, (imm>>32) & 0xFFFF
+      p = mkFormD(p, 24, r_dst, r_dst, (imm>>32) & 0xFFFF);
+         
+      // shift r_dst low word to high word => rldicr
+      p = mkFormMD(p, 30, r_dst, r_dst, 32, 31, 1);
+
+      // load low word
+      // oris r_dst, r_dst, (imm>>16) & 0xFFFF
+      p = mkFormD(p, 25, r_dst, r_dst, (imm>>16) & 0xFFFF);
+
+      // ori r_dst, r_dst, (imm) & 0xFFFF
+      p = mkFormD(p, 24, r_dst, r_dst, imm & 0xFFFF);
+
+      vassert(p == (UChar*)&expect[5]);
+
+      return fetch32(p_to_check + 0) == expect[0]
+             && fetch32(p_to_check + 4) == expect[1]
+             && fetch32(p_to_check + 8) == expect[2]
+             && fetch32(p_to_check + 12) == expect[3]
+             && fetch32(p_to_check + 16) == expect[4];
+   }
+}
+
+
+/* Generate a machine-word sized load or store.  Simplified version of
+   the Pin_Load and Pin_Store cases below. */
+static UChar* do_load_or_store_machine_word ( 
+                 UChar* p, Bool isLoad,
+                 UInt reg, PPCAMode* am, Bool mode64 )
+{
+   if (isLoad) {
+      UInt opc1, sz = mode64 ? 8 : 4;
+      switch (am->tag) {
+         case Pam_IR:
+            if (mode64) {
+               vassert(0 == (am->Pam.IR.index & 3));
+            }
+            switch (sz) {
+               case 4:  opc1 = 32; vassert(!mode64); break;
+               case 8:  opc1 = 58; vassert(mode64);  break;
+               default: vassert(0);
+            }
+            p = doAMode_IR(p, opc1, reg, am, mode64);
+            break;
+         case Pam_RR:
+            /* we could handle this case, but we don't expect to ever
+               need to. */
+            vassert(0);
+         default:
+            vassert(0);
+      }
+   } else /*store*/ {
+      UInt opc1, sz = mode64 ? 8 : 4;
+      switch (am->tag) {
+         case Pam_IR:
+            if (mode64) {
+               vassert(0 == (am->Pam.IR.index & 3));
+            }
+            switch (sz) {
+               case 4:  opc1 = 36; vassert(!mode64); break;
+               case 8:  opc1 = 62; vassert(mode64);  break;
+               default: vassert(0);
+            }
+            p = doAMode_IR(p, opc1, reg, am, mode64);
+            break;
+         case Pam_RR:
+            /* we could handle this case, but we don't expect to ever
+               need to. */
+            vassert(0);
+         default:
+            vassert(0);
+      }
+   }
+   return p;
+}
+
+/* Generate a 32-bit sized load or store.  Simplified version of
+   do_load_or_store_machine_word above. */
+static UChar* do_load_or_store_word32 ( 
+                 UChar* p, Bool isLoad,
+                 UInt reg, PPCAMode* am, Bool mode64 )
+{
+   if (isLoad) {
+      UInt opc1;
+      switch (am->tag) {
+         case Pam_IR:
+            if (mode64) {
+               vassert(0 == (am->Pam.IR.index & 3));
+            }
+            opc1 = 32;
+            p = doAMode_IR(p, opc1, reg, am, mode64);
+            break;
+         case Pam_RR:
+            /* we could handle this case, but we don't expect to ever
+               need to. */
+            vassert(0);
+         default:
+            vassert(0);
+      }
+   } else /*store*/ {
+      UInt opc1;
+      switch (am->tag) {
+         case Pam_IR:
+            if (mode64) {
+               vassert(0 == (am->Pam.IR.index & 3));
+            }
+            opc1 = 36;
+            p = doAMode_IR(p, opc1, reg, am, mode64);
+            break;
+         case Pam_RR:
+            /* we could handle this case, but we don't expect to ever
+               need to. */
+            vassert(0);
+         default:
+            vassert(0);
+      }
+   }
+   return p;
+}
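
Both helpers emit a single D/DS-form instruction; the primary opcodes involved are 32 (lwz), 36 (stw), 58 (ld) and 62 (std). A sketch of the field layout (the DS-form ld/std reuse the low two displacement bits as a sub-opcode, hence the displacement-alignment vasserts above):

   #include <stdio.h>
   #include <stdint.h>

   /* D-form: opc1(6) | rT/rS(5) | rA(5) | d(16). */
   static uint32_t mk_dform(uint32_t opc1, uint32_t rt, uint32_t ra,
                            uint32_t d16)
   {
      return (opc1 << 26) | (rt << 21) | (ra << 16) | (d16 & 0xFFFFu);
   }

   int main(void)
   {
      printf("lwz r30,8(r31): %08X\n", mk_dform(32, 30, 31, 8));
      printf("stw r30,8(r31): %08X\n", mk_dform(36, 30, 31, 8));
      printf("ld  r30,8(r31): %08X\n", mk_dform(58, 30, 31, 8));
      printf("std r30,8(r31): %08X\n", mk_dform(62, 30, 31, 8));
      return 0;
   }
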
+
 /* Move r_dst to r_src */
 static UChar* mkMoveReg ( UChar* p, UInt r_dst, UInt r_src )
 {
@@ -2926,18 +3257,19 @@
 
 /* Emit an instruction into buf and return the number of bytes used.
    Note that buf is not the insn's final place, and therefore it is
-   imperative to emit position-independent code. 
-
-   Note, dispatch should always be NULL since ppc32/64 backends
-   use a call-return scheme to get from the dispatcher to generated
-   code and back.
+   imperative to emit position-independent code.  If the emitted
+   instruction was a profiler inc, set *is_profInc to True, else leave
+   it unchanged.
 */
-Int emit_PPCInstr ( UChar* buf, Int nbuf, PPCInstr* i, 
+Int emit_PPCInstr ( /*MB_MOD*/Bool* is_profInc,
+                    UChar* buf, Int nbuf, PPCInstr* i, 
                     Bool mode64,
-                    void* dispatch_unassisted, void* dispatch_assisted )
+                    void* disp_cp_chain_me_to_slowEP,
+                    void* disp_cp_chain_me_to_fastEP,
+                    void* disp_cp_xindir,
+                    void* disp_cp_xassisted )
 {
    UChar* p = &buf[0];
-   UChar* ptmp = p;
    vassert(nbuf >= 32);
 
    if (0) {
@@ -3304,6 +3636,7 @@
          getRegUsage_PPCInstr above, %r10 is used as an address temp */
 
       /* jump over the following insns if condition does not hold */
+      UChar* ptmp = NULL;
       if (cond.test != Pct_ALWAYS) {
          /* jmp fwds if !condition */
          /* don't know how many bytes to jump over yet...
@@ -3332,75 +3665,176 @@
       goto done;
    }
 
-   case Pin_Goto: {
-      UInt        trc   = 0;
-      UChar       r_ret = 3;        /* Put target addr into %r3 */
-      PPCCondCode cond  = i->Pin.Goto.cond;
-      UInt r_dst;
-      ULong imm_dst;
+   case Pin_XDirect: {
+      /* NB: what goes on here has to be very closely coordinated
+         with the chainXDirect_PPC and unchainXDirect_PPC below. */
+      /* We're generating chain-me requests here, so we need to be
+         sure this is actually allowed -- no-redir translations can't
+         use chain-me's.  Hence: */
+      vassert(disp_cp_chain_me_to_slowEP != NULL);
+      vassert(disp_cp_chain_me_to_fastEP != NULL);
 
-      vassert(dispatch_unassisted == NULL);
-      vassert(dispatch_assisted == NULL);
-      
-      /* First off, if this is conditional, create a conditional
-         jump over the rest of it. */
-      if (cond.test != Pct_ALWAYS) {
-         /* jmp fwds if !condition */
-         /* don't know how many bytes to jump over yet...
-            make space for a jump instruction and fill in later. */
-         ptmp = p; /* fill in this bit later */
+      /* First off, if this is conditional, create a conditional jump
+         over the rest of it.  Or at least, leave a space for it that
+         we will shortly fill in. */
+      UChar* ptmp = NULL;
+      if (i->Pin.XDirect.cond.test != Pct_ALWAYS) {
+         vassert(i->Pin.XDirect.cond.flag != Pcf_NONE);
+         ptmp = p;
          p += 4;
-      }
-
-      // cond succeeds...
-      
-      /* If a non-boring, set GuestStatePtr appropriately. */
-      switch (i->Pin.Goto.jk) {
-         case Ijk_ClientReq:   trc = VEX_TRC_JMP_CLIENTREQ;   break;
-         case Ijk_Sys_syscall: trc = VEX_TRC_JMP_SYS_SYSCALL; break;
-         case Ijk_Yield:       trc = VEX_TRC_JMP_YIELD;       break;
-         case Ijk_EmWarn:      trc = VEX_TRC_JMP_EMWARN;      break;
-         case Ijk_EmFail:      trc = VEX_TRC_JMP_EMFAIL;      break;
-         case Ijk_MapFail:     trc = VEX_TRC_JMP_MAPFAIL;     break;
-         case Ijk_NoDecode:    trc = VEX_TRC_JMP_NODECODE;    break;
-         case Ijk_TInval:      trc = VEX_TRC_JMP_TINVAL;      break;
-         case Ijk_NoRedir:     trc = VEX_TRC_JMP_NOREDIR;     break;
-         case Ijk_SigTRAP:     trc = VEX_TRC_JMP_SIGTRAP;     break;
-         case Ijk_SigBUS:      trc = VEX_TRC_JMP_SIGBUS;      break;
-         case Ijk_Ret:
-         case Ijk_Call:
-         case Ijk_Boring:
-            break;
-         default: 
-            ppIRJumpKind(i->Pin.Goto.jk);
-            vpanic("emit_PPCInstr.Pin_Goto: unknown jump kind");
-      }
-      if (trc !=0) {
-         vassert(trc < 0x10000);
-         /* addi r31,0,trc */
-         p = mkFormD(p, 14, 31, 0, trc);               // p += 4
-      }
-
-      /* Get the destination address into %r_ret */
-      if (i->Pin.Goto.dst->tag == Pri_Imm) {
-         imm_dst = i->Pin.Goto.dst->Pri.Imm;
-         p = mkLoadImm(p, r_ret, imm_dst, mode64);     // p += 4|8|20
       } else {
-         vassert(i->Pin.Goto.dst->tag == Pri_Reg);
-         r_dst = iregNo(i->Pin.Goto.dst->Pri.Reg, mode64);
-         p = mkMoveReg(p, r_ret, r_dst);               // p += 4
+         vassert(i->Pin.XDirect.cond.flag == Pcf_NONE);
       }
-      
-      /* blr */
-      p = mkFormXL(p, 19, Pct_ALWAYS, 0, 0, 16, 0);    // p += 4
+
+      /* Update the guest CIA. */
+      /* imm32/64 r30, dstGA */
+      if (!mode64) vassert(0 == (((ULong)i->Pin.XDirect.dstGA) >> 32));
+      p = mkLoadImm(p, /*r*/30, (ULong)i->Pin.XDirect.dstGA, mode64);
+      /* stw/std r30, amCIA */
+      p = do_load_or_store_machine_word(
+             p, False/*!isLoad*/,
+             /*r*/30, i->Pin.XDirect.amCIA, mode64
+          );
+
+      /* --- FIRST PATCHABLE BYTE follows --- */
+      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
+         to) backs up the return address, so as to find the address of
+         the first patchable byte.  So: don't change the number of
+         instructions (32-bit: 4, 64-bit: 7) below. */
+      /* imm32/64-fixed r30, VG_(disp_cp_chain_me_to_{slowEP,fastEP}) */
+      void* disp_cp_chain_me
+               = i->Pin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP 
+                                         : disp_cp_chain_me_to_slowEP;
+      p = mkLoadImm_EXACTLY2or5(
+             p, /*r*/30, Ptr_to_ULong(disp_cp_chain_me), mode64);
+      /* mtctr r30 */
+      p = mkFormXFX(p, /*r*/30, 9, 467);
+      /* bctrl */
+      p = mkFormXL(p, 19, Pct_ALWAYS, 0, 0, 528, 1);
+      /* --- END of PATCHABLE BYTES --- */
 
       /* Fix up the conditional jump, if there was one. */
-      if (cond.test != Pct_ALWAYS) {
+      if (i->Pin.XDirect.cond.test != Pct_ALWAYS) {
          Int delta = p - ptmp;
-         vassert(delta >= 12 && delta <= 32);
+         vassert(delta >= 16 && delta <= 64 && 0 == (delta & 3));
          /* bc !ct,cf,delta */
-         mkFormB(ptmp, invertCondTest(cond.test),
-                 cond.flag, delta>>2, 0, 0);
+         mkFormB(ptmp, invertCondTest(i->Pin.XDirect.cond.test),
+                 i->Pin.XDirect.cond.flag, (delta>>2), 0, 0);
+      }
+      goto done;
+   }
+
+   case Pin_XIndir: {
+      /* We're generating transfers that could lead indirectly to a
+         chain-me, so we need to be sure this is actually allowed --
+         no-redir translations are not allowed to reach normal
+         translations without going through the scheduler.  That means
+         no XDirects or XIndirs out from no-redir translations.
+         Hence: */
+      vassert(disp_cp_xindir != NULL);
+
+      /* First off, if this is conditional, create a conditional jump
+         over the rest of it.  Or at least, leave a space for it that
+         we will shortly fill in. */
+      UChar* ptmp = NULL;
+      if (i->Pin.XIndir.cond.test != Pct_ALWAYS) {
+         vassert(i->Pin.XIndir.cond.flag != Pcf_NONE);
+         ptmp = p;
+         p += 4;
+      } else {
+         vassert(i->Pin.XIndir.cond.flag == Pcf_NONE);
+      }
+
+      /* Update the guest CIA. */
+      /* stw/std r-dstGA, amCIA */
+      p = do_load_or_store_machine_word(
+             p, False/*!isLoad*/,
+             iregNo(i->Pin.XIndir.dstGA, mode64),
+             i->Pin.XIndir.amCIA, mode64
+          );
+
+      /* imm32/64 r30, VG_(disp_cp_xindir) */
+      p = mkLoadImm(p, /*r*/30, (ULong)Ptr_to_ULong(disp_cp_xindir), mode64);
+      /* mtctr r30 */
+      p = mkFormXFX(p, /*r*/30, 9, 467);
+      /* bctr */
+      p = mkFormXL(p, 19, Pct_ALWAYS, 0, 0, 528, 0);
+
+      /* Fix up the conditional jump, if there was one. */
+      if (i->Pin.XIndir.cond.test != Pct_ALWAYS) {
+         Int delta = p - ptmp;
+         vassert(delta >= 16 && delta <= 32 && 0 == (delta & 3));
+         /* bc !ct,cf,delta */
+         mkFormB(ptmp, invertCondTest(i->Pin.XIndir.cond.test),
+                 i->Pin.XIndir.cond.flag, (delta>>2), 0, 0);
+      }
+      goto done;
+   }
+
+   case Pin_XAssisted: {
+      /* First off, if this is conditional, create a conditional jump
+         over the rest of it.  Or at least, leave a space for it that
+         we will shortly fill in. */
+      UChar* ptmp = NULL;
+      if (i->Pin.XAssisted.cond.test != Pct_ALWAYS) {
+         vassert(i->Pin.XAssisted.cond.flag != Pcf_NONE);
+         ptmp = p;
+         p += 4;
+      } else {
+         vassert(i->Pin.XAssisted.cond.flag == Pcf_NONE);
+      }
+
+      /* Update the guest CIA. */
+      /* stw/std r-dstGA, amCIA */
+      p = do_load_or_store_machine_word(
+             p, False/*!isLoad*/,
+             iregNo(i->Pin.XAssisted.dstGA, mode64),
+             i->Pin.XAssisted.amCIA, mode64
+          );
+
+      /* imm32/64 r31, $magic_number */
+      UInt trcval = 0;
+      switch (i->Pin.XAssisted.jk) {
+         case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
+         case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
+         //case Ijk_Sys_int128:  trcval = VEX_TRC_JMP_SYS_INT128;  break;
+         //case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
+         case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
+         case Ijk_EmFail:      trcval = VEX_TRC_JMP_EMFAIL;      break;
+         //case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
+         case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
+         case Ijk_TInval:      trcval = VEX_TRC_JMP_TINVAL;      break;
+         case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
+         case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
+         //case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
+         case Ijk_SigBUS:      trcval = VEX_TRC_JMP_SIGBUS;      break;
+         case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
+         /* We don't expect to see the following being assisted. */
+         //case Ijk_Ret:
+         //case Ijk_Call:
+         /* fallthrough */
+         default: 
+            ppIRJumpKind(i->Pin.XAssisted.jk);
+            vpanic("emit_ARMInstr.Pin_XAssisted: unexpected jump kind");
+      }
+      vassert(trcval != 0);
+      p = mkLoadImm(p, /*r*/31, trcval, mode64);
+
+      /* imm32/64 r30, VG_(disp_cp_xassisted) */
+      p = mkLoadImm(p, /*r*/30,
+                       (ULong)Ptr_to_ULong(disp_cp_xassisted), mode64);
+      /* mtctr r30 */
+      p = mkFormXFX(p, /*r*/30, 9, 467);
+      /* bctr */
+      p = mkFormXL(p, 19, Pct_ALWAYS, 0, 0, 528, 0);
+
+      /* Fix up the conditional jump, if there was one. */
+      if (i->Pin.XAssisted.cond.test != Pct_ALWAYS) {
+         Int delta = p - ptmp;
+         vassert(delta >= 16 && delta <= 32 && 0 == (delta & 3));
+         /* bc !ct,cf,delta */
+         mkFormB(ptmp, invertCondTest(i->Pin.XAssisted.cond.test),
+                 i->Pin.XAssisted.cond.flag, (delta>>2), 0, 0);
       }
       goto done;
    }
@@ -3415,6 +3849,7 @@
       cond = i->Pin.CMov.cond;
 
       /* branch (if cond fails) over move instrs */
+      UChar* ptmp = NULL;
       if (cond.test != Pct_ALWAYS) {
          /* don't know how many bytes to jump over yet...
             make space for a jump instruction and fill in later. */
@@ -4433,6 +4868,7 @@
       p = mkFormX(p, 63, fr_dst, 0, 10,  72, 0);
       goto done;
    }
+
    case Pin_DfpI64StoD128: {
       UInt fr_dstHi = fregNo( i->Pin.DfpI64StoD128.dst_hi );
       UInt fr_dstLo = fregNo( i->Pin.DfpI64StoD128.dst_lo );
@@ -4451,6 +4887,87 @@
       p = mkFormX(p, 63, fr_dstLo, 0, 11,  72, 0);
       goto done;
    }
+
+   case Pin_EvCheck: {
+      /* This requires a 32-bit dec/test in both 32- and 64-bit
+         modes. */
+      /* We generate:
+            lwz     r30, amCounter
+            addic.  r30, r30, -1
+            stw     r30, amCounter
+            bge     nofail
+            lwz/ld  r30, amFailAddr
+            mtctr   r30
+            bctr
+           nofail:
+      */
+      UChar* p0 = p;
+      /* lwz r30, amCounter */
+      p = do_load_or_store_word32(p, True/*isLoad*/, /*r*/30,
+                                  i->Pin.EvCheck.amCounter, mode64);
+      /* addic. r30,r30,-1 */
+      p = emit32(p, 0x37DEFFFF);
+      /* stw r30, amCounter */
+      p = do_load_or_store_word32(p, False/*!isLoad*/, /*r*/30,
+                                  i->Pin.EvCheck.amCounter, mode64);
+      /* bge nofail */
+      p = emit32(p, 0x40800010);
+      /* lwz/ld r30, amFailAddr */
+      p = do_load_or_store_machine_word(p, True/*isLoad*/, /*r*/30,
+                                        i->Pin.EvCheck.amFailAddr, mode64);
+      /* mtctr r30 */
+      p = mkFormXFX(p, /*r*/30, 9, 467);
+      /* bctr */
+      p = mkFormXL(p, 19, Pct_ALWAYS, 0, 0, 528, 0);
+      /* nofail: */
+
+      /* Crosscheck */
+      vassert(evCheckSzB_PPC() == (UChar*)p - (UChar*)p0);
+      goto done;
+   }
+
+   case Pin_ProfInc: {
+      /* We generate:
+               (ctrP is unknown now, so use 0x65556555 (or, in 64-bit
+               mode, 0x6555655565556555) in the expectation that a
+               later call to LibVEX_patchProfCtr will be used to fill
+               in the immediate fields once the right value is known.)
+            32-bit:
+              imm32-exactly r30, 0x65556555
+              lwz     r29, 4(r30)
+              addic.  r29, r29, 1
+              stw     r29, 4(r30)
+              lwz     r29, 0(r30)
+              addze   r29, r29
+              stw     r29, 0(r30)
+            64-bit:
+              imm64-exactly r30, 0x6555655565556555
+              ld      r29, 0(r30)
+              addi    r29, r29, 1
+              std     r29, 0(r30)
+      */
+      if (mode64) {
+         p = mkLoadImm_EXACTLY2or5(
+                p, /*r*/30, 0x6555655565556555ULL, True/*mode64*/);
+         p = emit32(p, 0xEBBE0000);
+         p = emit32(p, 0x3BBD0001);
+         p = emit32(p, 0xFBBE0000);
+      } else {
+         p = mkLoadImm_EXACTLY2or5(
+                p, /*r*/30, 0x65556555ULL, False/*!mode64*/);
+         p = emit32(p, 0x83BE0004);
+         p = emit32(p, 0x37BD0001);
+         p = emit32(p, 0x93BE0004);
+         p = emit32(p, 0x83BE0000);
+         p = emit32(p, 0x7FBD0194);
+         p = emit32(p, 0x93BE0000);
+      }
+      /* Tell the caller .. */
+      vassert(!(*is_profInc));
+      *is_profInc = True;
+      goto done;
+   }
+
    default: 
       goto bad;
    }
@@ -4462,10 +4979,151 @@
    /*NOTREACHED*/
    
   done:
-   vassert(p - &buf[0] <= 32);
+   vassert(p - &buf[0] <= 64);
    return p - &buf[0];
 }
 
+
+/* How big is an event check?  See case for Pin_EvCheck in
+   emit_PPCInstr just above.  That crosschecks what this returns, so
+   we can tell if we're inconsistent. */
+Int evCheckSzB_PPC ( void )
+{
+  return 28;
+}
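(The 28 is just the seven fixed-length instructions of the Pin_EvCheck
sequence -- lwz, addic., stw, bge, lwz/ld, mtctr, bctr -- at 4 bytes
each: 7 * 4 = 28.)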
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   emitInstr case for XDirect, above. */
+VexInvalRange chainXDirect_PPC ( void* place_to_chain,
+                                 void* disp_cp_chain_me_EXPECTED,
+                                 void* place_to_jump_to,
+                                 Bool  mode64 )
+{
+   /* What we're expecting to see is:
+        imm32/64-fixed r30, disp_cp_chain_me_to_EXPECTED
+        mtctr r30
+        bctrl
+      viz
+        <8 or 20 bytes generated by mkLoadImm_EXACTLY2or5>
+        7F C9 03 A6
+        4E 80 04 21
+   */
+   UChar* p = (UChar*)place_to_chain;
+   vassert(0 == (3 & (HWord)p));
+   vassert(isLoadImm_EXACTLY2or5(p, /*r*/30,
+                                 Ptr_to_ULong(disp_cp_chain_me_EXPECTED),
+                                 mode64));
+   vassert(fetch32(p + (mode64 ? 20 : 8) + 0) == 0x7FC903A6);
+   vassert(fetch32(p + (mode64 ? 20 : 8) + 4) == 0x4E800421);
+   /* And what we want to change it to is:
+        imm32/64-fixed r30, place_to_jump_to
+        mtctr r30
+        bctr
+      viz
+        <8 or 20 bytes generated by mkLoadImm_EXACTLY2or5>
+        7F C9 03 A6
+        4E 80 04 20
+      The replacement has the same length as the original.
+   */
+   p = mkLoadImm_EXACTLY2or5(p, /*r*/30,
+                             Ptr_to_ULong(place_to_jump_to), mode64);
+   p = emit32(p, 0x7FC903A6);
+   p = emit32(p, 0x4E800420);
+
+   Int len = p - (UChar*)place_to_chain;
+   vassert(len == (mode64 ? 28 : 16)); /* stay sane */
+   VexInvalRange vir = {(HWord)place_to_chain, len};
+   return vir;
+}
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   emitInstr case for XDirect, above. */
+VexInvalRange unchainXDirect_PPC ( void* place_to_unchain,
+                                   void* place_to_jump_to_EXPECTED,
+                                   void* disp_cp_chain_me,
+                                   Bool  mode64 )
+{
+   /* What we're expecting to see is:
+        imm32/64-fixed r30, place_to_jump_to_EXPECTED
+        mtctr r30
+        bctr
+      viz
+        <8 or 20 bytes generated by mkLoadImm_EXACTLY2or5>
+        7F C9 03 A6
+        4E 80 04 20
+   */
+   UChar* p = (UChar*)place_to_unchain;
+   vassert(0 == (3 & (HWord)p));
+   vassert(isLoadImm_EXACTLY2or5(p, /*r*/30,
+                                 Ptr_to_ULong(place_to_jump_to_EXPECTED),
+                                 mode64));
+   vassert(fetch32(p + (mode64 ? 20 : 8) + 0) == 0x7FC903A6);
+   vassert(fetch32(p + (mode64 ? 20 : 8) + 4) == 0x4E800420);
+   /* And what we want to change it to is:
+        imm32/64-fixed r30, disp_cp_chain_me
+        mtctr r30
+        bctrl
+      viz
+        <8 or 20 bytes generated by mkLoadImm_EXACTLY2or5>
+        7F C9 03 A6
+        4E 80 04 21
+      The replacement has the same length as the original.
+   */
+   p = mkLoadImm_EXACTLY2or5(p, /*r*/30,
+                             Ptr_to_ULong(disp_cp_chain_me), mode64);
+   p = emit32(p, 0x7FC903A6);
+   p = emit32(p, 0x4E800421);
+
+   Int len = p - (UChar*)place_to_unchain;
+   vassert(len == (mode64 ? 28 : 16)); /* stay sane */
+   VexInvalRange vir = {(HWord)place_to_unchain, len};
+   return vir;
+}
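Note that, apart from the loaded immediate, the chained and unchained
forms differ only in the LK bit of the final opcode: 0x4E800420 is bctr
(chained: jump straight to the destination) whereas 0x4E800421 is bctrl
(unchained: call the chain-me stub, which can then use the saved link
register to locate the patch site).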
+
+
+/* Patch the counter address into a profile inc point, as previously
+   created by the Pin_ProfInc case for emit_PPCInstr. */
+VexInvalRange patchProfInc_PPC ( void*  place_to_patch,
+                                 ULong* location_of_counter,
+                                 Bool   mode64 )
+{
+   UChar* p = (UChar*)place_to_patch;
+   vassert(0 == (3 & (HWord)p));
+
+   Int len = 0;
+   if (mode64) {
+      vassert(isLoadImm_EXACTLY2or5(p, /*r*/30,
+                                    0x6555655565556555ULL, True/*mode64*/));
+      vassert(fetch32(p + 20) == 0xEBBE0000);
+      vassert(fetch32(p + 24) == 0x3BBD0001);
+      vassert(fetch32(p + 28) == 0xFBBE0000);
+      p = mkLoadImm_EXACTLY2or5(p, /*r*/30,
+                                Ptr_to_ULong(location_of_counter),
+                                True/*mode64*/);
+      len = p - (UChar*)place_to_patch;
+      vassert(len == 20);
+   } else {
+      vassert(isLoadImm_EXACTLY2or5(p, /*r*/30,
+                                    0x65556555ULL, False/*!mode64*/));
+      vassert(fetch32(p +  8) == 0x83BE0004);
+      vassert(fetch32(p + 12) == 0x37BD0001);
+      vassert(fetch32(p + 16) == 0x93BE0004);
+      vassert(fetch32(p + 20) == 0x83BE0000);
+      vassert(fetch32(p + 24) == 0x7FBD0194);
+      vassert(fetch32(p + 28) == 0x93BE0000);
+      p = mkLoadImm_EXACTLY2or5(p, /*r*/30,
+                                Ptr_to_ULong(location_of_counter),
+                                False/*!mode64*/);
+      len = p - (UChar*)place_to_patch;
+      vassert(len == 8);
+   }
+   VexInvalRange vir = {(HWord)place_to_patch, len};
+   return vir;
+}
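Only the initial immediate-load is rewritten here (20 bytes on ppc64, 8
on ppc32); the load/increment/store tail, re-verified by the vasserts
above, is left untouched, which is why len covers just the patched
prefix in the returned invalidation range.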
+
+
 /*---------------------------------------------------------------*/
 /*--- end                                     host_ppc_defs.c ---*/
 /*---------------------------------------------------------------*/
diff --git a/priv/host_ppc_defs.h b/priv/host_ppc_defs.h
index c09d748..1a1f902 100644
--- a/priv/host_ppc_defs.h
+++ b/priv/host_ppc_defs.h
@@ -454,7 +454,9 @@
       Pin_MulL,       /* widening multiply */
       Pin_Div,        /* div */
       Pin_Call,       /* call to address in register */
-      Pin_Goto,       /* conditional/unconditional jmp to dst */
+      Pin_XDirect,    /* direct transfer to GA */
+      Pin_XIndir,     /* indirect transfer to GA */
+      Pin_XAssisted,  /* assisted transfer to GA */
       Pin_CMov,       /* conditional move */
       Pin_Load,       /* zero-extending load a 8|16|32|64 bit value from mem */
       Pin_LoadL,      /* load-linked (lwarx/ldarx) 32|64 bit value from mem */
@@ -503,6 +505,8 @@
                          * immediate value */
       Pin_DfpD128toD64, /* DFP 128 to DFP 64 op */
       Pin_DfpI64StoD128, /* DFP signed integer to DFP 128 */
+      Pin_EvCheck,    /* Event check */
+      Pin_ProfInc     /* 64-bit profile counter increment */
    }
    PPCInstrTag;
 
@@ -594,13 +598,30 @@
             Addr64      target;
             UInt        argiregs;
          } Call;
-         /* Pseudo-insn.  Goto dst, on given condition (which could be
-            Pct_ALWAYS). */
+         /* Update the guest CIA value, then exit requesting to chain
+            to it.  May be conditional.  Use of Addr64 in order to cope
+            with 64-bit hosts. */
          struct {
+            Addr64      dstGA;    /* next guest address */
+            PPCAMode*   amCIA;    /* amode in guest state for CIA */
+            PPCCondCode cond;     /* can be ALWAYS */
+            Bool        toFastEP; /* chain to the slow or fast point? */
+         } XDirect;
+         /* Boring transfer to a guest address not known at JIT time.
+            Not chainable.  May be conditional. */
+         struct {
+            HReg        dstGA;
+            PPCAMode*   amCIA;
+            PPCCondCode cond; /* can be ALWAYS */
+         } XIndir;
+         /* Assisted transfer to a guest address, most general case.
+            Not chainable.  May be conditional. */
+         struct {
+            HReg        dstGA;
+            PPCAMode*   amCIA;
+            PPCCondCode cond; /* can be ALWAYS */
             IRJumpKind  jk;
-            PPCCondCode cond;
-            PPCRI*      dst;
-         } Goto;
+         } XAssisted;
          /* Mov src to dst on the given condition, which may not
             be the bogus Pct_ALWAYS. */
          struct {
@@ -854,6 +875,15 @@
             HReg   dst_lo;
             HReg   src;
          } DfpI64StoD128;
+         struct {
+            PPCAMode* amCounter;
+            PPCAMode* amFailAddr;
+         } EvCheck;
+         struct {
+            /* No fields.  The address of the counter to inc is
+               installed later, post-translation, by patching it in,
+               as it is not known at translation time. */
+         } ProfInc;
       } Pin;
    }
    PPCInstr;
@@ -868,7 +898,12 @@
 extern PPCInstr* PPCInstr_MulL       ( Bool syned, Bool hi32, Bool sz32, HReg, HReg, HReg );
 extern PPCInstr* PPCInstr_Div        ( Bool extended, Bool syned, Bool sz32, HReg dst, HReg srcL, HReg srcR );
 extern PPCInstr* PPCInstr_Call       ( PPCCondCode, Addr64, UInt );
-extern PPCInstr* PPCInstr_Goto       ( IRJumpKind, PPCCondCode cond, PPCRI* dst );
+extern PPCInstr* PPCInstr_XDirect    ( Addr64 dstGA, PPCAMode* amCIA,
+                                       PPCCondCode cond, Bool toFastEP );
+extern PPCInstr* PPCInstr_XIndir     ( HReg dstGA, PPCAMode* amCIA,
+                                       PPCCondCode cond );
+extern PPCInstr* PPCInstr_XAssisted  ( HReg dstGA, PPCAMode* amCIA,
+                                       PPCCondCode cond, IRJumpKind jk );
 extern PPCInstr* PPCInstr_CMov       ( PPCCondCode, HReg dst, PPCRI* src );
 extern PPCInstr* PPCInstr_Load       ( UChar sz,
                                        HReg dst, PPCAMode* src, Bool mode64 );
@@ -928,6 +963,9 @@
                                          HReg dst_lo, HReg src_lo);
 extern PPCInstr* PPCInstr_DfpI64StoD128 ( PPCFpOp op, HReg dst_hi,
                                           HReg dst_lo, HReg src);
+extern PPCInstr* PPCInstr_EvCheck     ( PPCAMode* amCounter,
+                                        PPCAMode* amFailAddr );
+extern PPCInstr* PPCInstr_ProfInc     ( void );
 
 extern void ppPPCInstr(PPCInstr*, Bool mode64);
 
@@ -937,10 +975,13 @@
 extern void         getRegUsage_PPCInstr ( HRegUsage*, PPCInstr*, Bool mode64 );
 extern void         mapRegs_PPCInstr     ( HRegRemap*, PPCInstr* , Bool mode64);
 extern Bool         isMove_PPCInstr      ( PPCInstr*, HReg*, HReg* );
-extern Int          emit_PPCInstr        ( UChar* buf, Int nbuf, PPCInstr*, 
+extern Int          emit_PPCInstr        ( /*MB_MOD*/Bool* is_profInc,
+                                           UChar* buf, Int nbuf, PPCInstr* i, 
                                            Bool mode64,
-                                           void* dispatch_unassisted,
-                                           void* dispatch_assisted );
+                                           void* disp_cp_chain_me_to_slowEP,
+                                           void* disp_cp_chain_me_to_fastEP,
+                                           void* disp_cp_xindir,
+                                           void* disp_cp_xassisted );
 
 extern void genSpill_PPC  ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
                             HReg rreg, Int offsetB, Bool mode64 );
@@ -948,9 +989,37 @@
                             HReg rreg, Int offsetB, Bool mode64 );
 
 extern void         getAllocableRegs_PPC ( Int*, HReg**, Bool mode64 );
-extern HInstrArray* iselSB_PPC           ( IRSB*, VexArch,
-                                                  VexArchInfo*,
-                                                  VexAbiInfo* );
+extern HInstrArray* iselSB_PPC           ( IRSB*, 
+                                           VexArch,
+                                           VexArchInfo*,
+                                           VexAbiInfo*,
+                                           Int offs_Host_EvC_Counter,
+                                           Int offs_Host_EvC_FailAddr,
+                                           Bool chainingAllowed,
+                                           Bool addProfInc,
+                                           Addr64 max_ga );
+
+/* How big is an event check?  This is kind of a kludge because it
+   depends on the offsets of host_EvC_FAILADDR and
+   host_EvC_COUNTER. */
+extern Int evCheckSzB_PPC ( void );
+
+/* Perform a chaining and unchaining of an XDirect jump. */
+extern VexInvalRange chainXDirect_PPC ( void* place_to_chain,
+                                        void* disp_cp_chain_me_EXPECTED,
+                                        void* place_to_jump_to,
+                                        Bool  mode64 );
+
+extern VexInvalRange unchainXDirect_PPC ( void* place_to_unchain,
+                                          void* place_to_jump_to_EXPECTED,
+                                          void* disp_cp_chain_me,
+                                          Bool  mode64 );
+
+/* Patch the counter location into an existing ProfInc point. */
+extern VexInvalRange patchProfInc_PPC ( void*  place_to_patch,
+                                        ULong* location_of_counter,
+                                        Bool   mode64 );
+
 
 #endif /* ndef __VEX_HOST_PPC_DEFS_H */
 
diff --git a/priv/host_ppc_isel.c b/priv/host_ppc_isel.c
index be2b0b3..be10029 100644
--- a/priv/host_ppc_isel.c
+++ b/priv/host_ppc_isel.c
@@ -219,17 +219,20 @@
 
    - A mapping from IRTemp to HReg.  This tells the insn selector
      which virtual register(s) are associated with each IRTemp
-      temporary.  This is computed before insn selection starts, and
-      does not change.  We expect this mapping to map precisely the
-      same set of IRTemps as the type mapping does.
+     temporary.  This is computed before insn selection starts, and
+     does not change.  We expect this mapping to map precisely the
+     same set of IRTemps as the type mapping does.
  
-         - vregmap   holds the primary register for the IRTemp.
-         - vregmapHI holds the secondary register for the IRTemp,
+         - vregmapLo    holds the primary register for the IRTemp.
+         - vregmapMedLo holds the secondary register for the IRTemp,
               if any is needed.  That's only for Ity_I64 temps
               in 32 bit mode or Ity_I128 temps in 64-bit mode.
-
-    - The name of the vreg in which we stash a copy of the link reg,
-      so helper functions don't kill it.
+         - vregmapMedHi is only for dealing with Ity_I128 temps in
+              32 bit mode.  It holds bits 95:64 (Intel numbering)
+              of the IRTemp.
+         - vregmapHi is also only for dealing with Ity_I128 temps
+              in 32 bit mode.  It holds the most significant bits
+              (127:96 in Intel numbering) of the IRTemp.
 
     - The code array, that is, the insns selected so far.
  
@@ -248,11 +251,20 @@
       described in set_FPU_rounding_mode below.
 
     - A VexMiscInfo*, needed for knowing how to generate
-      function calls for this target
+      function calls for this target.
+
+    - The maximum guest address of any guest insn in this block.
+      Actually, the address of the highest-addressed byte from any
+      insn in this block.  Is set at the start and does not change.
+      This is used for detecting jumps which are definitely
+      forward-edges from this block, and therefore can be made
+      (chained) to the fast entry point of the destination, thereby
+      avoiding the destination's event check.
 */
  
 typedef
    struct {
+      /* Constant fields -- set at the start and never changed. */
       IRTypeEnv* type_env;
                               //    64-bit mode              32-bit mode
       HReg*    vregmapLo;     // Low 64-bits [63:0]    Low 32-bits     [31:0]
@@ -261,20 +273,21 @@
       HReg*    vregmapHi;     // unused                highest 32-bits [127:96]
       Int      n_vregmap;
 
-      HReg     savedLR;
-
-      HInstrArray* code;
- 
-      Int          vreg_ctr;
- 
       /* 27 Jan 06: Not currently used, but should be */
       UInt         hwcaps;
 
       Bool         mode64;
 
-      IRExpr*      previous_rm;
-
       VexAbiInfo*  vbi;
+
+      Bool         chainingAllowed;
+      Addr64       max_ga;
+
+      /* These are modified as we go along. */
+      HInstrArray* code;
+      Int          vreg_ctr;
+
+      IRExpr*      previous_rm;
    }
    ISelEnv;
  
@@ -4684,18 +4697,61 @@
 
    /* --------- EXIT --------- */
    case Ist_Exit: {
-      PPCRI*      ri_dst;
-      PPCCondCode cc;
-      IRConstTag tag = stmt->Ist.Exit.dst->tag;
-      if (!mode64 && (tag != Ico_U32))
+      IRConst* dst = stmt->Ist.Exit.dst;
+      if (!mode64 && dst->tag != Ico_U32)
          vpanic("iselStmt(ppc): Ist_Exit: dst is not a 32-bit value");
-      if (mode64 && (tag != Ico_U64))
+      if (mode64 && dst->tag != Ico_U64)
          vpanic("iselStmt(ppc64): Ist_Exit: dst is not a 64-bit value");
-      ri_dst = iselWordExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
-      cc     = iselCondCode(env,stmt->Ist.Exit.guard);
-      addInstr(env, PPCInstr_RdWrLR(True, env->savedLR));
-      addInstr(env, PPCInstr_Goto(stmt->Ist.Exit.jk, cc, ri_dst));
-      return;
+
+      PPCCondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
+      PPCAMode*   amCIA = PPCAMode_IR(stmt->Ist.Exit.offsIP,
+                                      hregPPC_GPR31(mode64));
+
+      /* Case: boring transfer to known address */
+      if (stmt->Ist.Exit.jk == Ijk_Boring
+          || stmt->Ist.Exit.jk == Ijk_Call
+          /* || stmt->Ist.Exit.jk == Ijk_Ret */) {
+         if (env->chainingAllowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool toFastEP
+               = mode64
+               ? (((Addr64)stmt->Ist.Exit.dst->Ico.U64) > (Addr64)env->max_ga)
+               : (((Addr32)stmt->Ist.Exit.dst->Ico.U32) > (Addr32)env->max_ga);
+            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
+            addInstr(env, PPCInstr_XDirect(
+                             mode64 ? (Addr64)stmt->Ist.Exit.dst->Ico.U64
+                                    : (Addr64)stmt->Ist.Exit.dst->Ico.U32,
+                             amCIA, cc, toFastEP));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg r = iselWordExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, PPCInstr_XAssisted(r, amCIA, cc, Ijk_Boring));
+         }
+         return;
+      }
+
+      /* Case: assisted transfer to arbitrary address */
+      switch (stmt->Ist.Exit.jk) {
+         //case Ijk_MapFail:
+         //case Ijk_SigSEGV: case Ijk_TInval: case Ijk_EmWarn:
+         case Ijk_NoDecode: case Ijk_SigBUS: case Ijk_SigTRAP:
+         case Ijk_EmFail:
+         {
+            HReg r = iselWordExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, PPCInstr_XAssisted(r, amCIA, cc,
+                                             stmt->Ist.Exit.jk));
+            return;
+         }
+         default:
+            break;
+      }
+
+      /* Do we ever expect to see any other kind? */
+      goto stmt_fail;
    }
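The toFastEP test is a simple forward-edge heuristic: max_ga is the
highest guest address in this block, so a constant target strictly
greater than it cannot jump back into the block, and the destination's
event check can safely be skipped by chaining to its fast entry point.
Potential back edges always take the slow entry point, so the event
counter is still decremented on every loop iteration.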
 
    default: break;
@@ -4710,21 +4766,91 @@
 /*--- ISEL: Basic block terminators (Nexts)             ---*/
 /*---------------------------------------------------------*/
 
-static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
+static void iselNext ( ISelEnv* env,
+                       IRExpr* next, IRJumpKind jk, Int offsIP )
 {
-   PPCCondCode cond;
-   PPCRI* ri;
    if (vex_traceflags & VEX_TRACE_VCODE) {
-      vex_printf("\n-- goto {");
+      vex_printf( "\n-- PUT(%d) = ", offsIP);
+      ppIRExpr( next );
+      vex_printf( "; exit-");
       ppIRJumpKind(jk);
-      vex_printf("} ");
-      ppIRExpr(next);
-      vex_printf("\n");
+      vex_printf( "\n");
    }
-   cond = mk_PPCCondCode( Pct_ALWAYS, Pcf_NONE );
-   ri = iselWordExpr_RI(env, next);
-   addInstr(env, PPCInstr_RdWrLR(True, env->savedLR));
-   addInstr(env, PPCInstr_Goto(jk, cond, ri));
+
+   PPCCondCode always = mk_PPCCondCode( Pct_ALWAYS, Pcf_NONE );
+
+   /* Case: boring transfer to known address */
+   if (next->tag == Iex_Const) {
+      IRConst* cdst = next->Iex.Const.con;
+      vassert(cdst->tag == (env->mode64 ? Ico_U64 : Ico_U32));
+      if (jk == Ijk_Boring || jk == Ijk_Call) {
+         /* Boring transfer to known address */
+         PPCAMode* amCIA = PPCAMode_IR(offsIP, hregPPC_GPR31(env->mode64));
+         if (env->chainingAllowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool toFastEP
+               = env->mode64
+               ? (((Addr64)cdst->Ico.U64) > (Addr64)env->max_ga)
+               : (((Addr32)cdst->Ico.U32) > (Addr32)env->max_ga);
+            if (0) vex_printf("%s", toFastEP ? "X" : ".");
+            addInstr(env, PPCInstr_XDirect(
+                             env->mode64 ? (Addr64)cdst->Ico.U64
+                                         : (Addr64)cdst->Ico.U32,
+                             amCIA, always, toFastEP));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg r = iselWordExpr_R(env, next);
+            addInstr(env, PPCInstr_XAssisted(r, amCIA, always,
+                                             Ijk_Boring));
+         }
+         return;
+      }
+   }
+
+   /* Case: call/return (==boring) transfer to any address */
+   switch (jk) {
+      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
+         HReg       r     = iselWordExpr_R(env, next);
+         PPCAMode*  amCIA = PPCAMode_IR(offsIP, hregPPC_GPR31(env->mode64));
+         if (env->chainingAllowed) {
+            addInstr(env, PPCInstr_XIndir(r, amCIA, always));
+         } else {
+            addInstr(env, PPCInstr_XAssisted(r, amCIA, always,
+                                             Ijk_Boring));
+         }
+         return;
+      }
+      default:
+         break;
+   }
+
+   /* Case: some other kind of transfer to any address */
+   switch (jk) {
+      case Ijk_Sys_syscall: case Ijk_ClientReq: case Ijk_NoDecode:
+      case Ijk_EmWarn: case Ijk_SigTRAP: case Ijk_TInval:
+      case Ijk_NoRedir:
+      //case Ijk_Sys_int128: 
+      //case Ijk_Yield:
+      {
+         HReg      r     = iselWordExpr_R(env, next);
+         PPCAMode* amCIA = PPCAMode_IR(offsIP, hregPPC_GPR31(env->mode64));
+         addInstr(env, PPCInstr_XAssisted(r, amCIA, always, jk));
+         return;
+      }
+      default:
+         break;
+   }
+
+   vex_printf( "\n-- PUT(%d) = ", offsIP);
+   ppIRExpr( next );
+   vex_printf( "; exit-");
+   ppIRJumpKind(jk);
+   vex_printf( "\n");
+   vassert(0); // are we expecting any other kind?
 }
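Taken together, this and the Ist_Exit case above implement a three-tier
scheme: XDirect for constant boring/call targets when chaining is
allowed (patchable and hence chainable), XIndir for boring/call/return
transfers to computed addresses, and XAssisted for every jump kind that
needs the dispatcher's help.  With chaining disabled (as for no-redir
translations) everything degrades to XAssisted, since such translations
must always return to the scheduler.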
 
 
@@ -4732,20 +4858,29 @@
 /*--- Insn selector top-level                           ---*/
 /*---------------------------------------------------------*/
 
-/* Translate an entire BS to ppc code. */
-HInstrArray* iselSB_PPC ( IRSB* bb, VexArch      arch_host,
-                                    VexArchInfo* archinfo_host,
-                                    VexAbiInfo*  vbi )
+/* Translate an entire SB to ppc code. */
+HInstrArray* iselSB_PPC ( IRSB* bb, 
+                          VexArch      arch_host,
+                          VexArchInfo* archinfo_host,
+                          VexAbiInfo*  vbi,
+                          Int offs_Host_EvC_Counter,
+                          Int offs_Host_EvC_FailAddr,
+                          Bool chainingAllowed,
+                          Bool addProfInc,
+                          Addr64 max_ga )
 {
-   Int      i, j;
-   HReg     hregLo, hregMedLo, hregMedHi, hregHi;
-   ISelEnv* env;
-   UInt     hwcaps_host = archinfo_host->hwcaps;
-   Bool     mode64 = False;
-   UInt     mask32, mask64;
+   Int       i, j;
+   HReg      hregLo, hregMedLo, hregMedHi, hregHi;
+   ISelEnv*  env;
+   UInt      hwcaps_host = archinfo_host->hwcaps;
+   Bool      mode64 = False;
+   UInt      mask32, mask64;
+   PPCAMode *amCounter, *amFailAddr;
+
 
    vassert(arch_host == VexArchPPC32 || arch_host == VexArchPPC64);
    mode64 = arch_host == VexArchPPC64;
+   if (!mode64) vassert(max_ga <= 0xFFFFFFFFULL);
 
    /* do some sanity checks */
    mask32 = VEX_HWCAPS_PPC32_F | VEX_HWCAPS_PPC32_V
@@ -4783,15 +4918,20 @@
    env->n_vregmap = bb->tyenv->types_used;
    env->vregmapLo    = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
    env->vregmapMedLo = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
-   if (!mode64) {
+   if (mode64) {
+      env->vregmapMedHi = NULL;
+      env->vregmapHi    = NULL;
+   } else {
       env->vregmapMedHi = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
       env->vregmapHi    = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
    }
 
    /* and finally ... */
-   env->hwcaps      = hwcaps_host;
-   env->previous_rm = NULL;
-   env->vbi         = vbi;
+   env->chainingAllowed = chainingAllowed;
+   env->max_ga          = max_ga;
+   env->hwcaps          = hwcaps_host;
+   env->previous_rm     = NULL;
+   env->vbi             = vbi;
 
    /* For each IR temporary, allocate a suitably-kinded virtual
       register. */
@@ -4838,16 +4978,24 @@
    }
    env->vreg_ctr = j;
 
-   /* Keep a copy of the link reg, so helper functions don't kill it. */
-   env->savedLR = newVRegI(env);
-   addInstr(env, PPCInstr_RdWrLR(False, env->savedLR));
+   /* The very first instruction must be an event check. */
+   amCounter  = PPCAMode_IR(offs_Host_EvC_Counter, hregPPC_GPR31(mode64));
+   amFailAddr = PPCAMode_IR(offs_Host_EvC_FailAddr, hregPPC_GPR31(mode64));
+   addInstr(env, PPCInstr_EvCheck(amCounter, amFailAddr));
+
+   /* Possibly a block counter increment (for profiling).  At this
+      point we don't know the address of the counter, so just pretend
+      it is zero.  It will have to be patched later, but before this
+      translation is used, by a call to LibVEX_patchProfCtr. */
+   if (addProfInc) {
+      addInstr(env, PPCInstr_ProfInc());
+   }
 
    /* Ok, finally we can iterate over the statements. */
    for (i = 0; i < bb->stmts_used; i++)
-      if (bb->stmts[i])
-         iselStmt(env,bb->stmts[i]);
+      iselStmt(env, bb->stmts[i]);
 
-   iselNext(env,bb->next,bb->jumpkind);
+   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
 
    /* record the number of vregs we used. */
    env->code->n_vregs = env->vreg_ctr;
diff --git a/priv/host_s390_defs.c b/priv/host_s390_defs.c
index 98183a8..f46a1be 100644
--- a/priv/host_s390_defs.c
+++ b/priv/host_s390_defs.c
@@ -59,6 +59,7 @@
 static Bool s390_insn_is_reg_reg_move(const s390_insn *, HReg *src, HReg *dst);
 static void s390_insn_map_regs(HRegRemap *, s390_insn *);
 static void s390_insn_get_reg_usage(HRegUsage *u, const s390_insn *);
+static UInt s390_tchain_load64_len(void);
 
 
 /*------------------------------------------------------------*/
@@ -118,7 +119,7 @@
    /* Total number of allocable registers (all classes) */
    *nregs =  16 /* GPRs */
       -  1 /* r0 */
-      -  1 /* r12 register holding VG_(dispatch_ctr) */
+      -  1 /* r12 scratch register for translation chaining support */
       -  1 /* r13 guest state pointer */
       -  1 /* r14 link register */
       -  1 /* r15 stack pointer */
@@ -144,12 +145,8 @@
       Otherwise, they are available to the allocator */
    (*arr)[i++] = mkHReg(10, HRcInt64, False);
    (*arr)[i++] = mkHReg(11, HRcInt64, False);
-   /* GPR12 is not available because it caches VG_(dispatch_ctr).
-      Setting aside a register for the counter gives slightly better
-      performance - most of the time. From the 10 tests in "make perf"
-      8 run faster with a max observed speedup of 2.6% for bz2. ffbench
-      is the counter example. It runs 1.3% faster without the dedicated
-      register. */
+   /* GPR12 is not available because it is used as a scratch register
+      for translation chaining. */
    /* GPR13 is not available because it is used as guest state pointer */
    /* GPR14 is not available because it is used as link register */
    /* GPR15 is not available because it is used as stack pointer */
@@ -183,6 +180,7 @@
    return mkHReg(S390_REGNO_GUEST_STATE_POINTER, HRcInt64, False);
 }
 
+
 /* Is VALUE within the domain of a 20-bit signed integer. */
 static __inline__ Bool
 fits_signed_20bit(Int value)
@@ -617,14 +615,6 @@
       s390_opnd_RMI_get_reg_usage(u, insn->variant.compare.src2);
       break;
 
-   case S390_INSN_BRANCH:
-      s390_opnd_RMI_get_reg_usage(u, insn->variant.branch.dst);
-      /* The destination address is loaded into S390_REGNO_RETURN_VALUE.
-         See s390_insn_branch_emit. */
-      addHRegUse(u, HRmWrite,
-                 mkHReg(S390_REGNO_RETURN_VALUE, HRcInt64, False));
-      break;
-
    case S390_INSN_HELPER_CALL: {
       UInt i;
 
@@ -718,6 +708,29 @@
    case S390_INSN_GADD:
       break;
 
+   case S390_INSN_EVCHECK:
+      s390_amode_get_reg_usage(u, insn->variant.evcheck.counter);
+      s390_amode_get_reg_usage(u, insn->variant.evcheck.fail_addr);
+      break;
+
+   case S390_INSN_PROFINC:
+      /* Does not use any register visible to the register allocator */
+      break;
+
+   case S390_INSN_XDIRECT:
+      s390_amode_get_reg_usage(u, insn->variant.xdirect.guest_IA);
+      break;
+
+   case S390_INSN_XINDIR:
+      addHRegUse(u, HRmRead, insn->variant.xindir.dst);
+      s390_amode_get_reg_usage(u, insn->variant.xindir.guest_IA);
+      break;
+
+   case S390_INSN_XASSISTED:
+      addHRegUse(u, HRmRead, insn->variant.xassisted.dst);
+      s390_amode_get_reg_usage(u, insn->variant.xassisted.guest_IA);
+      break;
+
    default:
       vpanic("s390_insn_get_reg_usage");
    }
@@ -829,11 +842,6 @@
       s390_opnd_RMI_map_regs(m, &insn->variant.compare.src2);
       break;
 
-   case S390_INSN_BRANCH:
-      s390_opnd_RMI_map_regs(m, &insn->variant.branch.dst);
-      /* No need to map S390_REGNO_RETURN_VALUE. It's not virtual */
-      break;
-
    case S390_INSN_HELPER_CALL:
       /* s390_insn_helper_call_emit also reads / writes the link register
          and stack pointer. But those registers are not visible to the
@@ -923,6 +931,31 @@
    case S390_INSN_GADD:
       break;
 
+   case S390_INSN_EVCHECK:
+      s390_amode_map_regs(m, insn->variant.evcheck.counter);
+      s390_amode_map_regs(m, insn->variant.evcheck.fail_addr);
+      break;
+
+   case S390_INSN_PROFINC:
+      /* Does not use any register visible to the register allocator */
+      break;
+
+   case S390_INSN_XDIRECT:
+      s390_amode_map_regs(m, insn->variant.xdirect.guest_IA);
+      break;
+
+   case S390_INSN_XINDIR:
+      s390_amode_map_regs(m, insn->variant.xindir.guest_IA);
+      insn->variant.xindir.dst =
+         lookupHRegRemap(m, insn->variant.xindir.dst);
+      break;
+
+   case S390_INSN_XASSISTED:
+      s390_amode_map_regs(m, insn->variant.xassisted.guest_IA);
+      insn->variant.xassisted.dst =
+         lookupHRegRemap(m, insn->variant.xassisted.dst);
+      break;
+
    default:
       vpanic("s390_insn_map_regs");
    }
@@ -1403,6 +1436,16 @@
 
 
 static UChar *
+s390_emit_BRCL(UChar *p, UChar r1, ULong i2)
+{
+   if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM))
+      s390_disasm(ENC2(XMNM, PCREL), S390_XMNM_BRCL, r1, i2);
+
+   return emit_RIL(p, 0xc00400000000ULL, r1, i2);
+}
+
+
+static UChar *
 s390_emit_CR(UChar *p, UChar r1, UChar r2)
 {
    if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM))
@@ -4252,21 +4295,6 @@
 
 
 s390_insn *
-s390_insn_branch(IRJumpKind kind, s390_cc_t cond, s390_opnd_RMI dst)
-{
-   s390_insn *insn = LibVEX_Alloc(sizeof(s390_insn));
-
-   insn->tag  = S390_INSN_BRANCH;
-   insn->size = 0;  /* does not matter */
-   insn->variant.branch.kind = kind;
-   insn->variant.branch.dst  = dst;
-   insn->variant.branch.cond = cond;
-
-   return insn;
-}
-
-
-s390_insn *
 s390_insn_helper_call(s390_cc_t cond, Addr64 target, UInt num_args,
                       HChar *name)
 {
@@ -4489,6 +4517,89 @@
 }
 
 
+s390_insn *
+s390_insn_xdirect(s390_cc_t cond, Addr64 dst, s390_amode *guest_IA,
+                  Bool to_fast_entry)
+{
+   s390_insn *insn = LibVEX_Alloc(sizeof(s390_insn));
+
+   insn->tag  = S390_INSN_XDIRECT;
+   insn->size = 0;   /* does not matter */
+
+   insn->variant.xdirect.cond = cond;
+   insn->variant.xdirect.dst = dst;
+   insn->variant.xdirect.guest_IA = guest_IA;
+   insn->variant.xdirect.to_fast_entry = to_fast_entry;
+
+   return insn;
+}
+
+
+s390_insn *
+s390_insn_xindir(s390_cc_t cond, HReg dst, s390_amode *guest_IA)
+{
+   s390_insn *insn = LibVEX_Alloc(sizeof(s390_insn));
+
+   insn->tag  = S390_INSN_XINDIR;
+   insn->size = 0;   /* does not matter */
+
+   insn->variant.xindir.cond = cond;
+   insn->variant.xindir.dst = dst;
+   insn->variant.xindir.guest_IA = guest_IA;
+
+   return insn;
+}
+
+
+s390_insn *
+s390_insn_xassisted(s390_cc_t cond, HReg dst, s390_amode *guest_IA,
+                    IRJumpKind kind)
+{
+   s390_insn *insn = LibVEX_Alloc(sizeof(s390_insn));
+
+   insn->tag  = S390_INSN_XASSISTED;
+   insn->size = 0;   /* does not matter */
+
+   insn->variant.xassisted.cond = cond;
+   insn->variant.xassisted.dst = dst;
+   insn->variant.xassisted.guest_IA = guest_IA;
+   insn->variant.xassisted.kind = kind;
+
+   return insn;
+}
+
+
+s390_insn *
+s390_insn_evcheck(s390_amode *counter, s390_amode *fail_addr)
+{
+   s390_insn *insn = LibVEX_Alloc(sizeof(s390_insn));
+
+   vassert(counter->tag == S390_AMODE_B12 || counter->tag == S390_AMODE_BX12);
+   vassert(fail_addr->tag == S390_AMODE_B12 ||
+           fail_addr->tag == S390_AMODE_BX12);
+
+   insn->tag  = S390_INSN_EVCHECK;
+   insn->size = 0;   /* does not matter */
+
+   insn->variant.evcheck.counter = counter;
+   insn->variant.evcheck.fail_addr = fail_addr;
+
+   return insn;
+}
+
+
+s390_insn *
+s390_insn_profinc(void)
+{
+   s390_insn *insn = LibVEX_Alloc(sizeof(s390_insn));
+
+   insn->tag  = S390_INSN_PROFINC;
+   insn->size = 0;   /* does not matter */
+
+   return insn;
+}
+
+
 /*---------------------------------------------------------------*/
 /*--- Debug print                                             ---*/
 /*---------------------------------------------------------------*/
@@ -4792,11 +4903,6 @@
                    &insn->variant.compare.src2);
       break;
 
-   case S390_INSN_BRANCH:
-      s390_sprintf(buf, "if (%C) %J %O", insn->variant.branch.cond,
-                   insn->variant.branch.kind, &insn->variant.branch.dst);
-      return buf;   /* avoid printing "size = ..." which is meaningless */
-
    case S390_INSN_HELPER_CALL: {
       s390_sprintf(buf, "%M if (%C) %s{%I}(%L)", "v-call",
                    insn->variant.helper_call.cond,
@@ -4924,6 +5030,39 @@
                    insn->variant.gadd.value);
       break;
 
+   case S390_INSN_EVCHECK:
+      s390_sprintf(buf, "%M counter = %A, fail-addr = %A", "v-evcheck",
+                   insn->variant.evcheck.counter,
+                   insn->variant.evcheck.fail_addr);
+      return buf;   /* avoid printing "size = ..." which is meaningless */
+
+   case S390_INSN_PROFINC:
+      s390_sprintf(buf, "%M", "v-profinc");
+      return buf;   /* avoid printing "size = ..." which is meaningless */
+
+   case S390_INSN_XDIRECT:
+      s390_sprintf(buf, "%M if (%C) %A = %I  %s", "v-xdirect",
+                   insn->variant.xdirect.cond,
+                   insn->variant.xdirect.guest_IA,
+                   insn->variant.xdirect.dst,
+                   insn->variant.xdirect.to_fast_entry ? "fast" : "slow");
+      return buf;   /* avoid printing "size = ..." which is meaningless */
+
+   case S390_INSN_XINDIR:
+      s390_sprintf(buf, "%M if (%C) %A = %R", "v-xindir",
+                   insn->variant.xindir.cond,
+                   insn->variant.xindir.guest_IA,
+                   insn->variant.xindir.dst);
+      return buf;   /* avoid printing "size = ..." which is meaningless */
+
+   case S390_INSN_XASSISTED:
+      s390_sprintf(buf, "%M if (%C) %J %A = %R", "v-xassisted",
+                   insn->variant.xassisted.cond,
+                   insn->variant.xassisted.kind,
+                   insn->variant.xassisted.guest_IA,
+                   insn->variant.xassisted.dst);
+      return buf;   /* avoid printing "size = ..." which is meaningless */
+
    default: goto fail;
    }
 
@@ -6512,104 +6651,6 @@
 
 
 static UChar *
-s390_insn_branch_emit(UChar *buf, const s390_insn *insn)
-{
-   s390_opnd_RMI dst;
-   s390_cc_t cond;
-   UInt       trc;
-   UChar *p, *ptmp = 0;  /* avoid compiler warnings */
-
-   cond = insn->variant.branch.cond;
-   dst  = insn->variant.branch.dst;
-
-   p = buf;
-   trc = 0;
-
-   if (cond != S390_CC_ALWAYS) {
-      /* So we have something like this
-         if (cond) goto X;
-         Y: ...
-         We convert this into
-         if (! cond) goto Y;        // BRC insn; 4 bytes
-         return_reg = X;
-         return to dispatcher
-         Y:
-      */
-      ptmp = p; /* 4 bytes (a BRC insn) to be filled in here */
-      p += 4;
-   }
-
-   /* If a non-boring, set guest-state-pointer appropriately. */
-
-   switch (insn->variant.branch.kind) {
-   case Ijk_ClientReq:   trc = VEX_TRC_JMP_CLIENTREQ;   break;
-   case Ijk_Sys_syscall: trc = VEX_TRC_JMP_SYS_SYSCALL; break;
-   case Ijk_Yield:       trc = VEX_TRC_JMP_YIELD;       break;
-   case Ijk_EmWarn:      trc = VEX_TRC_JMP_EMWARN;      break;
-   case Ijk_EmFail:      trc = VEX_TRC_JMP_EMFAIL;      break;
-   case Ijk_MapFail:     trc = VEX_TRC_JMP_MAPFAIL;     break;
-   case Ijk_NoDecode:    trc = VEX_TRC_JMP_NODECODE;    break;
-   case Ijk_TInval:      trc = VEX_TRC_JMP_TINVAL;      break;
-   case Ijk_NoRedir:     trc = VEX_TRC_JMP_NOREDIR;     break;
-   case Ijk_SigTRAP:     trc = VEX_TRC_JMP_SIGTRAP;     break;
-   case Ijk_Ret:         trc = 0; break;
-   case Ijk_Call:        trc = 0; break;
-   case Ijk_Boring:      trc = 0; break;
-      break;
-
-   default:
-      vpanic("s390_insn_branch_emit: unknown jump kind");
-   }
-
-   /* Get the destination address into the return register */
-   switch (dst.tag) {
-   case S390_OPND_REG:
-      p = s390_emit_LGR(p, S390_REGNO_RETURN_VALUE, hregNumber(dst.variant.reg));
-      break;
-
-   case S390_OPND_AMODE: {
-      const s390_amode *am = dst.variant.am;
-      UChar b = hregNumber(am->b);
-      UChar x = hregNumber(am->x);
-      Int   d = am->d;
-
-      p = s390_emit_LG(p, S390_REGNO_RETURN_VALUE, x, b, DISP20(d));
-      break;
-   }
-
-   case S390_OPND_IMMEDIATE:
-      p = s390_emit_load_64imm(p, S390_REGNO_RETURN_VALUE, dst.variant.imm);
-      break;
-
-   default:
-      goto fail;
-   }
-
-   if (trc != 0) {
-      /* Something special. Set guest-state pointer appropriately */
-      p = s390_emit_LGHI(p, S390_REGNO_GUEST_STATE_POINTER, trc);
-   } else {
-      /* Nothing special needs to be done for calls and returns. */
-   }
-
-   p = s390_emit_BCR(p, S390_CC_ALWAYS, S390_REGNO_LINK_REGISTER);
-
-   if (cond != S390_CC_ALWAYS) {
-      Int delta = p - ptmp;
-
-      delta >>= 1;  /* immediate constant is #half-words */
-      vassert(delta > 0 && delta < (1 << 16));
-      s390_emit_BRC(ptmp, s390_cc_invert(cond), delta);
-   }
-
-   return p;
-
- fail:
-   vpanic("s390_insn_branch_emit");
-}
-
-
-static UChar *
 s390_insn_helper_call_emit(UChar *buf, const s390_insn *insn)
 {
    s390_cc_t cond;
@@ -7163,9 +7204,415 @@
 }
 
 
+/* Convenience functions needed for translation chaining.  Any change
+   to one of them must be applied to the others in concert. */
+
+static __inline__ Bool
+s390_insn_is_BRCL(const UChar *p, UChar condition)
+{
+   return p[0] == 0xc0 && p[1] == ((condition << 4) | 0x04);
+}
+
+static __inline__ Bool
+s390_insn_is_BR(const UChar *p, UChar reg)
+{
+   return p[0] == 0x07 && p[1] == (0xF0 | reg);  /* BCR 15,reg */
+}
+
+static __inline__ Bool
+s390_insn_is_BASR(const UChar *p, UChar link_reg, UChar other_reg)
+{
+   return p[0] == 0x0D && p[1] == ((link_reg << 4) | other_reg);
+}
+
+/* Load the 64-bit VALUE into REG. Note that this function must NOT
+   optimise the generated code by looking at the value. I.e. using
+   LGHI if value == 0 would be very wrong.
+   fixs390: Do it in a way that works everywhere for now. */
+static UChar *
+s390_tchain_load64(UChar *buf, UChar regno, ULong value)
+{
+   UChar *begin = buf;
+
+   buf = s390_emit_IILL(buf, regno, value & 0xFFFF);
+   value >>= 16;
+   buf = s390_emit_IILH(buf, regno, value & 0xFFFF);
+   value >>= 16;
+   buf = s390_emit_IIHL(buf, regno, value & 0xFFFF);
+   value >>= 16;
+   buf = s390_emit_IIHH(buf, regno, value & 0xFFFF);
+
+   vassert(buf - begin == s390_tchain_load64_len());
+
+   return buf;
+}
+
+/* Return number of bytes generated by s390_tchain_load64 */
+static UInt
+s390_tchain_load64_len(void)
+{
+   return S390_TCHAIN_LOAD64_LEN;
+}
+
+/* Verify that CODE is the code sequence generated by s390_tchain_load64
+   to load VALUE into REGNO. Return pointer to the byte following the
+   insn sequence. */
+static const UChar *
+s390_tchain_verify_load64(const UChar *code, UChar regno, ULong value)
+{
+   UInt regmask = regno << 4;
+   UInt hw;
+
+   /* Check for IILL */
+   hw = value & 0xFFFF;
+   vassert(code[0]  ==  0xA5);
+   vassert(code[1]  == (0x03 | regmask));
+   vassert(code[2]  == (hw >> 8));
+   vassert(code[3]  == (hw & 0xFF));
+
+   /* Check for IILH */
+   hw = (value >> 16) & 0xFFFF;
+   vassert(code[4]  ==  0xA5);
+   vassert(code[5]  == (0x02 | regmask));
+   vassert(code[6]  == (hw >> 8));
+   vassert(code[7]  == (hw & 0xFF));
+
+   /* Check for IIHL */
+   hw = (value >> 32) & 0xFFFF;
+   vassert(code[8]  ==  0xA5);
+   vassert(code[9]  == (0x01 | regmask));
+   vassert(code[10] == (hw >> 8));
+   vassert(code[11] == (hw & 0xFF));
+
+   /* Check for IIHH */
+   hw = (value >> 48) & 0xFFFF;
+   vassert(code[12] ==  0xA5);
+   vassert(code[13] == (0x00 | regmask));
+   vassert(code[14] == (hw >> 8));
+   vassert(code[15] == (hw & 0xFF));
+
+   return code + s390_tchain_load64_len();
+}
+
+/* CODE points to the code sequence as generated by s390_tchain_load64.
+   Change the loaded value to VALUE. Return pointer to the byte following
+   the patched code sequence. */
+static UChar *
+s390_tchain_patch_load64(UChar *code, ULong imm64)
+{
+   code[3]  = imm64 & 0xFF; imm64 >>= 8;
+   code[2]  = imm64 & 0xFF; imm64 >>= 8;
+   code[7]  = imm64 & 0xFF; imm64 >>= 8;
+   code[6]  = imm64 & 0xFF; imm64 >>= 8;
+   code[11] = imm64 & 0xFF; imm64 >>= 8;
+   code[10] = imm64 & 0xFF; imm64 >>= 8;
+   code[15] = imm64 & 0xFF; imm64 >>= 8;
+   code[14] = imm64 & 0xFF; imm64 >>= 8;
+
+   return code + s390_tchain_load64_len();
+}
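All three of s390_tchain_load64, s390_tchain_verify_load64 and
s390_tchain_patch_load64 depend on the same fixed 16-byte layout: four
4-byte insns (IILL, IILH, IIHL, IIHH), each carrying one 16-bit
halfword of the value in big-endian byte order, lowest halfword first.
A self-contained sketch of that layout (hypothetical helper, for
illustration only):

   #include <stdint.h>

   static void sketch_load64_bytes(uint8_t code[16], unsigned regno,
                                   uint64_t value)
   {
      /* second opcode byte: IILL=0x03, IILH=0x02, IIHL=0x01, IIHH=0x00 */
      static const uint8_t op2[4] = { 0x03, 0x02, 0x01, 0x00 };
      for (int i = 0; i < 4; i++) {
         uint16_t hw = (value >> (16 * i)) & 0xFFFF;
         code[4*i + 0] = 0xA5;
         code[4*i + 1] = op2[i] | (regno << 4);
         code[4*i + 2] = hw >> 8;    /* immediate, big-endian */
         code[4*i + 3] = hw & 0xFF;
      }
   }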
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   chainXDirect_S390 and unchainXDirect_S390 below. */
+static UChar *
+s390_insn_xdirect_emit(UChar *buf, const s390_insn *insn,
+                       void *disp_cp_chain_me_to_slowEP,
+                       void *disp_cp_chain_me_to_fastEP)
+{
+   /* We're generating chain-me requests here, so we need to be
+      sure this is actually allowed -- no-redir translations can't
+      use chain-me's.  Hence: */
+   vassert(disp_cp_chain_me_to_slowEP != NULL);
+   vassert(disp_cp_chain_me_to_fastEP != NULL);
+
+   /* Use ptmp for backpatching conditional jumps. */
+   UChar *ptmp = buf;
+
+   /* First off, if this is conditional, create a conditional
+      jump over the rest of it. */
+   s390_cc_t cond = insn->variant.xdirect.cond;
+
+   if (cond != S390_CC_ALWAYS) {
+      /* So we have something like this
+         if (cond) do_xdirect;
+         Y: ...
+         We convert this into
+         if (! cond) goto Y;        // BRC opcode; 4 bytes
+         do_xdirect;
+         Y:
+      */
+      /* 4 bytes (a BRC insn) to be filled in here */
+      buf += 4;
+   }
+
+   /* Update the guest IA. */
+   buf = s390_emit_load_64imm(buf, R0, insn->variant.xdirect.dst);
+
+   const s390_amode *amode = insn->variant.xdirect.guest_IA;
+   vassert(amode->tag == S390_AMODE_B12 || amode->tag == S390_AMODE_BX12);
+   UInt b = hregNumber(amode->b);
+   UInt x = hregNumber(amode->x);  /* 0 for B12 and B20 */
+   UInt d = amode->d;
+
+   buf = s390_emit_STG(buf, R0, x, b, DISP20(d));
+
+   /* --- FIRST PATCHABLE BYTE follows --- */
+   /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
+      to) backs up the return address, so as to find the address of
+      the first patchable byte.  So: don't change the length of the
+      two instructions below. */
+
+   /* Load the chosen entry point into the scratch reg */
+   void *disp_cp_chain_me;
+
+   disp_cp_chain_me =
+      insn->variant.xdirect.to_fast_entry ? disp_cp_chain_me_to_fastEP 
+                                          : disp_cp_chain_me_to_slowEP;
+
+   ULong addr = Ptr_to_ULong(disp_cp_chain_me);
+   buf = s390_tchain_load64(buf, S390_REGNO_TCHAIN_SCRATCH, addr);
+
+   /* call *tchain_scratch */
+   buf = s390_emit_BASR(buf, 1, S390_REGNO_TCHAIN_SCRATCH);
+
+   /* --- END of PATCHABLE BYTES --- */
+
+   /* Fix up the conditional jump, if there was one. */
+   if (cond != S390_CC_ALWAYS) {
+      Int delta = buf - ptmp;
+
+      delta >>= 1;  /* immediate constant is #half-words */
+      vassert(delta > 0 && delta < (1 << 16));
+      s390_emit_BRC(ptmp, s390_cc_invert(cond), delta);
+   }
+
+   return buf;
+}
+
+/* Return the number of patchable bytes from an xdirect insn. */
+static UInt
+s390_xdirect_patchable_len(void)
+{
+   return s390_tchain_load64_len() + S390_TCHAIN_CALL_LEN;
+}
+
+
+static UChar *
+s390_insn_xindir_emit(UChar *buf, const s390_insn *insn, void *disp_cp_xindir)
+{
+   /* We're generating transfers that could lead indirectly to a
+      chain-me, so we need to be sure this is actually allowed --
+      no-redir translations are not allowed to reach normal
+      translations without going through the scheduler.  That means
+      no XDirects or XIndirs out from no-redir translations.
+      Hence: */
+   vassert(disp_cp_xindir != NULL);
+
+   /* Use ptmp for backpatching conditional jumps. */
+   UChar *ptmp = buf;
+
+   /* First off, if this is conditional, create a conditional
+      jump over the rest of it. */
+   s390_cc_t cond = insn->variant.xindir.cond;
+
+   if (cond != S390_CC_ALWAYS) {
+      /* So we have something like this
+         if (cond) do_xindir;
+         Y: ...
+         We convert this into
+         if (! cond) goto Y;        // BRC opcode; 4 bytes
+         do_xindir;
+         Y:
+      */
+      /* 4 bytes (a BRC insn) to be filled in here */
+      buf += 4;
+   }
+
+   /* Update the guest IA with the address in xindir.dst. */
+   const s390_amode *amode = insn->variant.xindir.guest_IA;
+
+   vassert(amode->tag == S390_AMODE_B12 || amode->tag == S390_AMODE_BX12);
+   UInt b = hregNumber(amode->b);
+   UInt x = hregNumber(amode->x);  /* 0 for B12 and B20 */
+   UInt d = amode->d;
+   UInt regno = hregNumber(insn->variant.xindir.dst);
+
+   buf = s390_emit_STG(buf, regno, x, b, DISP20(d));
+
+   /* load tchain_scratch, #disp_cp_xindir */
+   buf = s390_tchain_load64(buf, S390_REGNO_TCHAIN_SCRATCH,
+                            Ptr_to_ULong(disp_cp_xindir));
+   /* BR *tchain_scratch */
+   buf = s390_emit_BCR(buf, S390_CC_ALWAYS, S390_REGNO_TCHAIN_SCRATCH);
+
+   /* Fix up the conditional jump, if there was one. */
+   if (cond != S390_CC_ALWAYS) {
+      Int delta = buf - ptmp;
+
+      delta >>= 1;  /* immediate constant is #half-words */
+      vassert(delta > 0 && delta < (1 << 16));
+      s390_emit_BRC(ptmp, s390_cc_invert(cond), delta);
+   }
+
+   return buf;
+}
+
+static UChar *
+s390_insn_xassisted_emit(UChar *buf, const s390_insn *insn,
+                         void *disp_cp_xassisted)
+{
+   /* Use ptmp for backpatching conditional jumps. */
+   UChar *ptmp = buf;
+
+   /* First off, if this is conditional, create a conditional
+      jump over the rest of it. */
+   s390_cc_t cond = insn->variant.xassisted.cond;
+
+   if (cond != S390_CC_ALWAYS) {
+      /* So we have something like this
+         if (cond) do_xassisted;
+         Y: ...
+         We convert this into
+         if (! cond) goto Y;        // BRC opcode; 4 bytes
+         do_xassisted;
+         Y:
+      */
+      /* 4 bytes (a BRC insn) to be filled in here */
+      buf += 4;
+   }
+
+   /* Update the guest IA with the address in xassisted.dst. */
+   const s390_amode *amode = insn->variant.xassisted.guest_IA;
+
+   vassert(amode->tag == S390_AMODE_B12 || amode->tag == S390_AMODE_BX12);
+   UInt b = hregNumber(amode->b);
+   UInt x = hregNumber(amode->x);  /* 0 for B12 and B20 */
+   UInt d = amode->d;
+   UInt regno = hregNumber(insn->variant.xassisted.dst);
+
+   buf = s390_emit_STG(buf, regno, x, b, DISP20(d));
+
+   UInt trcval = 0;
+
+   switch (insn->variant.xassisted.kind) {
+   case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
+   case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
+   case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
+   case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
+   case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
+   case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
+   case Ijk_TInval:      trcval = VEX_TRC_JMP_TINVAL;      break;
+   case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
+   case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
+   case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
+   case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
+      /* We don't expect to see the following being assisted. */
+   case Ijk_Ret:
+   case Ijk_Call:
+      /* fallthrough */
+   default: 
+      ppIRJumpKind(insn->variant.xassisted.kind);
+      vpanic("s390_insn_xassisted_emit: unexpected jump kind");
+   }
+
+   vassert(trcval != 0);
+
+   /* guest_state_pointer = trcval */
+   buf = s390_emit_LGHI(buf, S390_REGNO_GUEST_STATE_POINTER, trcval);
+
+   /* load tchain_scratch, #disp_cp_xassisted */
+   buf = s390_tchain_load64(buf, S390_REGNO_TCHAIN_SCRATCH,
+                            Ptr_to_ULong(disp_cp_xassisted));
+
+   /* BR *tchain_scratch */
+   buf = s390_emit_BCR(buf, S390_CC_ALWAYS, S390_REGNO_TCHAIN_SCRATCH);
+
+   /* Fix up the conditional jump, if there was one. */
+   if (cond != S390_CC_ALWAYS) {
+      Int delta = buf - ptmp;
+
+      delta >>= 1;  /* immediate constant is #half-words */
+      vassert(delta > 0 && delta < (1 << 16));
+      s390_emit_BRC(ptmp, s390_cc_invert(cond), delta);
+   }
+
+   return buf;
+}
+
+
+/* Pseudo code:
+
+   guest_state[host_EvC_COUNTER] -= 1;
+   if (guest_state[host_EvC_COUNTER] >= 0) goto nofail;
+   goto guest_state[host_EvC_FAILADDR];
+   nofail: ;
+
+   The dispatch counter is a 32-bit value. */
+static UChar *
+s390_insn_evcheck_emit(UChar *buf, const s390_insn *insn)
+{
+   s390_amode *amode;
+   UInt b, x, d;
+   UChar *code_begin, *code_end;
+
+   code_begin = buf;
+
+   amode = insn->variant.evcheck.counter;
+   vassert(amode->tag == S390_AMODE_B12 || amode->tag == S390_AMODE_BX12);
+   b = hregNumber(amode->b);
+   x = hregNumber(amode->x);  /* 0 for B12 and B20 */
+   d = amode->d;
+
+   /* Decrement the dispatch counter in the guest state */
+   /* fixs390: ASI if available */
+   buf = s390_emit_LHI(buf, R0, -1);             /* 4 bytes */
+   buf = s390_emit_A(buf, R0, x, b, d);          /* 4 bytes */
+   buf = s390_emit_ST(buf, R0, x, b, d);         /* 4 bytes */
+
+   /* Jump over the next insn if >= 0 */
+   buf = s390_emit_BRC(buf, S390_CC_HE, (4 + 6 + 2) / 2);  /* 4 bytes */
+
+   /* Computed goto to fail_address */
+   amode = insn->variant.evcheck.fail_addr;
+   b = hregNumber(amode->b);
+   x = hregNumber(amode->x);  /* 0 for B12 and B20 */
+   d = amode->d;
+   buf = s390_emit_LG(buf, S390_REGNO_TCHAIN_SCRATCH, x, b, DISP20(d));  /* 6 bytes */
+   buf = s390_emit_BCR(buf, S390_CC_ALWAYS, S390_REGNO_TCHAIN_SCRATCH);  /* 2 bytes */
+
+   code_end = buf;
+   
+   /* Make sure the size of the generated code is identical to the size
+      returned by evCheckSzB_S390 */
+   vassert(evCheckSzB_S390() == code_end - code_begin);
+
+   return buf;
+}
+
+
+static UChar *
+s390_insn_profinc_emit(UChar *buf,
+                       const s390_insn *insn __attribute__((unused)))
+{
+   /* Generate a code template to increment a memory location whose
+      address will be known later as an immediate value. This code
+      template will be patched once the memory location is known.
+      For now we do this with address == 0. */
+   buf = s390_tchain_load64(buf, S390_REGNO_TCHAIN_SCRATCH, 0);
+   buf = s390_emit_LGHI(buf, R0, 1);
+   buf = s390_emit_AG( buf, R0, 0, S390_REGNO_TCHAIN_SCRATCH, DISP20(0));
+   buf = s390_emit_STG(buf, R0, 0, S390_REGNO_TCHAIN_SCRATCH, DISP20(0));
+
+   return buf;
+}
+
+
 Int
-emit_S390Instr(UChar *buf, Int nbuf, s390_insn *insn, Bool mode64,
-               void *dispatch_unassisted, void *dispatch_assisted)
+emit_S390Instr(Bool *is_profinc, UChar *buf, Int nbuf, s390_insn *insn,
+               Bool mode64, void *disp_cp_chain_me_to_slowEP,
+               void *disp_cp_chain_me_to_fastEP, void *disp_cp_xindir,
+               void *disp_cp_xassisted)
 {
    UChar *end;
 
@@ -7230,12 +7677,6 @@
       end = s390_insn_compare_emit(buf, insn);
       break;
 
-   case S390_INSN_BRANCH:
-      vassert(dispatch_unassisted == NULL);
-      vassert(dispatch_assisted == NULL);
-      end = s390_insn_branch_emit(buf, insn);
-      break;
-
    case S390_INSN_HELPER_CALL:
       end = s390_insn_helper_call_emit(buf, insn);
       break;
@@ -7288,6 +7729,30 @@
       end = s390_insn_gadd_emit(buf, insn);
       break;
 
+   case S390_INSN_PROFINC:
+      end = s390_insn_profinc_emit(buf, insn);
+      /* Tell the caller .. */
+      vassert(*is_profinc == False);
+      *is_profinc = True;
+      break;
+
+   case S390_INSN_EVCHECK:
+      end = s390_insn_evcheck_emit(buf, insn);
+      break;
+
+   case S390_INSN_XDIRECT:
+      end = s390_insn_xdirect_emit(buf, insn, disp_cp_chain_me_to_slowEP,
+                                   disp_cp_chain_me_to_fastEP);
+      break;
+
+   case S390_INSN_XINDIR:
+      end = s390_insn_xindir_emit(buf, insn, disp_cp_xindir);
+      break;
+
+   case S390_INSN_XASSISTED:
+      end = s390_insn_xassisted_emit(buf, insn, disp_cp_xassisted);
+      break;
+
    default:
       vpanic("emit_S390Instr");
    }
@@ -7298,6 +7763,168 @@
 }
 
 
+/* Return the number of bytes emitted for an S390_INSN_EVCHECK.
+   See s390_insn_evcheck_emit */
+Int
+evCheckSzB_S390(void)
+{
+   return 24;
+}
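(That is LHI (4) + A (4) + ST (4) + BRC (4) + LG (6) + BCR (2) = 24
bytes, matching the per-instruction size comments in
s390_insn_evcheck_emit.)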
+
+
+/* Patch the counter address into CODE_TO_PATCH as previously
+   generated by s390_insn_profinc_emit. */
+VexInvalRange
+patchProfInc_S390(void *code_to_patch, ULong *location_of_counter)
+{
+   vassert(sizeof(ULong *) == 8);
+
+   s390_tchain_verify_load64(code_to_patch, S390_REGNO_TCHAIN_SCRATCH, 0);
+
+   s390_tchain_patch_load64(code_to_patch, Ptr_to_ULong(location_of_counter));
+
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   s390_insn_xdirect_emit code above. */
+VexInvalRange
+chainXDirect_S390(void *place_to_chain,
+                  void *disp_cp_chain_me_EXPECTED,
+                  void *place_to_jump_to)
+{
+   /* What we're expecting to see @ PLACE_TO_CHAIN is:
+
+        load tchain_scratch, #disp_cp_chain_me_EXPECTED
+        BASR  1,S390_REGNO_TCHAIN_SCRATCH
+   */
+   const UChar *next;
+   next = s390_tchain_verify_load64(place_to_chain, S390_REGNO_TCHAIN_SCRATCH,
+                                    Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
+   vassert(s390_insn_is_BASR(next, 1, S390_REGNO_TCHAIN_SCRATCH));
+
+   /* And what we want to change it to is either:
+        (general case):
+
+          load tchain_scratch, #place_to_jump_to
+          BR  *tchain_scratch
+
+      ---OR---
+
+        in the case where the displacement is small enough
+
+          BRCL delta       where delta is in half-words
+          invalid opcodes
+
+      In both cases the replacement has the same length as the original.
+      To remain sane & verifiable,
+      (1) limit the displacement for the short form to 
+          (say) +/- one billion, so as to avoid wraparound
+          off-by-ones
+      (2) even if the short form is applicable, once every (say)
+          1024 times use the long form anyway, so as to maintain
+          verifiability
+   */
+
+   /* This is the delta we need to put into a BRCL insn. Note, that the
+      offset in BRCL is in half-words. Hence division by 2. */
+   Long delta = (Long)((UChar *)place_to_jump_to - (UChar *)place_to_chain) / 2;
+   Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
+
+   static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
+   if (shortOK) {
+      shortCTR++; // thread safety bleh
+      if (0 == (shortCTR & 0x3FF)) {
+         shortOK = False;
+         if (0)
+            vex_printf("QQQ chainXDirect_S390: shortCTR = %u, "
+                       "using long jmp\n", shortCTR);
+      }
+   }
+
+   /* And make the modifications. */
+   UChar *p = (UChar *)place_to_chain;
+   if (shortOK) {
+      p = s390_emit_BRCL(p, S390_CC_ALWAYS, delta);  /* 6 bytes */
+
+      /* Make sure that BRCL fits into the patchable part of an xdirect
+         code sequence. */
+      vassert(6 <= s390_xdirect_patchable_len());
+
+      /* Fill remaining bytes with 0x00 (invalid opcode) */
+      Int i;
+      for (i = 0; i < s390_xdirect_patchable_len() - 6; ++i)
+         p[i] = 0x00;
+   } else {
+      /*
+          load tchain_scratch, #place_to_jump_to
+          BR  *tchain_scratch
+      */
+      ULong addr = Ptr_to_ULong(place_to_jump_to);
+      p = s390_tchain_load64(p, S390_REGNO_TCHAIN_SCRATCH, addr);
+      s390_emit_BCR(p, S390_CC_ALWAYS, S390_REGNO_TCHAIN_SCRATCH);
+   }
+
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
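As an aside on the arithmetic above: BRCL takes its displacement in half-words, so the byte distance between the two code addresses is halved before being range-checked. A standalone sketch, with addresses invented purely for illustration:

   /* Sketch only -- not part of the patch. */
   #include <assert.h>
   int main(void)
   {
      unsigned long place_to_chain   = 0x1000UL;   /* assumed */
      unsigned long place_to_jump_to = 0x2468UL;   /* assumed */
      long delta = ((long)(place_to_jump_to - place_to_chain)) / 2;
      assert(delta == 0xA34);   /* 2612 half-words == 5224 bytes */
      return 0;
   }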
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   s390_insn_xdirect_emit code above. */
+VexInvalRange
+unchainXDirect_S390(void *place_to_unchain,
+                    void *place_to_jump_to_EXPECTED,
+                    void *disp_cp_chain_me)
+{
+   /* What we're expecting to see @ PLACE_TO_UNCHAIN:
+
+          load tchain_scratch, #place_to_jump_to_EXPECTED
+          BR  *tchain_scratch
+
+      ---OR---
+        in the case where the displacement falls within 32 bits
+
+          BRCL delta
+          invalid opcodes
+   */
+   UChar *p = place_to_unchain;
+
+   if (s390_insn_is_BRCL(p, S390_CC_ALWAYS)) {
+      /* Looks like the short form */
+      Int num_hw = *(Int *)&p[2];
+      Int delta = 2 * num_hw;
+
+      vassert(p + delta == place_to_jump_to_EXPECTED);
+
+      Int i;
+      for (i = 0; i < s390_xdirect_patchable_len() - 6; ++i)
+         vassert(p[6+i] == 0x00);
+   } else {
+      /* Should be the long form */
+      const UChar *next;
+
+      next = s390_tchain_verify_load64(p, S390_REGNO_TCHAIN_SCRATCH,
+                                       Ptr_to_ULong(place_to_jump_to_EXPECTED));
+      /* Check for BR *tchain_scratch */
+      vassert(s390_insn_is_BR(next, S390_REGNO_TCHAIN_SCRATCH));
+   }
+
+   /* And what we want to change it to is:
+
+        load  tchain_scratch, #disp_cp_chain_me
+        call *tchain_scratch
+   */
+   ULong addr = Ptr_to_ULong(disp_cp_chain_me);
+   p = s390_tchain_load64(p, S390_REGNO_TCHAIN_SCRATCH, addr);
+   s390_emit_BASR(p, 1, S390_REGNO_TCHAIN_SCRATCH);
+
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
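The recognizers used above (s390_insn_is_BRCL and friends) are defined elsewhere in this file. As a rough sketch of what the BRCL-always test amounts to -- assuming the RIL-format encoding, where the first byte is the 0xC0 opcode and the second byte packs the condition mask (0xF for "always") above the 0x4 sub-opcode:

   /* Hypothetical illustration only; the real check is
      s390_insn_is_BRCL(p, S390_CC_ALWAYS). */
   static int looks_like_brcl_always(const unsigned char *p)
   {
      return p[0] == 0xC0 && p[1] == 0xF4;
   }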
+
 /*---------------------------------------------------------------*/
 /*--- end                                    host_s390_defs.c ---*/
 /*---------------------------------------------------------------*/
diff --git a/priv/host_s390_defs.h b/priv/host_s390_defs.h
index 8b75486..ad99c4f 100644
--- a/priv/host_s390_defs.h
+++ b/priv/host_s390_defs.h
@@ -130,7 +130,6 @@
    S390_INSN_TEST,   /* test operand and set cc */
    S390_INSN_CC2BOOL,/* convert condition code to 0/1 */
    S390_INSN_COMPARE,
-   S390_INSN_BRANCH, /* un/conditional goto */
    S390_INSN_HELPER_CALL,
    S390_INSN_CAS,    /* compare and swap */
    S390_INSN_BFP_BINOP, /* Binary floating point 32-bit / 64-bit */
@@ -144,7 +143,13 @@
    S390_INSN_BFP128_CONVERT_FROM,
    S390_INSN_MFENCE,
    S390_INSN_GZERO,   /* Assign zero to a guest register */
-   S390_INSN_GADD     /* Add a value to a guest register */
+   S390_INSN_GADD,    /* Add a value to a guest register */
+   /* The following 5 insns are mandated by translation chaining */
+   S390_INSN_XDIRECT,     /* direct transfer to guest address */
+   S390_INSN_XINDIR,      /* indirect transfer to guest address */
+   S390_INSN_XASSISTED,   /* assisted transfer to guest address */
+   S390_INSN_EVCHECK,     /* Event check */
+   S390_INSN_PROFINC      /* 64-bit profile counter increment */
 } s390_insn_tag;
 
 
@@ -338,11 +343,6 @@
          HReg        op3;
          HReg        old_mem;
       } cas;
-      struct {
-         IRJumpKind    kind;
-         s390_cc_t     cond;
-         s390_opnd_RMI dst;
-      } branch;
       /* Pseudo-insn for representing a helper call.
          TARGET is the absolute address of the helper function
          NUM_ARGS says how many arguments are being passed.
@@ -407,6 +407,44 @@
          UChar            delta;
          ULong            value;  /* for debugging only */
       } gadd;
+
+      /* The next 5 entries are generic to support translation chaining */
+
+      /* Update the guest IA value, then exit requesting to chain
+         to it.  May be conditional. */
+      struct {
+         s390_cc_t     cond;
+         Bool          to_fast_entry;  /* chain to which entry point? */
+         Addr64        dst;            /* next guest address */
+         s390_amode   *guest_IA;
+      } xdirect;
+      /* Boring transfer to a guest address not known at JIT time.
+         Not chainable.  May be conditional. */
+      struct {
+         s390_cc_t     cond;
+         HReg          dst;
+         s390_amode   *guest_IA;
+      } xindir;
+      /* Assisted transfer to a guest address, most general case.
+         Not chainable.  May be conditional. */
+      struct {
+         s390_cc_t     cond;
+         IRJumpKind    kind;
+         HReg          dst;
+         s390_amode   *guest_IA;
+      } xassisted;
+      struct {
+         /* fixs390: I don't think these are really needed,
+            as the gsp and the offsets are fixed, no? */
+         s390_amode   *counter;    /* dispatch counter */
+         s390_amode   *fail_addr;
+      } evcheck;
+      struct {
+         /* No fields.  The address of the counter to increment is
+            installed later, post-translation, by patching it in,
+            as it is not known at translation time. */
+      } profinc;
+
    } variant;
 } s390_insn;
 
@@ -433,7 +471,6 @@
 s390_insn *s390_insn_test(UChar size, s390_opnd_RMI src);
 s390_insn *s390_insn_compare(UChar size, HReg dst, s390_opnd_RMI opnd,
                              Bool signed_comparison);
-s390_insn *s390_insn_branch(IRJumpKind jk, s390_cc_t cond, s390_opnd_RMI dst);
 s390_insn *s390_insn_helper_call(s390_cc_t cond, Addr64 target, UInt num_args,
                                  HChar *name);
 s390_insn *s390_insn_bfp_triop(UChar size, s390_bfp_triop_t, HReg dst, HReg op2,
@@ -460,6 +497,15 @@
 s390_insn *s390_insn_gzero(UChar size, UInt offset);
 s390_insn *s390_insn_gadd(UChar size, UInt offset, UChar delta, ULong value);
 
+/* Five for translation chaining */
+s390_insn *s390_insn_xdirect(s390_cc_t cond, Addr64 dst, s390_amode *guest_IA,
+                             Bool to_fast_entry);
+s390_insn *s390_insn_xindir(s390_cc_t cond, HReg dst, s390_amode *guest_IA);
+s390_insn *s390_insn_xassisted(s390_cc_t cond, HReg dst, s390_amode *guest_IA,
+                               IRJumpKind kind);
+s390_insn *s390_insn_evcheck(s390_amode *counter, s390_amode *fail_addr);
+s390_insn *s390_insn_profinc(void);
+
 const HChar *s390_insn_as_string(const s390_insn *);
 
 /*--------------------------------------------------------*/
@@ -475,13 +521,30 @@
 void  getRegUsage_S390Instr( HRegUsage *, s390_insn *, Bool );
 void  mapRegs_S390Instr    ( HRegRemap *, s390_insn *, Bool );
 Bool  isMove_S390Instr     ( s390_insn *, HReg *, HReg * );
-Int   emit_S390Instr       ( UChar *, Int, s390_insn *, Bool,
-                             void *, void * );
+Int   emit_S390Instr       ( Bool *, UChar *, Int, s390_insn *, Bool,
+                             void *, void *, void *, void *);
 void  getAllocableRegs_S390( Int *, HReg **, Bool );
 void  genSpill_S390        ( HInstr **, HInstr **, HReg , Int , Bool );
 void  genReload_S390       ( HInstr **, HInstr **, HReg , Int , Bool );
 s390_insn *directReload_S390 ( s390_insn *, HReg, Short );
-HInstrArray *iselSB_S390   ( IRSB *, VexArch, VexArchInfo *, VexAbiInfo * );
+HInstrArray *iselSB_S390   ( IRSB *, VexArch, VexArchInfo *, VexAbiInfo *,
+                             Int, Int, Bool, Bool, Addr64);
+
+/* Return the number of bytes of code needed for an event check */
+Int evCheckSzB_S390(void);
+
+/* Perform a chaining and unchaining of an XDirect jump. */
+VexInvalRange chainXDirect_S390(void *place_to_chain,
+                                void *disp_cp_chain_me_EXPECTED,
+                                void *place_to_jump_to);
+
+VexInvalRange unchainXDirect_S390(void *place_to_unchain,
+                                  void *place_to_jump_to_EXPECTED,
+                                  void *disp_cp_chain_me);
+
+/* Patch the counter location into an existing ProfInc point. */
+VexInvalRange patchProfInc_S390(void  *code_to_patch,
+                                ULong *location_of_counter);
 
 /* KLUDGE: See detailed comment in host_s390_defs.c. */
 extern const VexArchInfo *s390_archinfo_host;
diff --git a/priv/host_s390_isel.c b/priv/host_s390_isel.c
index a2217d4..9400012 100644
--- a/priv/host_s390_isel.c
+++ b/priv/host_s390_isel.c
@@ -69,6 +69,18 @@
     - The host subarchitecture we are selecting insns for.
       This is set at the start and does not change.
 
+   - A Bool for indicating whether we may generate chain-me
+     instructions for control flow transfers, or whether we must use
+     XAssisted.
+
+   - The maximum guest address of any guest insn in this block.
+     Actually, the address of the highest-addressed byte from any insn
+     in this block.  Is set at the start and does not change.  This is
+     used for detecting jumps which are definitely forward-edges from
+     this block, and therefore can be made (chained) to the fast entry
+     point of the destination, thereby avoiding the destination's
+     event check.
+
     - A flag to indicate whether the guest IA has been assigned to.
 
     - Values of certain guest registers which are often assigned constants.
@@ -92,16 +104,19 @@
 typedef struct {
    IRTypeEnv   *type_env;
 
+   HInstrArray *code;
    HReg        *vregmap;
    HReg        *vregmapHI;
    UInt         n_vregmap;
-
-   HInstrArray *code;
+   UInt         vreg_ctr;
+   UInt         hwcaps;
 
    ULong        old_value[NUM_TRACKED_REGS];
-   UInt         vreg_ctr;
 
-   UInt         hwcaps;
+   /* The next two are for translation chaining */
+   Addr64       max_ga;
+   Bool         chaining_allowed;
+
    Bool         first_IA_assignment;
    Bool         old_value_valid[NUM_TRACKED_REGS];
 } ISelEnv;
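The max_ga / chaining_allowed pair documented above drives the forward-edge test in the Ist_Exit and iselNext cases below. The safety argument: any loop must contain at least one backwards edge, and backwards edges are always chained to the slow (checked) entry point, so letting forward edges skip the destination's event check cannot create a check-free cycle. Boiled down to a standalone sketch (names assumed, not part of the patch):

   static int is_forward_edge(unsigned long long dst_ga,
                              unsigned long long max_ga)
   {
      /* dst strictly beyond the last byte of this block
         => provably a forwards edge */
      return dst_ga > max_ga;
   }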
@@ -2437,17 +2452,56 @@
 
       /* --------- EXIT --------- */
    case Ist_Exit: {
-      s390_opnd_RMI dst;
       s390_cc_t cond;
       IRConstTag tag = stmt->Ist.Exit.dst->tag;
 
       if (tag != Ico_U64)
          vpanic("s390_isel_stmt: Ist_Exit: dst is not a 64-bit value");
 
-      dst  = s390_isel_int_expr_RMI(env, IRExpr_Const(stmt->Ist.Exit.dst));
+      s390_amode *guest_IA = s390_amode_for_guest_state(stmt->Ist.Exit.offsIP);
       cond = s390_isel_cc(env, stmt->Ist.Exit.guard);
-      addInstr(env, s390_insn_branch(stmt->Ist.Exit.jk, cond, dst));
-      return;
+
+      /* Case: boring transfer to known address */
+      if (stmt->Ist.Exit.jk == Ijk_Boring) {
+         if (env->chaining_allowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool to_fast_entry
+               = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
+            if (0) vex_printf("%s", to_fast_entry ? "Y" : ",");
+            addInstr(env, s390_insn_xdirect(cond, stmt->Ist.Exit.dst->Ico.U64,
+                                            guest_IA, to_fast_entry));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg dst = s390_isel_int_expr(env,
+                                          IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, s390_insn_xassisted(cond, dst, guest_IA, Ijk_Boring));
+         }
+         return;
+      }
+
+      /* Case: assisted transfer to arbitrary address */
+      switch (stmt->Ist.Exit.jk) {
+      case Ijk_TInval:
+      case Ijk_Sys_syscall:
+      case Ijk_ClientReq:
+      case Ijk_NoRedir:
+      case Ijk_Yield:
+      case Ijk_SigTRAP: {
+         HReg dst = s390_isel_int_expr(env, IRExpr_Const(stmt->Ist.Exit.dst));
+         addInstr(env, s390_insn_xassisted(cond, dst, guest_IA,
+                                           stmt->Ist.Exit.jk));
+         return;
+      }
+      default:
+         break;
+      }
+
+      /* Do we ever expect to see any other kind? */
+      goto stmt_fail;
    }
 
       /* --------- MEM FENCE --------- */
@@ -2484,20 +2538,80 @@
 /*---------------------------------------------------------*/
 
 static void
-iselNext(ISelEnv *env, IRExpr *next, IRJumpKind jk)
+iselNext(ISelEnv *env, IRExpr *next, IRJumpKind jk, Int offsIP)
 {
-   s390_opnd_RMI dst;
-
    if (vex_traceflags & VEX_TRACE_VCODE) {
-      vex_printf("\n-- goto {");
-      ppIRJumpKind(jk);
-      vex_printf("} ");
+      vex_printf("\n-- PUT(%d) = ", offsIP);
       ppIRExpr(next);
+      vex_printf("; exit-");
+      ppIRJumpKind(jk);
       vex_printf("\n");
    }
 
-   dst = s390_isel_int_expr_RMI(env, next);
-   addInstr(env, s390_insn_branch(jk, S390_CC_ALWAYS, dst));
+   s390_amode *guest_IA = s390_amode_for_guest_state(offsIP);
+
+   /* Case: boring transfer to known address */
+   if (next->tag == Iex_Const) {
+      IRConst *cdst = next->Iex.Const.con;
+      vassert(cdst->tag == Ico_U64);
+      if (jk == Ijk_Boring || jk == Ijk_Call) {
+         /* Boring transfer to known address */
+         if (env->chaining_allowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool to_fast_entry
+               = ((Addr64)cdst->Ico.U64) > env->max_ga;
+            if (0) vex_printf("%s", to_fast_entry ? "X" : ".");
+            addInstr(env, s390_insn_xdirect(S390_CC_ALWAYS, cdst->Ico.U64,
+                                            guest_IA, to_fast_entry));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg dst = s390_isel_int_expr(env, next);
+            addInstr(env, s390_insn_xassisted(S390_CC_ALWAYS, dst, guest_IA,
+                                              Ijk_Boring));
+         }
+         return;
+      }
+   }
+
+   /* Case: call/return (==boring) transfer to any address */
+   switch (jk) {
+   case Ijk_Boring:
+   case Ijk_Ret:
+   case Ijk_Call: {
+      HReg dst = s390_isel_int_expr(env, next);
+      if (env->chaining_allowed) {
+         addInstr(env, s390_insn_xindir(S390_CC_ALWAYS, dst, guest_IA));
+      } else {
+         addInstr(env, s390_insn_xassisted(S390_CC_ALWAYS, dst, guest_IA,
+                                           Ijk_Boring));
+      }
+      return;
+   }
+   default:
+      break;
+   }
+
+   /* Case: some other kind of transfer to any address */
+   switch (jk) {
+   case Ijk_TInval:
+   case Ijk_Sys_syscall:
+   case Ijk_ClientReq:
+   case Ijk_NoRedir:
+   case Ijk_Yield:
+   case Ijk_SigTRAP: {
+      HReg dst = s390_isel_int_expr(env, next);
+      addInstr(env, s390_insn_xassisted(S390_CC_ALWAYS, dst, guest_IA, jk));
+      return;
+   }
+   default:
+      break;
+   }
+
+   vpanic("iselNext");
 }
 
 
@@ -2509,7 +2623,9 @@
 
 HInstrArray *
 iselSB_S390(IRSB *bb, VexArch arch_host, VexArchInfo *archinfo_host,
-             VexAbiInfo *vbi)
+            VexAbiInfo *vbi, Int offset_host_evcheck_counter,
+            Int offset_host_evcheck_fail_addr, Bool chaining_allowed,
+            Bool add_profinc, Addr64 max_ga)
 {
    UInt     i, j;
    HReg     hreg, hregHI;
@@ -2552,6 +2668,9 @@
    /* and finally ... */
    env->hwcaps    = hwcaps_host;
 
+   env->max_ga = max_ga;
+   env->chaining_allowed = chaining_allowed;
+
    /* For each IR temporary, allocate a suitably-kinded virtual
       register. */
    j = 0;
@@ -2595,12 +2714,26 @@
    }
    env->vreg_ctr = j;
 
+   /* The very first instruction must be an event check. */
+   s390_amode *counter, *fail_addr;
+   counter   = s390_amode_for_guest_state(offset_host_evcheck_counter);
+   fail_addr = s390_amode_for_guest_state(offset_host_evcheck_fail_addr);
+   addInstr(env, s390_insn_evcheck(counter, fail_addr));
+
+   /* Possibly a block counter increment (for profiling).  At this
+      point we don't know the address of the counter, so just pretend
+      it is zero.  It will have to be patched later, but before this
+      translation is used, by a call to LibVEX_patchProfInc. */
+   if (add_profinc) {
+      addInstr(env, s390_insn_profinc());
+   }
+
    /* Ok, finally we can iterate over the statements. */
    for (i = 0; i < bb->stmts_used; i++)
       if (bb->stmts[i])
          s390_isel_stmt(env, bb->stmts[i]);
 
-   iselNext(env, bb->next, bb->jumpkind);
+   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
 
    /* Record the number of vregs we used. */
    env->code->n_vregs = env->vreg_ctr;
diff --git a/priv/host_x86_defs.c b/priv/host_x86_defs.c
index 25848a3..4471f4d 100644
--- a/priv/host_x86_defs.c
+++ b/priv/host_x86_defs.c
@@ -647,12 +647,33 @@
    vassert(regparms >= 0 && regparms <= 3);
    return i;
 }
-X86Instr* X86Instr_Goto ( IRJumpKind jk, X86CondCode cond, X86RI* dst ) {
-   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
-   i->tag           = Xin_Goto;
-   i->Xin.Goto.cond = cond;
-   i->Xin.Goto.dst  = dst;
-   i->Xin.Goto.jk   = jk;
+X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP,
+                             X86CondCode cond, Bool toFastEP ) {
+   X86Instr* i             = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag                  = Xin_XDirect;
+   i->Xin.XDirect.dstGA    = dstGA;
+   i->Xin.XDirect.amEIP    = amEIP;
+   i->Xin.XDirect.cond     = cond;
+   i->Xin.XDirect.toFastEP = toFastEP;
+   return i;
+}
+X86Instr* X86Instr_XIndir ( HReg dstGA, X86AMode* amEIP,
+                            X86CondCode cond ) {
+   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag              = Xin_XIndir;
+   i->Xin.XIndir.dstGA = dstGA;
+   i->Xin.XIndir.amEIP = amEIP;
+   i->Xin.XIndir.cond  = cond;
+   return i;
+}
+X86Instr* X86Instr_XAssisted ( HReg dstGA, X86AMode* amEIP,
+                               X86CondCode cond, IRJumpKind jk ) {
+   X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag                 = Xin_XAssisted;
+   i->Xin.XAssisted.dstGA = dstGA;
+   i->Xin.XAssisted.amEIP = amEIP;
+   i->Xin.XAssisted.cond  = cond;
+   i->Xin.XAssisted.jk    = jk;
    return i;
 }
 X86Instr* X86Instr_CMov32  ( X86CondCode cond, X86RM* src, HReg dst ) {
@@ -797,7 +818,6 @@
    i->Xin.FpCmp.dst  = dst;
    return i;
 }
-
 X86Instr* X86Instr_SseConst ( UShort con, HReg dst ) {
    X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
    i->tag                 = Xin_SseConst;
@@ -886,6 +906,19 @@
    vassert(order >= 0 && order <= 0xFF);
    return i;
 }
+X86Instr* X86Instr_EvCheck ( X86AMode* amCounter,
+                             X86AMode* amFailAddr ) {
+   X86Instr* i               = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag                    = Xin_EvCheck;
+   i->Xin.EvCheck.amCounter  = amCounter;
+   i->Xin.EvCheck.amFailAddr = amFailAddr;
+   return i;
+}
+X86Instr* X86Instr_ProfInc ( void ) {
+   X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag      = Xin_ProfInc;
+   return i;
+}
 
 void ppX86Instr ( X86Instr* i, Bool mode64 ) {
    vassert(mode64 == False);
@@ -953,24 +986,36 @@
                     i->Xin.Call.regparms);
          vex_printf("0x%x", i->Xin.Call.target);
          break;
-      case Xin_Goto:
-         if (i->Xin.Goto.cond != Xcc_ALWAYS) {
-            vex_printf("if (%%eflags.%s) { ", 
-                       showX86CondCode(i->Xin.Goto.cond));
-	 }
-         if (i->Xin.Goto.jk != Ijk_Boring
-             && i->Xin.Goto.jk != Ijk_Call
-             && i->Xin.Goto.jk != Ijk_Ret) {
-            vex_printf("movl $");
-            ppIRJumpKind(i->Xin.Goto.jk);
-            vex_printf(",%%ebp ; ");
-         }
+      case Xin_XDirect:
+         vex_printf("(xDirect) ");
+         vex_printf("if (%%eflags.%s) { ",
+                    showX86CondCode(i->Xin.XDirect.cond));
+         vex_printf("movl $0x%x,", i->Xin.XDirect.dstGA);
+         ppX86AMode(i->Xin.XDirect.amEIP);
+         vex_printf("; ");
+         vex_printf("movl $disp_cp_chain_me_to_%sEP,%%edx; call *%%edx }",
+                    i->Xin.XDirect.toFastEP ? "fast" : "slow");
+         return;
+      case Xin_XIndir:
+         vex_printf("(xIndir) ");
+         vex_printf("if (%%eflags.%s) { movl ",
+                    showX86CondCode(i->Xin.XIndir.cond));
+         ppHRegX86(i->Xin.XIndir.dstGA);
+         vex_printf(",");
+         ppX86AMode(i->Xin.XIndir.amEIP);
+         vex_printf("; movl $disp_indir,%%edx; jmp *%%edx }");
+         return;
+      case Xin_XAssisted:
+         vex_printf("(xAssisted) ");
+         vex_printf("if (%%eflags.%s) { ",
+                    showX86CondCode(i->Xin.XAssisted.cond));
          vex_printf("movl ");
-         ppX86RI(i->Xin.Goto.dst);
-         vex_printf(",%%eax ; movl $dispatcher_addr,%%edx ; jmp *%%edx");
-         if (i->Xin.Goto.cond != Xcc_ALWAYS) {
-            vex_printf(" }");
-	 }
+         ppHRegX86(i->Xin.XAssisted.dstGA);
+         vex_printf(",");
+         ppX86AMode(i->Xin.XAssisted.amEIP);
+         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%ebp",
+                    (Int)i->Xin.XAssisted.jk);
+         vex_printf("; movl $disp_assisted,%%edx; jmp *%%edx }");
          return;
       case Xin_CMov32:
          vex_printf("cmov%s ", showX86CondCode(i->Xin.CMov32.cond));
@@ -1152,7 +1197,17 @@
          vex_printf(",");
          ppHRegX86(i->Xin.SseShuf.dst);
          return;
-
+      case Xin_EvCheck:
+         vex_printf("(evCheck) decl ");
+         ppX86AMode(i->Xin.EvCheck.amCounter);
+         vex_printf("; jns nofail; jmp *");
+         ppX86AMode(i->Xin.EvCheck.amFailAddr);
+         vex_printf("; nofail:");
+         return;
+      case Xin_ProfInc:
+         vex_printf("(profInc) addl $1,NotKnownYet; "
+                    "adcl $0,NotKnownYet+4");
+         return;
       default:
          vpanic("ppX86Instr");
    }
@@ -1258,16 +1313,21 @@
             address temporary, depending on the regparmness: 0==EAX,
             1==EDX, 2==ECX, 3==EDI. */
          return;
-      case Xin_Goto:
-         addRegUsage_X86RI(u, i->Xin.Goto.dst);
-         addHRegUse(u, HRmWrite, hregX86_EAX()); /* used for next guest addr */
-         addHRegUse(u, HRmWrite, hregX86_EDX()); /* used for dispatcher addr */
-         if (i->Xin.Goto.jk != Ijk_Boring
-             && i->Xin.Goto.jk != Ijk_Call
-             && i->Xin.Goto.jk != Ijk_Ret)
-            /* note, this is irrelevant since ebp is not actually
-               available to the allocator.  But still .. */
-            addHRegUse(u, HRmWrite, hregX86_EBP());
+      /* XDirect/XIndir/XAssisted are also a bit subtle.  They
+         conditionally exit the block.  Hence we only need to list (1)
+         the registers that they read, and (2) the registers that they
+         write in the case where the block is not exited.  (2) is
+         empty, hence only (1) is relevant here. */
+      case Xin_XDirect:
+         addRegUsage_X86AMode(u, i->Xin.XDirect.amEIP);
+         return;
+      case Xin_XIndir:
+         addHRegUse(u, HRmRead, i->Xin.XIndir.dstGA);
+         addRegUsage_X86AMode(u, i->Xin.XIndir.amEIP);
+         return;
+      case Xin_XAssisted:
+         addHRegUse(u, HRmRead, i->Xin.XAssisted.dstGA);
+         addRegUsage_X86AMode(u, i->Xin.XAssisted.amEIP);
          return;
       case Xin_CMov32:
          addRegUsage_X86RM(u, i->Xin.CMov32.src, HRmRead);
@@ -1410,6 +1470,15 @@
          addHRegUse(u, HRmRead,  i->Xin.SseShuf.src);
          addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
          return;
+      case Xin_EvCheck:
+         /* We expect both amodes only to mention %ebp, so this is in
+            fact pointless, since %ebp isn't allocatable, but anyway.. */
+         addRegUsage_X86AMode(u, i->Xin.EvCheck.amCounter);
+         addRegUsage_X86AMode(u, i->Xin.EvCheck.amFailAddr);
+         return;
+      case Xin_ProfInc:
+         /* does not use any registers. */
+         return;
       default:
          ppX86Instr(i, False);
          vpanic("getRegUsage_X86Instr");
@@ -1462,8 +1531,16 @@
          return;
       case Xin_Call:
          return;
-      case Xin_Goto:
-         mapRegs_X86RI(m, i->Xin.Goto.dst);
+      case Xin_XDirect:
+         mapRegs_X86AMode(m, i->Xin.XDirect.amEIP);
+         return;
+      case Xin_XIndir:
+         mapReg(m, &i->Xin.XIndir.dstGA);
+         mapRegs_X86AMode(m, i->Xin.XIndir.amEIP);
+         return;
+      case Xin_XAssisted:
+         mapReg(m, &i->Xin.XAssisted.dstGA);
+         mapRegs_X86AMode(m, i->Xin.XAssisted.amEIP);
          return;
       case Xin_CMov32:
          mapRegs_X86RM(m, i->Xin.CMov32.src);
@@ -1566,6 +1643,16 @@
          mapReg(m, &i->Xin.SseShuf.src);
          mapReg(m, &i->Xin.SseShuf.dst);
          return;
+      case Xin_EvCheck:
+         /* We expect both amodes only to mention %ebp, so this is in
+            fact pointless, since %ebp isn't allocatable, but anyway.. */
+         mapRegs_X86AMode(m, i->Xin.EvCheck.amCounter);
+         mapRegs_X86AMode(m, i->Xin.EvCheck.amFailAddr);
+         return;
+      case Xin_ProfInc:
+         /* does not use any registers. */
+         return;
+
       default:
          ppX86Instr(i, mode64);
          vpanic("mapRegs_X86Instr");
@@ -1986,12 +2073,17 @@
 
 /* Emit an instruction into buf and return the number of bytes used.
    Note that buf is not the insn's final place, and therefore it is
-   imperative to emit position-independent code. */
+   imperative to emit position-independent code.  If the emitted
+   instruction was a profiler inc, set *is_profInc to True, else
+   leave it unchanged. */
 
-Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i, 
+Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc,
+                    UChar* buf, Int nbuf, X86Instr* i, 
                     Bool mode64,
-                    void* dispatch_unassisted,
-                    void* dispatch_assisted )
+                    void* disp_cp_chain_me_to_slowEP,
+                    void* disp_cp_chain_me_to_fastEP,
+                    void* disp_cp_xindir,
+                    void* disp_cp_xassisted )
 {
    UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
 
@@ -2306,110 +2398,153 @@
       *p++ = toUChar(0xD0 + irno);
       goto done;
 
-   case Xin_Goto: {
-      void* dispatch_to_use = NULL;
-      vassert(dispatch_unassisted != NULL);
-      vassert(dispatch_assisted != NULL);
+   case Xin_XDirect: {
+      /* NB: what goes on here has to be very closely coordinated with the
+         chainXDirect_X86 and unchainXDirect_X86 below. */
+      /* We're generating chain-me requests here, so we need to be
+         sure this is actually allowed -- no-redir translations can't
+         use chain-me's.  Hence: */
+      vassert(disp_cp_chain_me_to_slowEP != NULL);
+      vassert(disp_cp_chain_me_to_fastEP != NULL);
 
       /* Use ptmp for backpatching conditional jumps. */
       ptmp = NULL;
 
       /* First off, if this is conditional, create a conditional
-	 jump over the rest of it. */
-      if (i->Xin.Goto.cond != Xcc_ALWAYS) {
+         jump over the rest of it. */
+      if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
          /* jmp fwds if !condition */
-         *p++ = toUChar(0x70 + (0xF & (i->Xin.Goto.cond ^ 1)));
+         *p++ = toUChar(0x70 + (0xF & (i->Xin.XDirect.cond ^ 1)));
          ptmp = p; /* fill in this bit later */
          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
       }
 
-      /* If a non-boring, set %ebp (the guest state pointer)
-         appropriately.  Also, decide which dispatcher we need to
-         use. */
-      dispatch_to_use = dispatch_assisted;
+      /* Update the guest EIP. */
+      /* movl $dstGA, amEIP */
+      *p++ = 0xC7;
+      p    = doAMode_M(p, fake(0), i->Xin.XDirect.amEIP);
+      p    = emit32(p, i->Xin.XDirect.dstGA);
 
-      /* movl $magic_number, %ebp */
-      switch (i->Xin.Goto.jk) {
-         case Ijk_ClientReq: 
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_CLIENTREQ); break;
-         case Ijk_Sys_int128:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SYS_INT128); break;
-         case Ijk_Sys_int129:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SYS_INT129); break;
-         case Ijk_Sys_int130:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SYS_INT130); break;
-         case Ijk_Yield: 
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_YIELD); break;
-         case Ijk_EmWarn:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_EMWARN); break;
-         case Ijk_MapFail:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_MAPFAIL); break;
-         case Ijk_NoDecode:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_NODECODE); break;
-         case Ijk_TInval:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_TINVAL); break;
-         case Ijk_NoRedir:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_NOREDIR); break;
-         case Ijk_Sys_sysenter:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SYS_SYSENTER); break;
-         case Ijk_SigTRAP:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SIGTRAP); break;
-         case Ijk_SigSEGV:
-            *p++ = 0xBD;
-            p = emit32(p, VEX_TRC_JMP_SIGSEGV); break;
-         case Ijk_Ret:
-	 case Ijk_Call:
-         case Ijk_Boring:
-            dispatch_to_use = dispatch_unassisted;
-            break;
-         default: 
-            ppIRJumpKind(i->Xin.Goto.jk);
-            vpanic("emit_X86Instr.Xin_Goto: unknown jump kind");
-      }
-
-      /* Get the destination address into %eax */
-      if (i->Xin.Goto.dst->tag == Xri_Imm) {
-         /* movl $immediate, %eax */
-         *p++ = 0xB8;
-         p = emit32(p, i->Xin.Goto.dst->Xri.Imm.imm32);
-      } else {
-         vassert(i->Xin.Goto.dst->tag == Xri_Reg);
-         /* movl %reg, %eax */
-         if (i->Xin.Goto.dst->Xri.Reg.reg != hregX86_EAX()) {
-            *p++ = 0x89;
-            p = doAMode_R(p, i->Xin.Goto.dst->Xri.Reg.reg, hregX86_EAX());
-         }
-      }
-
-      /* Get the dispatcher address into %edx.  This has to happen
-         after the load of %eax since %edx might be carrying the value
-         destined for %eax immediately prior to this Xin_Goto. */
-      vassert(sizeof(UInt) == sizeof(void*));
-      vassert(dispatch_to_use != NULL);
-      /* movl $imm32, %edx */
+      /* --- FIRST PATCHABLE BYTE follows --- */
+      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
+         to) backs up the return address, so as to find the address of
+         the first patchable byte.  So: don't change the length of the
+         two instructions below. */
+      /* movl $disp_cp_chain_me_to_{slow,fast}EP,%edx; */
       *p++ = 0xBA;
-      p = emit32(p, (UInt)Ptr_to_ULong(dispatch_to_use));
+      void* disp_cp_chain_me
+               = i->Xin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP 
+                                         : disp_cp_chain_me_to_slowEP;
+      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_chain_me));
+      /* call *%edx */
+      *p++ = 0xFF;
+      *p++ = 0xD2;
+      /* --- END of PATCHABLE BYTES --- */
 
+      /* Fix up the conditional jump, if there was one. */
+      if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
+         Int delta = p - ptmp;
+         vassert(delta > 0 && delta < 40);
+         *ptmp = toUChar(delta-1);
+      }
+      goto done;
+   }
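The "don't change the length" constraint above exists because the chain-me stubs locate the patch site from their own return address. A hedged sketch of that recovery step (the stub itself lives on the Valgrind side, not in this patch):

   /* The patchable region is exactly 7 bytes:
         BA <imm32>   movl $disp_cp_chain_me_to_{slow,fast}EP,%edx
         FF D2        call *%edx
      "call *%edx" pushes the address of the byte after itself, so the
      stub can back up 7 bytes to find the first patchable byte. */
   static unsigned char* first_patchable_byte(unsigned char* retaddr)
   {
      return retaddr - 7;   /* 5 (movl $imm32,%edx) + 2 (call *%edx) */
   }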
+
+   case Xin_XIndir: {
+      /* We're generating transfers that could lead indirectly to a
+         chain-me, so we need to be sure this is actually allowed --
+         no-redir translations are not allowed to reach normal
+         translations without going through the scheduler.  That means
+         no XDirects or XIndirs out from no-redir translations.
+         Hence: */
+      vassert(disp_cp_xindir != NULL);
+
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* First off, if this is conditional, create a conditional
+         jump over the rest of it. */
+      if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
+         /* jmp fwds if !condition */
+         *p++ = toUChar(0x70 + (0xF & (i->Xin.XIndir.cond ^ 1)));
+         ptmp = p; /* fill in this bit later */
+         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+      }
+
+      /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
+      *p++ = 0x89;
+      p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
+
+      /* movl $disp_cp_xindir, %edx */
+      *p++ = 0xBA;
+      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xindir));
       /* jmp *%edx */
       *p++ = 0xFF;
       *p++ = 0xE2;
 
       /* Fix up the conditional jump, if there was one. */
-      if (i->Xin.Goto.cond != Xcc_ALWAYS) {
+      if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
          Int delta = p - ptmp;
-	 vassert(delta > 0 && delta < 20);
+         vassert(delta > 0 && delta < 40);
+         *ptmp = toUChar(delta-1);
+      }
+      goto done;
+   }
+
+   case Xin_XAssisted: {
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* First off, if this is conditional, create a conditional
+         jump over the rest of it. */
+      if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
+         /* jmp fwds if !condition */
+         *p++ = toUChar(0x70 + (0xF & (i->Xin.XAssisted.cond ^ 1)));
+         ptmp = p; /* fill in this bit later */
+         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+      }
+
+      /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
+      *p++ = 0x89;
+      p = doAMode_M(p, i->Xin.XAssisted.dstGA, i->Xin.XAssisted.amEIP);
+      /* movl $magic_number, %ebp. */
+      UInt trcval = 0;
+      switch (i->Xin.XAssisted.jk) {
+         case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
+         case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
+         case Ijk_Sys_int128:  trcval = VEX_TRC_JMP_SYS_INT128;  break;
+         case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
+         case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
+         case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
+         case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
+         case Ijk_TInval:      trcval = VEX_TRC_JMP_TINVAL;      break;
+         case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
+         case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
+         case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
+         case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
+         /* We don't expect to see the following being assisted. */
+         case Ijk_Ret:
+         case Ijk_Call:
+         /* fallthrough */
+         default: 
+            ppIRJumpKind(i->Xin.XAssisted.jk);
+            vpanic("emit_X86Instr.Xin_XAssisted: unexpected jump kind");
+      }
+      vassert(trcval != 0);
+      *p++ = 0xBD;
+      p = emit32(p, trcval);
+
+      /* movl $disp_cp_xassisted, %edx */
+      *p++ = 0xBA;
+      p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xassisted));
+      /* jmp *%edx */
+      *p++ = 0xFF;
+      *p++ = 0xE2;
+
+      /* Fix up the conditional jump, if there was one. */
+      if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
+         Int delta = p - ptmp;
+         vassert(delta > 0 && delta < 40);
          *ptmp = toUChar(delta-1);
       }
       goto done;
@@ -3088,6 +3223,63 @@
       *p++ = (UChar)(i->Xin.SseShuf.order);
       goto done;
 
+   case Xin_EvCheck: {
+      /* We generate:
+            (3 bytes)  decl 4(%ebp)    4 == offsetof(host_EvC_COUNTER)
+            (2 bytes)  jns  nofail     expected taken
+            (3 bytes)  jmp* 0(%ebp)    0 == offsetof(host_EvC_FAILADDR)
+            nofail:
+      */
+      /* This is heavily asserted re instruction lengths.  It needs to
+         be.  If we get given unexpected forms of .amCounter or
+         .amFailAddr -- basically, anything that's not of the form
+         uimm7(%ebp) -- they are likely to fail. */
+      /* Note also that after the decl we must be very careful not to
+         read the carry flag, else we get a partial flags stall.
+         js/jns avoids that, though. */
+      UChar* p0 = p;
+      /* ---  decl 4(%ebp) --- */
+      /* fake(1) because there's no register in this encoding;
+         instead the register field is used as a sub opcode.  The
+         encoding for "decl r/m32" is FF /1, hence the fake(1). */
+      *p++ = 0xFF;
+      p = doAMode_M(p, fake(1), i->Xin.EvCheck.amCounter);
+      vassert(p - p0 == 3);
+      /* --- jns nofail --- */
+      *p++ = 0x79;
+      *p++ = 0x03; /* need to check this 0x03 after the next insn */
+      vassert(p - p0 == 5);
+      /* --- jmp* 0(%ebp) --- */
+      /* The encoding is FF /4. */
+      *p++ = 0xFF;
+      p = doAMode_M(p, fake(4), i->Xin.EvCheck.amFailAddr);
+      vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
+      /* And crosscheck .. */
+      vassert(evCheckSzB_X86() == 8);
+      goto done;
+   }
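For concreteness, here is the assumed byte image of those 8 bytes when the amodes really are 4(%ebp) and 0(%ebp), as in the sketch at the top of this case:

   static const unsigned char evcheck_bytes[8] = {
      0xFF, 0x4D, 0x04,   /* decl 4(%ebp)  -- the counter  */
      0x79, 0x03,         /* jns  .+3      -- skip failure */
      0xFF, 0x65, 0x00    /* jmp  *0(%ebp) -- fail address */
   };

The decl drives the counter towards zero; once it goes negative, the jns falls through and control transfers to the stored fail address, i.e. back to the scheduler.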
+
+   case Xin_ProfInc: {
+      /* We generate   addl $1,NotKnownYet
+                       adcl $0,NotKnownYet+4
+         in the expectation that a later call to LibVEX_patchProfInc
+         will be used to fill in the immediate fields once the right
+         value is known.
+           83 05  00 00 00 00  01
+           83 15  00 00 00 00  00
+      */
+      *p++ = 0x83; *p++ = 0x05;
+      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
+      *p++ = 0x01;
+      *p++ = 0x83; *p++ = 0x15;
+      *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
+      *p++ = 0x00;
+      /* Tell the caller .. */
+      vassert(!(*is_profInc));
+      *is_profInc = True;
+      goto done;
+   }
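In C terms, the addl/adcl pair is a 64-bit increment split into two 32-bit halves with manual carry propagation; a small equivalent, for illustration:

   /* What the patched addl $1 / adcl $0 pair computes. */
   static void profinc_equivalent(unsigned int *lo, unsigned int *hi)
   {
      unsigned int old = *lo;
      *lo = old + 1;      /* addl $1, counter   */
      if (*lo < old)      /* carry out of the low half? */
         *hi += 1;        /* adcl $0, counter+4 */
   }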
+
    default: 
       goto bad;
    }
@@ -3104,6 +3296,140 @@
 #  undef fake
 }
 
+
+/* How big is an event check?  See case for Xin_EvCheck in
+   emit_X86Instr just above.  That crosschecks what this returns, so
+   we can tell if we're inconsistent. */
+Int evCheckSzB_X86 ( void )
+{
+   return 8;
+}
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   emitInstr case for XDirect, above. */
+VexInvalRange chainXDirect_X86 ( void* place_to_chain,
+                                 void* disp_cp_chain_me_EXPECTED,
+                                 void* place_to_jump_to )
+{
+   /* What we're expecting to see is:
+        movl $disp_cp_chain_me_EXPECTED, %edx
+        call *%edx
+      viz
+        BA <4 bytes value == disp_cp_chain_me_EXPECTED>
+        FF D2
+   */
+   UChar* p = (UChar*)place_to_chain;
+   vassert(p[0] == 0xBA);
+   vassert(*(UInt*)(&p[1]) == (UInt)Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
+   vassert(p[5] == 0xFF);
+   vassert(p[6] == 0xD2);
+   /* And what we want to change it to is:
+          jmp disp32   where disp32 is relative to the next insn
+          ud2;
+        viz
+          E9 <4 bytes == disp32>
+          0F 0B
+      The replacement has the same length as the original.
+   */
+   /* This is the delta we need to put into a JMP d32 insn.  It's
+      relative to the start of the next insn, hence the -5.  */
+   Long delta = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
+
+   /* And make the modifications. */
+   p[0] = 0xE9;
+   p[1] = (delta >> 0) & 0xFF;
+   p[2] = (delta >> 8) & 0xFF;
+   p[3] = (delta >> 16) & 0xFF;
+   p[4] = (delta >> 24) & 0xFF;
+   p[5] = 0x0F; p[6]  = 0x0B;
+   /* sanity check on the delta -- top 32 bits are all 0 or all 1 */
+   delta >>= 32;
+   vassert(delta == 0LL || delta == -1LL);
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
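A worked instance of the displacement computation, with addresses invented for illustration: chaining a site at 0x08040000 to a translation at 0x08041000 gives delta = 0x1000 - 5 = 0xFFB, so the patched bytes are E9 FB 0F 00 00 0F 0B.

   #include <assert.h>
   int main(void)
   {
      unsigned long p      = 0x08040000UL;   /* place_to_chain (assumed) */
      unsigned long target = 0x08041000UL;   /* place_to_jump_to (assumed) */
      long delta = (long)(target - p) - 5;   /* relative to the next insn */
      assert(delta == 0xFFB);
      return 0;
   }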
+
+
+/* NB: what goes on here has to be very closely coordinated with the
+   emitInstr case for XDirect, above. */
+VexInvalRange unchainXDirect_X86 ( void* place_to_unchain,
+                                   void* place_to_jump_to_EXPECTED,
+                                   void* disp_cp_chain_me )
+{
+   /* What we're expecting to see is:
+          jmp d32
+          ud2;
+       viz
+          E9 <4 bytes == disp32>
+          0F 0B
+   */
+   UChar* p     = (UChar*)place_to_unchain;
+   Bool   valid = False;
+   if (p[0] == 0xE9 
+       && p[5]  == 0x0F && p[6]  == 0x0B) {
+      /* Check the offset is right. */
+      Int s32 = *(Int*)(&p[1]);
+      if ((UChar*)p + 5 + s32 == (UChar*)place_to_jump_to_EXPECTED) {
+         valid = True;
+         if (0)
+            vex_printf("QQQ unchainXDirect_X86: found valid\n");
+      }
+   }
+   vassert(valid);
+   /* And what we want to change it to is:
+         movl $disp_cp_chain_me, %edx
+         call *%edx
+      viz
+         BA <4 bytes value == disp_cp_chain_me>
+         FF D2
+      So it's the same length (convenient, huh).
+   */
+   p[0] = 0xBA;
+   *(UInt*)(&p[1]) = (UInt)Ptr_to_ULong(disp_cp_chain_me);
+   p[5] = 0xFF;
+   p[6] = 0xD2;
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
+
+
+/* Patch the counter address into a profile inc point, as previously
+   created by the Xin_ProfInc case for emit_X86Instr. */
+VexInvalRange patchProfInc_X86 ( void*  place_to_patch,
+                                 ULong* location_of_counter )
+{
+   vassert(sizeof(ULong*) == 4);
+   UChar* p = (UChar*)place_to_patch;
+   vassert(p[0] == 0x83);
+   vassert(p[1] == 0x05);
+   vassert(p[2] == 0x00);
+   vassert(p[3] == 0x00);
+   vassert(p[4] == 0x00);
+   vassert(p[5] == 0x00);
+   vassert(p[6] == 0x01);
+   vassert(p[7] == 0x83);
+   vassert(p[8] == 0x15);
+   vassert(p[9] == 0x00);
+   vassert(p[10] == 0x00);
+   vassert(p[11] == 0x00);
+   vassert(p[12] == 0x00);
+   vassert(p[13] == 0x00);
+   UInt imm32 = (UInt)Ptr_to_ULong(location_of_counter);
+   p[2] = imm32 & 0xFF; imm32 >>= 8;
+   p[3] = imm32 & 0xFF; imm32 >>= 8;
+   p[4] = imm32 & 0xFF; imm32 >>= 8;
+   p[5] = imm32 & 0xFF; imm32 >>= 8;
+   imm32 = 4 + (UInt)Ptr_to_ULong(location_of_counter);
+   p[9]  = imm32 & 0xFF; imm32 >>= 8;
+   p[10] = imm32 & 0xFF; imm32 >>= 8;
+   p[11] = imm32 & 0xFF; imm32 >>= 8;
+   p[12] = imm32 & 0xFF; imm32 >>= 8;
+   VexInvalRange vir = {0, 0};
+   return vir;
+}
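The byte-at-a-time stores above are simply little-endian imm32 writes; the second field gets the counter address plus 4 because the adcl targets the high half of the 64-bit counter. A toy check of the store pattern:

   #include <assert.h>
   int main(void)
   {
      unsigned char p[4];
      unsigned int imm32 = 0x12345678u;   /* stand-in for the address */
      p[0] = imm32 & 0xFF; imm32 >>= 8;
      p[1] = imm32 & 0xFF; imm32 >>= 8;
      p[2] = imm32 & 0xFF; imm32 >>= 8;
      p[3] = imm32 & 0xFF;
      assert(p[0] == 0x78 && p[1] == 0x56 && p[2] == 0x34 && p[3] == 0x12);
      return 0;
   }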
+
+
 /*---------------------------------------------------------------*/
 /*--- end                                     host_x86_defs.c ---*/
 /*---------------------------------------------------------------*/
diff --git a/priv/host_x86_defs.h b/priv/host_x86_defs.h
index f68a426..544f8df 100644
--- a/priv/host_x86_defs.h
+++ b/priv/host_x86_defs.h
@@ -349,7 +349,9 @@
       Xin_Sh3232,    /* shldl or shrdl */
       Xin_Push,      /* push (32-bit?) value on stack */
       Xin_Call,      /* call to address in register */
-      Xin_Goto,      /* conditional/unconditional jmp to dst */
+      Xin_XDirect,   /* direct transfer to GA */
+      Xin_XIndir,    /* indirect transfer to GA */
+      Xin_XAssisted, /* assisted transfer to GA */
       Xin_CMov32,    /* conditional move */
       Xin_LoadEX,    /* mov{s,z}{b,w}l from mem to reg */
       Xin_Store,     /* store 16/8 bit value in memory */
@@ -378,7 +380,9 @@
       Xin_Sse64FLo,  /* SSE binary, 64F in lowest lane only */
       Xin_SseReRg,   /* SSE binary general reg-reg, Re, Rg */
       Xin_SseCMov,   /* SSE conditional move */
-      Xin_SseShuf    /* SSE2 shuffle (pshufd) */
+      Xin_SseShuf,   /* SSE2 shuffle (pshufd) */
+      Xin_EvCheck,   /* Event check */
+      Xin_ProfInc    /* 64-bit profile counter increment */
    }
    X86InstrTag;
 
@@ -444,13 +448,30 @@
             Addr32      target;
             Int         regparms; /* 0 .. 3 */
          } Call;
-         /* Pseudo-insn.  Goto dst, on given condition (which could be
-            Xcc_ALWAYS). */
+         /* Update the guest EIP value, then exit requesting to chain
+            to it.  May be conditional.  Urr, use of Addr32 implicitly
+            assumes that wordsize(guest) == wordsize(host). */
          struct {
+            Addr32      dstGA;    /* next guest address */
+            X86AMode*   amEIP;    /* amode in guest state for EIP */
+            X86CondCode cond;     /* can be Xcc_ALWAYS */
+            Bool        toFastEP; /* chain to the slow or fast point? */
+         } XDirect;
+         /* Boring transfer to a guest address not known at JIT time.
+            Not chainable.  May be conditional. */
+         struct {
+            HReg        dstGA;
+            X86AMode*   amEIP;
+            X86CondCode cond; /* can be Xcc_ALWAYS */
+         } XIndir;
+         /* Assisted transfer to a guest address, most general case.
+            Not chainable.  May be conditional. */
+         struct {
+            HReg        dstGA;
+            X86AMode*   amEIP;
+            X86CondCode cond; /* can be Xcc_ALWAYS */
             IRJumpKind  jk;
-            X86CondCode cond;
-            X86RI*      dst;
-         } Goto;
+         } XAssisted;
          /* Mov src to dst on the given condition, which may not
             be the bogus Xcc_ALWAYS. */
          struct {
@@ -615,6 +636,15 @@
             HReg   src;
             HReg   dst;
          } SseShuf;
+         struct {
+            X86AMode* amCounter;
+            X86AMode* amFailAddr;
+         } EvCheck;
+         struct {
+            /* No fields.  The address of the counter to inc is
+               installed later, post-translation, by patching it in,
+               as it is not known at translation time. */
+         } ProfInc;
 
       } Xin;
    }
@@ -632,7 +662,12 @@
 extern X86Instr* X86Instr_Sh3232    ( X86ShiftOp, UInt amt, HReg src, HReg dst );
 extern X86Instr* X86Instr_Push      ( X86RMI* );
 extern X86Instr* X86Instr_Call      ( X86CondCode, Addr32, Int );
-extern X86Instr* X86Instr_Goto      ( IRJumpKind, X86CondCode cond, X86RI* dst );
+extern X86Instr* X86Instr_XDirect   ( Addr32 dstGA, X86AMode* amEIP,
+                                      X86CondCode cond, Bool toFastEP );
+extern X86Instr* X86Instr_XIndir    ( HReg dstGA, X86AMode* amEIP,
+                                      X86CondCode cond );
+extern X86Instr* X86Instr_XAssisted ( HReg dstGA, X86AMode* amEIP,
+                                      X86CondCode cond, IRJumpKind jk );
 extern X86Instr* X86Instr_CMov32    ( X86CondCode, X86RM* src, HReg dst );
 extern X86Instr* X86Instr_LoadEX    ( UChar szSmall, Bool syned,
                                       X86AMode* src, HReg dst );
@@ -663,6 +698,9 @@
 extern X86Instr* X86Instr_SseReRg   ( X86SseOp, HReg, HReg );
 extern X86Instr* X86Instr_SseCMov   ( X86CondCode, HReg src, HReg dst );
 extern X86Instr* X86Instr_SseShuf   ( Int order, HReg src, HReg dst );
+extern X86Instr* X86Instr_EvCheck   ( X86AMode* amCounter,
+                                      X86AMode* amFailAddr );
+extern X86Instr* X86Instr_ProfInc   ( void );
 
 
 extern void ppX86Instr ( X86Instr*, Bool );
@@ -672,10 +710,13 @@
 extern void         getRegUsage_X86Instr ( HRegUsage*, X86Instr*, Bool );
 extern void         mapRegs_X86Instr     ( HRegRemap*, X86Instr*, Bool );
 extern Bool         isMove_X86Instr      ( X86Instr*, HReg*, HReg* );
-extern Int          emit_X86Instr        ( UChar* buf, Int nbuf, X86Instr*, 
-                                           Bool,
-                                           void* dispatch_unassisted,
-                                           void* dispatch_assisted );
+extern Int          emit_X86Instr        ( /*MB_MOD*/Bool* is_profInc,
+                                           UChar* buf, Int nbuf, X86Instr* i, 
+                                           Bool mode64,
+                                           void* disp_cp_chain_me_to_slowEP,
+                                           void* disp_cp_chain_me_to_fastEP,
+                                           void* disp_cp_xindir,
+                                           void* disp_cp_xassisted );
 
 extern void genSpill_X86  ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
                             HReg rreg, Int offset, Bool );
@@ -685,9 +726,36 @@
 extern X86Instr*    directReload_X86     ( X86Instr* i, 
                                            HReg vreg, Short spill_off );
 extern void         getAllocableRegs_X86 ( Int*, HReg** );
-extern HInstrArray* iselSB_X86           ( IRSB*, VexArch,
-                                                  VexArchInfo*,
-                                                  VexAbiInfo* );
+extern HInstrArray* iselSB_X86           ( IRSB*, 
+                                           VexArch,
+                                           VexArchInfo*,
+                                           VexAbiInfo*,
+                                           Int offs_Host_EvC_Counter,
+                                           Int offs_Host_EvC_FailAddr,
+                                           Bool chainingAllowed,
+                                           Bool addProfInc,
+                                           Addr64 max_ga );
+
+/* How big is an event check?  This is kind of a kludge because it
+   depends on the offsets of host_EvC_FAILADDR and host_EvC_COUNTER,
+   and so assumes that they are both < 128, and so can use the short
+   offset encoding.  This is all checked with assertions, so in the
+   worst case we will merely assert at startup. */
+extern Int evCheckSzB_X86 ( void );
+
+/* Perform a chaining and unchaining of an XDirect jump. */
+extern VexInvalRange chainXDirect_X86 ( void* place_to_chain,
+                                        void* disp_cp_chain_me_EXPECTED,
+                                        void* place_to_jump_to );
+
+extern VexInvalRange unchainXDirect_X86 ( void* place_to_unchain,
+                                          void* place_to_jump_to_EXPECTED,
+                                          void* disp_cp_chain_me );
+
+/* Patch the counter location into an existing ProfInc point. */
+extern VexInvalRange patchProfInc_X86 ( void*  place_to_patch,
+                                        ULong* location_of_counter );
+
 
 #endif /* ndef __VEX_HOST_X86_DEFS_H */
 
diff --git a/priv/host_x86_isel.c b/priv/host_x86_isel.c
index 81896b3..2dd14ce 100644
--- a/priv/host_x86_isel.c
+++ b/priv/host_x86_isel.c
@@ -154,21 +154,38 @@
    - The host subarchitecture we are selecting insns for.  
      This is set at the start and does not change.
 
-   Note, this is all host-independent.  */
+   - A Bool for indicating whether we may generate chain-me
+     instructions for control flow transfers, or whether we must use
+     XAssisted.
+
+   - The maximum guest address of any guest insn in this block.
+     Actually, the address of the highest-addressed byte from any insn
+     in this block.  Is set at the start and does not change.  This is
+     used for detecting jumps which are definitely forward-edges from
+     this block, and therefore can be made (chained) to the fast entry
+     point of the destination, thereby avoiding the destination's
+     event check.
+
+   Note, this is all (well, mostly) host-independent.
+*/
 
 typedef
    struct {
+      /* Constant -- set at the start and do not change. */
       IRTypeEnv*   type_env;
 
       HReg*        vregmap;
       HReg*        vregmapHI;
       Int          n_vregmap;
 
-      HInstrArray* code;
-
-      Int          vreg_ctr;
-
       UInt         hwcaps;
+
+      Bool         chainingAllowed;
+      Addr64       max_ga;
+
+      /* These are modified as we go along. */
+      HInstrArray* code;
+      Int          vreg_ctr;
    }
    ISelEnv;
 
@@ -4038,14 +4055,48 @@
 
    /* --------- EXIT --------- */
    case Ist_Exit: {
-      X86RI*      dst;
-      X86CondCode cc;
       if (stmt->Ist.Exit.dst->tag != Ico_U32)
-         vpanic("isel_x86: Ist_Exit: dst is not a 32-bit value");
-      dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
-      cc  = iselCondCode(env,stmt->Ist.Exit.guard);
-      addInstr(env, X86Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
-      return;
+         vpanic("iselStmt(x86): Ist_Exit: dst is not a 32-bit value");
+
+      X86CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
+      X86AMode*   amEIP = X86AMode_IR(stmt->Ist.Exit.offsIP,
+                                      hregX86_EBP());
+
+      /* Case: boring transfer to known address */
+      if (stmt->Ist.Exit.jk == Ijk_Boring) {
+         if (env->chainingAllowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool toFastEP
+               = ((Addr32)stmt->Ist.Exit.dst->Ico.U32) > env->max_ga;
+            if (0) vex_printf("%s", toFastEP ? "Y" : ",");
+            addInstr(env, X86Instr_XDirect(stmt->Ist.Exit.dst->Ico.U32,
+                                           amEIP, cc, toFastEP));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, X86Instr_XAssisted(r, amEIP, cc, Ijk_Boring));
+         }
+         return;
+      }
+
+      /* Case: assisted transfer to arbitrary address */
+      switch (stmt->Ist.Exit.jk) {
+         case Ijk_MapFail:
+         case Ijk_SigSEGV: case Ijk_TInval: case Ijk_EmWarn: {
+            HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+            addInstr(env, X86Instr_XAssisted(r, amEIP, cc, stmt->Ist.Exit.jk));
+            return;
+         }
+         default:
+            break;
+      }
+
+      /* Do we ever expect to see any other kind? */
+      goto stmt_fail;
    }
 
    default: break;
@@ -4060,18 +4111,82 @@
 /*--- ISEL: Basic block terminators (Nexts)             ---*/
 /*---------------------------------------------------------*/
 
-static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
+static void iselNext ( ISelEnv* env,
+                       IRExpr* next, IRJumpKind jk, Int offsIP )
 {
-   X86RI* ri;
    if (vex_traceflags & VEX_TRACE_VCODE) {
-      vex_printf("\n-- goto {");
+      vex_printf( "\n-- PUT(%d) = ", offsIP);
+      ppIRExpr( next );
+      vex_printf( "; exit-");
       ppIRJumpKind(jk);
-      vex_printf("} ");
-      ppIRExpr(next);
-      vex_printf("\n");
+      vex_printf( "\n");
    }
-   ri = iselIntExpr_RI(env, next);
-   addInstr(env, X86Instr_Goto(jk, Xcc_ALWAYS,ri));
+
+   /* Case: boring transfer to known address */
+   if (next->tag == Iex_Const) {
+      IRConst* cdst = next->Iex.Const.con;
+      vassert(cdst->tag == Ico_U32);
+      if (jk == Ijk_Boring || jk == Ijk_Call) {
+         /* Boring transfer to known address */
+         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
+         if (env->chainingAllowed) {
+            /* .. almost always true .. */
+            /* Skip the event check at the dst if this is a forwards
+               edge. */
+            Bool toFastEP
+               = ((Addr64)cdst->Ico.U32) > env->max_ga;
+            if (0) vex_printf("%s", toFastEP ? "X" : ".");
+            addInstr(env, X86Instr_XDirect(cdst->Ico.U32,
+                                           amEIP, Xcc_ALWAYS, 
+                                           toFastEP));
+         } else {
+            /* .. very occasionally .. */
+            /* We can't use chaining, so ask for an assisted transfer,
+               as that's the only alternative that is allowable. */
+            HReg r = iselIntExpr_R(env, next);
+            addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
+                                             Ijk_Boring));
+         }
+         return;
+      }
+   }
+
+   /* Case: call/return (==boring) transfer to any address */
+   switch (jk) {
+      case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
+         HReg      r     = iselIntExpr_R(env, next);
+         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
+         if (env->chainingAllowed) {
+            addInstr(env, X86Instr_XIndir(r, amEIP, Xcc_ALWAYS));
+         } else {
+            addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
+                                               Ijk_Boring));
+         }
+         return;
+      }
+      default:
+         break;
+   }
+
+   /* Case: some other kind of transfer to any address */
+   switch (jk) {
+      case Ijk_Sys_int128: case Ijk_ClientReq: case Ijk_NoRedir:
+      case Ijk_Yield: case Ijk_SigTRAP: case Ijk_TInval: {
+         HReg      r     = iselIntExpr_R(env, next);
+         X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
+         addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS, jk));
+         return;
+      }
+      default:
+         break;
+   }
+
+   vex_printf( "\n-- PUT(%d) = ", offsIP);
+   ppIRExpr( next );
+   vex_printf( "; exit-");
+   ppIRJumpKind(jk);
+   vex_printf( "\n");
+   vassert(0); // are we expecting any other kind?
 }
 
 
@@ -4081,14 +4196,21 @@
 
 /* Translate an entire SB to x86 code. */
 
-HInstrArray* iselSB_X86 ( IRSB* bb, VexArch      arch_host,
-                                    VexArchInfo* archinfo_host,
-                                    VexAbiInfo*  vbi/*UNUSED*/ )
+HInstrArray* iselSB_X86 ( IRSB* bb,
+                          VexArch      arch_host,
+                          VexArchInfo* archinfo_host,
+                          VexAbiInfo*  vbi/*UNUSED*/,
+                          Int offs_Host_EvC_Counter,
+                          Int offs_Host_EvC_FailAddr,
+                          Bool chainingAllowed,
+                          Bool addProfInc,
+                          Addr64 max_ga )
 {
    Int      i, j;
    HReg     hreg, hregHI;
    ISelEnv* env;
    UInt     hwcaps_host = archinfo_host->hwcaps;
+   X86AMode *amCounter, *amFailAddr;
 
    /* sanity ... */
    vassert(arch_host == VexArchX86);
@@ -4097,6 +4219,8 @@
                      | VEX_HWCAPS_X86_SSE2
                      | VEX_HWCAPS_X86_SSE3
                      | VEX_HWCAPS_X86_LZCNT)));
+   vassert(sizeof(max_ga) == 8);
+   vassert((max_ga >> 32) == 0);
 
    /* Make up an initial environment to use. */
    env = LibVEX_Alloc(sizeof(ISelEnv));
@@ -4115,7 +4239,9 @@
    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
 
    /* and finally ... */
-   env->hwcaps = hwcaps_host;
+   env->chainingAllowed = chainingAllowed;
+   env->hwcaps          = hwcaps_host;
+   env->max_ga          = max_ga;
 
    /* For each IR temporary, allocate a suitably-kinded virtual
       register. */
@@ -4140,11 +4266,24 @@
    }
    env->vreg_ctr = j;
 
+   /* The very first instruction must be an event check. */
+   amCounter  = X86AMode_IR(offs_Host_EvC_Counter,  hregX86_EBP());
+   amFailAddr = X86AMode_IR(offs_Host_EvC_FailAddr, hregX86_EBP());
+   addInstr(env, X86Instr_EvCheck(amCounter, amFailAddr));
+
+   /* Possibly a block counter increment (for profiling).  At this
+      point we don't know the address of the counter, so just pretend
+      it is zero.  It will have to be patched later, but before this
+      translation is used, by a call to LibVEX_PatchProfInc. */
+   if (addProfInc) {
+      addInstr(env, X86Instr_ProfInc());
+   }
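+
+   /* The emitted block therefore has the shape (a sketch; exact
+      encodings are the backend's business):
+
+         <event check: decrement the counter at amCounter and, if it
+          goes negative, jump to the address stored at amFailAddr>
+         [<profiling counter increment, if requested>]
+         <code for the IR statements>
+         <final PUT(offsIP) and X{Direct,Indir,Assisted} transfer>
+   */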
+
    /* Ok, finally we can iterate over the statements. */
    for (i = 0; i < bb->stmts_used; i++)
-      iselStmt(env,bb->stmts[i]);
+      iselStmt(env, bb->stmts[i]);
 
-   iselNext(env,bb->next,bb->jumpkind);
+   iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
 
    /* record the number of vregs we used. */
    env->code->n_vregs = env->vreg_ctr;
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index ae0d090..d38acd7 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -1253,10 +1253,11 @@
       case Ist_Exit:
          vex_printf( "if (" );
          ppIRExpr(s->Ist.Exit.guard);
-         vex_printf( ") goto {");
-         ppIRJumpKind(s->Ist.Exit.jk);
-         vex_printf("} ");
+         vex_printf( ") { PUT(%d) = ", s->Ist.Exit.offsIP);
          ppIRConst(s->Ist.Exit.dst);
+         vex_printf("; exit-");
+         ppIRJumpKind(s->Ist.Exit.jk);
+         vex_printf(" } ");
          break;
       default: 
          vpanic("ppIRStmt");
@@ -1291,10 +1292,10 @@
       ppIRStmt(bb->stmts[i]);
       vex_printf( "\n");
    }
-   vex_printf( "   goto {");
-   ppIRJumpKind(bb->jumpkind);
-   vex_printf( "} ");
+   vex_printf( "   PUT(%d) = ", bb->offsIP );
    ppIRExpr( bb->next );
+   vex_printf( "; exit-");
+   ppIRJumpKind(bb->jumpkind);
    vex_printf( "\n}\n");
 }
 
@@ -1725,12 +1726,14 @@
    s->Ist.MBE.event = event;
    return s;
 }
-IRStmt* IRStmt_Exit ( IRExpr* guard, IRJumpKind jk, IRConst* dst ) {
-   IRStmt* s         = LibVEX_Alloc(sizeof(IRStmt));
-   s->tag            = Ist_Exit;
-   s->Ist.Exit.guard = guard;
-   s->Ist.Exit.jk    = jk;
-   s->Ist.Exit.dst   = dst;
+IRStmt* IRStmt_Exit ( IRExpr* guard, IRJumpKind jk, IRConst* dst,
+                      Int offsIP ) {
+   IRStmt* s          = LibVEX_Alloc(sizeof(IRStmt));
+   s->tag             = Ist_Exit;
+   s->Ist.Exit.guard  = guard;
+   s->Ist.Exit.jk     = jk;
+   s->Ist.Exit.dst    = dst;
+   s->Ist.Exit.offsIP = offsIP;
    return s;
 }
 
@@ -1758,6 +1761,7 @@
    bb->stmts      = LibVEX_Alloc(bb->stmts_size * sizeof(IRStmt*));
    bb->next       = NULL;
    bb->jumpkind   = Ijk_Boring;
+   bb->offsIP     = 0;
    return bb;
 }
 
@@ -1948,7 +1952,8 @@
       case Ist_Exit: 
          return IRStmt_Exit(deepCopyIRExpr(s->Ist.Exit.guard),
                             s->Ist.Exit.jk,
-                            deepCopyIRConst(s->Ist.Exit.dst));
+                            deepCopyIRConst(s->Ist.Exit.dst),
+                            s->Ist.Exit.offsIP);
       default: 
          vpanic("deepCopyIRStmt");
    }
@@ -1975,7 +1980,7 @@
    sts2 = LibVEX_Alloc(bb2->stmts_used * sizeof(IRStmt*));
    for (i = 0; i < bb2->stmts_used; i++)
       sts2[i] = deepCopyIRStmt(bb->stmts[i]);
-   bb2->stmts    = sts2;
+   bb2->stmts = sts2;
    return bb2;
 }
 
@@ -1985,6 +1990,7 @@
    bb2->tyenv    = deepCopyIRTypeEnv(bb->tyenv);
    bb2->next     = deepCopyIRExpr(bb->next);
    bb2->jumpkind = bb->jumpkind;
+   bb2->offsIP   = bb->offsIP;
    return bb2;
 }
 
@@ -3508,6 +3514,9 @@
             sanityCheckFail(bb,stmt,"IRStmt.Exit.dst: bad dst");
          if (typeOfIRConst(stmt->Ist.Exit.dst) != gWordTy)
             sanityCheckFail(bb,stmt,"IRStmt.Exit.dst: not :: guest word type");
+         /* because it would intersect with host_EvC_* */
+         if (stmt->Ist.Exit.offsIP < 16)
+            sanityCheckFail(bb,stmt,"IRStmt.Exit.offsIP: too low");
          break;
       default:
          vpanic("tcStmt");
@@ -3634,6 +3643,10 @@
          tcStmt( bb, bb->stmts[i], guest_word_size );
    if (typeOfIRExpr(bb->tyenv,bb->next) != guest_word_size)
       sanityCheckFail(bb, NULL, "bb->next field has wrong type");
+   /* because it would intersect with host_EvC_* */
+   if (bb->offsIP < 16)
+      sanityCheckFail(bb, NULL, "bb->offsIP: too low");
+
 }
 
 /*---------------------------------------------------------------*/
diff --git a/priv/ir_opt.c b/priv/ir_opt.c
index 118249a..5bf44fd 100644
--- a/priv/ir_opt.c
+++ b/priv/ir_opt.c
@@ -467,7 +467,8 @@
       case Ist_Exit:
          e1 = flatten_Expr(bb, st->Ist.Exit.guard);
          addStmtToIRSB(bb, IRStmt_Exit(e1, st->Ist.Exit.jk,
-                                           st->Ist.Exit.dst));
+                                       st->Ist.Exit.dst,
+                                       st->Ist.Exit.offsIP));
          break;
       default:
          vex_printf("\n");
@@ -489,6 +490,7 @@
          flatten_Stmt( out, in->stmts[i] );
    out->next     = flatten_Expr( out, in->next );
    out->jumpkind = in->jumpkind;
+   out->offsIP   = in->offsIP;
    return out;
 }
 
@@ -815,6 +817,14 @@
    UInt    key = 0; /* keep gcc -O happy */
 
    HashHW* env = newHHW();
+
+   /* Initialise the running env with the fact that the final exit
+      writes the IP (or whatever it claims to write; we don't
+      care). */
+   key = mk_key_GetPut(bb->offsIP, typeOfIRExpr(bb->tyenv, bb->next));
+   addToHHW(env, (HWord)key, 0);
+
+   /* And now scan backwards through the statements. */
    for (i = bb->stmts_used-1; i >= 0; i--) {
       st = bb->stmts[i];
 
@@ -823,13 +833,32 @@
 
       /* Deal with conditional exits. */
       if (st->tag == Ist_Exit) {
-         /* Since control may not get beyond this point, we must empty
-            out the set, since we can no longer claim that the next
-            event for any part of the guest state is definitely a
-            write. */
-         vassert(isIRAtom(st->Ist.Exit.guard));
+         //Bool re_add;
+         /* We need to throw out of the env any part of it which
+            doesn't overlap the guest state written by this exit.
+            Since the exit only writes one section, it's simplest to
+            do this: (1) check whether env contains a write that
+            completely overlaps the write done by this exit; (2) empty
+            out env; and (3) if (1) was true, add the write done by
+            this exit.
+
+            To make (1) a bit simpler, merely search for a write that
+            exactly matches the one done by this exit.  That's safe
+            because it will fail as often or more often than a full
+            overlap check, and failure to find an overlapping write in
+            env is the safe case (we just nuke env if that
+            happens). */
+         //vassert(isIRAtom(st->Ist.Exit.guard));
+         /* (1) */
+         //key = mk_key_GetPut(st->Ist.Exit.offsIP,
+         //                    typeOfIRConst(st->Ist.Exit.dst));
+         //re_add = lookupHHW(env, NULL, key);
+         /* (2) */
          for (j = 0; j < env->used; j++)
             env->inuse[j] = False;
+         /* (3) */
+         //if (0 && re_add) 
+         //   addToHHW(env, (HWord)key, 0);
          continue;
       }
 
@@ -926,10 +955,24 @@
    assumed to compute different values. After all the accesses may happen
    at different times and the guest state / memory can have changed in
    the meantime. */
+
+/* JRS 20-Mar-2012: split sameIRExprs_aux into a fast inlineable
+   wrapper that deals with the common tags-don't-match case, and a
+   slower out-of-line general case.  Saves a few insns. */
+
+__attribute__((noinline))
+static Bool sameIRExprs_aux2 ( IRExpr** env, IRExpr* e1, IRExpr* e2 );
+
+inline
 static Bool sameIRExprs_aux ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
 {
    if (e1->tag != e2->tag) return False;
+   return sameIRExprs_aux2(env, e1, e2);
+}
 
+__attribute__((noinline))
+static Bool sameIRExprs_aux2 ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
+{
    if (num_nodes_visited++ > NODE_LIMIT) return False;
 
    switch (e1->tag) {
@@ -996,6 +1039,7 @@
    return False;
 }
 
+inline
 static Bool sameIRExprs ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
 {
    Bool same;
@@ -2217,7 +2261,8 @@
                   vex_printf("vex iropt: IRStmt_Exit became unconditional\n");
             }
          }
-         return IRStmt_Exit(fcond, st->Ist.Exit.jk, st->Ist.Exit.dst);
+         return IRStmt_Exit(fcond, st->Ist.Exit.jk,
+                                   st->Ist.Exit.dst, st->Ist.Exit.offsIP);
       }
 
    default:
@@ -2294,6 +2339,7 @@
 
    out->next     = subst_Expr( env, in->next );
    out->jumpkind = in->jumpkind;
+   out->offsIP   = in->offsIP;
    return out;
 }
 
@@ -2519,6 +2565,8 @@
          = IRExpr_Const( bb->stmts[i_unconditional_exit]->Ist.Exit.dst );
       bb->jumpkind
          = bb->stmts[i_unconditional_exit]->Ist.Exit.jk;
+      bb->offsIP
+         = bb->stmts[i_unconditional_exit]->Ist.Exit.offsIP;
       for (i = i_unconditional_exit; i < bb->stmts_used; i++)
          bb->stmts[i] = IRStmt_NoOp();
    }
@@ -4604,7 +4652,8 @@
          return IRStmt_Exit(
                    atbSubst_Expr(env, st->Ist.Exit.guard),
                    st->Ist.Exit.jk,
-                   st->Ist.Exit.dst
+                   st->Ist.Exit.dst,
+                   st->Ist.Exit.offsIP
                 );
       case Ist_IMark:
          return IRStmt_IMark(st->Ist.IMark.addr,
@@ -4649,7 +4698,7 @@
    }
 }
 
-/* notstatic */ void ado_treebuild_BB ( IRSB* bb )
+/* notstatic */ Addr64 ado_treebuild_BB ( IRSB* bb )
 {
    Int      i, j, k, m;
    Bool     stmtPuts, stmtStores, invalidateMe;
@@ -4657,19 +4706,37 @@
    IRStmt*  st2;
    ATmpInfo env[A_NENV];
 
+   Bool   max_ga_known = False;
+   Addr64 max_ga       = 0;
+
    Int       n_tmps = bb->tyenv->types_used;
    UShort*   uses   = LibVEX_Alloc(n_tmps * sizeof(UShort));
 
    /* Phase 1.  Scan forwards in bb, counting use occurrences of each
-      temp.  Also count occurrences in the bb->next field. */
+      temp.  Also count occurrences in the bb->next field.  Take the
+      opportunity to also find the maximum guest address in the block,
+      since that will be needed later for deciding when we can safely
+      elide event checks. */
 
    for (i = 0; i < n_tmps; i++)
       uses[i] = 0;
 
    for (i = 0; i < bb->stmts_used; i++) {
       st = bb->stmts[i];
-      if (st->tag == Ist_NoOp)
-         continue;
+      switch (st->tag) {
+         case Ist_NoOp:
+            continue;
+         case Ist_IMark: {
+            Int    len = st->Ist.IMark.len;
+            Addr64 mga = st->Ist.IMark.addr + (len < 1 ? 1 : len) - 1;
+            max_ga_known = True;
+            if (mga > max_ga)
+               max_ga = mga;
+            break;
+         }
+         default:
+            break;
+      }
       aoccCount_Stmt( uses, st );
    }
    aoccCount_Expr(uses, bb->next );
@@ -4842,6 +4909,8 @@
       by definition dead? */
    bb->next = atbSubst_Expr(env, bb->next);
    bb->stmts_used = j;
+
+   return max_ga_known ? max_ga : ~(Addr64)0;
 }
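+
+/* (The value returned above feeds, via LibVEX_Translate, into the
+   instruction selectors as max_ga: a direct jump to an address above
+   max_ga is a forwards edge, and so may be chained to the fast entry
+   point of its destination, skipping the event check there.) */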
 
 
diff --git a/priv/ir_opt.h b/priv/ir_opt.h
index 9390a1c..ded1c2d 100644
--- a/priv/ir_opt.h
+++ b/priv/ir_opt.h
@@ -60,9 +60,11 @@
 void do_deadcode_BB ( IRSB* bb );
 
 /* The tree-builder.  Make (approximately) maximal safe trees.  bb is
-   destructively modified. */
+   destructively modified.  Also returns (unrelated to tree-building,
+   but useful later on) the guest address of the highest addressed
+   byte from any insn in this block, or Addr64_MAX if unknown (can
+   that ever happen?). */
 extern
-void ado_treebuild_BB ( IRSB* bb );
+Addr64 ado_treebuild_BB ( IRSB* bb );
 
 #endif /* ndef __VEX_IR_OPT_H */
 
diff --git a/priv/main_main.c b/priv/main_main.c
index c8777fe..cf6e2f5 100644
--- a/priv/main_main.c
+++ b/priv/main_main.c
@@ -155,6 +155,17 @@
    vassert(VEX_HOST_WORDSIZE == sizeof(void*));
    vassert(VEX_HOST_WORDSIZE == sizeof(HWord));
 
+   /* These take a lot of space, so make sure we don't have
+      any unnoticed size regressions. */
+   if (VEX_HOST_WORDSIZE == 4) {
+      vassert(sizeof(IRExpr) == 24);
+      vassert(sizeof(IRStmt) == 20 /* x86 */
+              || sizeof(IRStmt) == 24 /* arm */);
+   } else {
+      vassert(sizeof(IRExpr) == 48);
+      vassert(sizeof(IRStmt) == 40);
+   }
+
    /* Really start up .. */
    vex_debuglevel         = debuglevel;
    vex_valgrind_support   = valgrind_support;
@@ -183,9 +194,11 @@
    HInstr*      (*directReload) ( HInstr*, HReg, Short );
    void         (*ppInstr)      ( HInstr*, Bool );
    void         (*ppReg)        ( HReg );
-   HInstrArray* (*iselSB)       ( IRSB*, VexArch, VexArchInfo*, 
-                                                  VexAbiInfo* );
-   Int          (*emit)         ( UChar*, Int, HInstr*, Bool, void*, void* );
+   HInstrArray* (*iselSB)       ( IRSB*, VexArch, VexArchInfo*, VexAbiInfo*,
+                                  Int, Int, Bool, Bool, Addr64 );
+   Int          (*emit)         ( /*MB_MOD*/Bool*,
+                                  UChar*, Int, HInstr*, Bool,
+                                  void*, void*, void*, void* );
    IRExpr*      (*specHelper)   ( HChar*, IRExpr**, IRStmt**, Int );
    Bool         (*preciseMemExnsFn) ( Int, Int );
 
@@ -197,11 +210,13 @@
    HInstrArray*    vcode;
    HInstrArray*    rcode;
    Int             i, j, k, out_used, guest_sizeB;
-   Int             offB_TISTART, offB_TILEN;
-   UChar           insn_bytes[48];
+   Int             offB_TISTART, offB_TILEN, offB_GUEST_IP, szB_GUEST_IP;
+   Int             offB_HOST_EvC_COUNTER, offB_HOST_EvC_FAILADDR;
+   UChar           insn_bytes[64];
    IRType          guest_word_type;
    IRType          host_word_type;
-   Bool            mode64;
+   Bool            mode64, chainingAllowed;
+   Addr64          max_ga;
 
    guest_layout           = NULL;
    available_real_regs    = NULL;
@@ -223,12 +238,27 @@
    host_word_type         = Ity_INVALID;
    offB_TISTART           = 0;
    offB_TILEN             = 0;
+   offB_GUEST_IP          = 0;
+   szB_GUEST_IP           = 0;
+   offB_HOST_EvC_COUNTER  = 0;
+   offB_HOST_EvC_FAILADDR = 0;
    mode64                 = False;
+   chainingAllowed        = False;
 
    vex_traceflags = vta->traceflags;
 
    vassert(vex_initdone);
-   vassert(vta->needs_self_check != NULL);
+   vassert(vta->needs_self_check  != NULL);
+   vassert(vta->disp_cp_xassisted != NULL);
+   /* Both the chainers and the indir are either NULL or non-NULL. */
+   if (vta->disp_cp_chain_me_to_slowEP        != NULL) {
+      vassert(vta->disp_cp_chain_me_to_fastEP != NULL);
+      vassert(vta->disp_cp_xindir             != NULL);
+      chainingAllowed = True;
+   } else {
+      vassert(vta->disp_cp_chain_me_to_fastEP == NULL);
+      vassert(vta->disp_cp_xindir             == NULL);
+   }
 
    vexSetAllocModeTEMP_and_clear();
    vexAllocSanityCheck();
@@ -254,14 +284,12 @@
          ppInstr      = (void(*)(HInstr*, Bool)) ppX86Instr;
          ppReg        = (void(*)(HReg)) ppHRegX86;
          iselSB       = iselSB_X86;
-         emit         = (Int(*)(UChar*,Int,HInstr*,Bool,void*,void*))
+         emit         = (Int(*)(Bool*,UChar*,Int,HInstr*,Bool,
+                               void*,void*,void*,void*))
                         emit_X86Instr;
          host_is_bigendian = False;
          host_word_type    = Ity_I32;
          vassert(are_valid_hwcaps(VexArchX86, vta->archinfo_host.hwcaps));
-         /* jump-to-dispatcher scheme */
-         vassert(vta->dispatch_unassisted != NULL);
-         vassert(vta->dispatch_assisted != NULL);
          break;
 
       case VexArchAMD64:
@@ -279,14 +307,12 @@
          ppInstr     = (void(*)(HInstr*, Bool)) ppAMD64Instr;
          ppReg       = (void(*)(HReg)) ppHRegAMD64;
          iselSB      = iselSB_AMD64;
-         emit        = (Int(*)(UChar*,Int,HInstr*,Bool,void*,void*))
+         emit        = (Int(*)(Bool*,UChar*,Int,HInstr*,Bool,
+                               void*,void*,void*,void*))
                        emit_AMD64Instr;
          host_is_bigendian = False;
          host_word_type    = Ity_I64;
          vassert(are_valid_hwcaps(VexArchAMD64, vta->archinfo_host.hwcaps));
-         /* jump-to-dispatcher scheme */
-         vassert(vta->dispatch_unassisted != NULL);
-         vassert(vta->dispatch_assisted != NULL);
          break;
 
       case VexArchPPC32:
@@ -301,14 +327,12 @@
          ppInstr     = (void(*)(HInstr*,Bool)) ppPPCInstr;
          ppReg       = (void(*)(HReg)) ppHRegPPC;
          iselSB      = iselSB_PPC;
-         emit        = (Int(*)(UChar*,Int,HInstr*,Bool,void*,void*))
+         emit        = (Int(*)(Bool*,UChar*,Int,HInstr*,Bool,
+                               void*,void*,void*,void*))
                        emit_PPCInstr;
          host_is_bigendian = True;
          host_word_type    = Ity_I32;
          vassert(are_valid_hwcaps(VexArchPPC32, vta->archinfo_host.hwcaps));
-         /* return-to-dispatcher scheme */
-         vassert(vta->dispatch_unassisted == NULL);
-         vassert(vta->dispatch_assisted == NULL);
          break;
 
       case VexArchPPC64:
@@ -323,14 +347,12 @@
          ppInstr     = (void(*)(HInstr*, Bool)) ppPPCInstr;
          ppReg       = (void(*)(HReg)) ppHRegPPC;
          iselSB      = iselSB_PPC;
-         emit        = (Int(*)(UChar*,Int,HInstr*,Bool,void*,void*))
+         emit        = (Int(*)(Bool*,UChar*,Int,HInstr*,Bool,
+                               void*,void*,void*,void*))
                        emit_PPCInstr;
          host_is_bigendian = True;
          host_word_type    = Ity_I64;
          vassert(are_valid_hwcaps(VexArchPPC64, vta->archinfo_host.hwcaps));
-         /* return-to-dispatcher scheme */
-         vassert(vta->dispatch_unassisted == NULL);
-         vassert(vta->dispatch_assisted == NULL);
          break;
 
       case VexArchS390X:
@@ -345,14 +367,11 @@
          ppInstr     = (void(*)(HInstr*, Bool)) ppS390Instr;
          ppReg       = (void(*)(HReg)) ppHRegS390;
          iselSB      = iselSB_S390;
-         emit        = (Int(*)(UChar*,Int,HInstr*,Bool,void*,void*))
-                       emit_S390Instr;
+         emit        = (Int(*)(Bool*,UChar*,Int,HInstr*,Bool,
+                               void*,void*,void*,void*)) emit_S390Instr;
          host_is_bigendian = True;
          host_word_type    = Ity_I64;
          vassert(are_valid_hwcaps(VexArchS390X, vta->archinfo_host.hwcaps));
-         /* return-to-dispatcher scheme */
-         vassert(vta->dispatch_unassisted == NULL);
-         vassert(vta->dispatch_assisted == NULL);
          break;
 
       case VexArchARM:
@@ -367,14 +386,12 @@
          ppInstr     = (void(*)(HInstr*, Bool)) ppARMInstr;
          ppReg       = (void(*)(HReg)) ppHRegARM;
          iselSB      = iselSB_ARM;
-         emit        = (Int(*)(UChar*,Int,HInstr*,Bool,void*,void*))
+         emit        = (Int(*)(Bool*,UChar*,Int,HInstr*,Bool,
+                               void*,void*,void*,void*))
                        emit_ARMInstr;
          host_is_bigendian = False;
          host_word_type    = Ity_I32;
          vassert(are_valid_hwcaps(VexArchARM, vta->archinfo_host.hwcaps));
-         vassert(vta->dispatch_unassisted == NULL);
-         vassert(vta->dispatch_assisted == NULL);
-         /* return-to-dispatcher scheme */
          break;
 
       default:
@@ -385,14 +402,18 @@
    switch (vta->arch_guest) {
 
       case VexArchX86:
-         preciseMemExnsFn = guest_x86_state_requires_precise_mem_exns;
-         disInstrFn       = disInstr_X86;
-         specHelper       = guest_x86_spechelper;
-         guest_sizeB      = sizeof(VexGuestX86State);
-         guest_word_type  = Ity_I32;
-         guest_layout     = &x86guest_layout;
-         offB_TISTART     = offsetof(VexGuestX86State,guest_TISTART);
-         offB_TILEN       = offsetof(VexGuestX86State,guest_TILEN);
+         preciseMemExnsFn       = guest_x86_state_requires_precise_mem_exns;
+         disInstrFn             = disInstr_X86;
+         specHelper             = guest_x86_spechelper;
+         guest_sizeB            = sizeof(VexGuestX86State);
+         guest_word_type        = Ity_I32;
+         guest_layout           = &x86guest_layout;
+         offB_TISTART           = offsetof(VexGuestX86State,guest_TISTART);
+         offB_TILEN             = offsetof(VexGuestX86State,guest_TILEN);
+         offB_GUEST_IP          = offsetof(VexGuestX86State,guest_EIP);
+         szB_GUEST_IP           = sizeof( ((VexGuestX86State*)0)->guest_EIP );
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestX86State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestX86State,host_EvC_FAILADDR);
          vassert(are_valid_hwcaps(VexArchX86, vta->archinfo_guest.hwcaps));
          vassert(0 == sizeof(VexGuestX86State) % 16);
          vassert(sizeof( ((VexGuestX86State*)0)->guest_TISTART) == 4);
@@ -401,14 +422,18 @@
          break;
 
       case VexArchAMD64:
-         preciseMemExnsFn = guest_amd64_state_requires_precise_mem_exns;
-         disInstrFn       = disInstr_AMD64;
-         specHelper       = guest_amd64_spechelper;
-         guest_sizeB      = sizeof(VexGuestAMD64State);
-         guest_word_type  = Ity_I64;
-         guest_layout     = &amd64guest_layout;
-         offB_TISTART     = offsetof(VexGuestAMD64State,guest_TISTART);
-         offB_TILEN       = offsetof(VexGuestAMD64State,guest_TILEN);
+         preciseMemExnsFn       = guest_amd64_state_requires_precise_mem_exns;
+         disInstrFn             = disInstr_AMD64;
+         specHelper             = guest_amd64_spechelper;
+         guest_sizeB            = sizeof(VexGuestAMD64State);
+         guest_word_type        = Ity_I64;
+         guest_layout           = &amd64guest_layout;
+         offB_TISTART           = offsetof(VexGuestAMD64State,guest_TISTART);
+         offB_TILEN             = offsetof(VexGuestAMD64State,guest_TILEN);
+         offB_GUEST_IP          = offsetof(VexGuestAMD64State,guest_RIP);
+         szB_GUEST_IP           = sizeof( ((VexGuestAMD64State*)0)->guest_RIP );
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestAMD64State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestAMD64State,host_EvC_FAILADDR);
          vassert(are_valid_hwcaps(VexArchAMD64, vta->archinfo_guest.hwcaps));
          vassert(0 == sizeof(VexGuestAMD64State) % 16);
          vassert(sizeof( ((VexGuestAMD64State*)0)->guest_TISTART ) == 8);
@@ -417,14 +442,18 @@
          break;
 
       case VexArchPPC32:
-         preciseMemExnsFn = guest_ppc32_state_requires_precise_mem_exns;
-         disInstrFn       = disInstr_PPC;
-         specHelper       = guest_ppc32_spechelper;
-         guest_sizeB      = sizeof(VexGuestPPC32State);
-         guest_word_type  = Ity_I32;
-         guest_layout     = &ppc32Guest_layout;
-         offB_TISTART     = offsetof(VexGuestPPC32State,guest_TISTART);
-         offB_TILEN       = offsetof(VexGuestPPC32State,guest_TILEN);
+         preciseMemExnsFn       = guest_ppc32_state_requires_precise_mem_exns;
+         disInstrFn             = disInstr_PPC;
+         specHelper             = guest_ppc32_spechelper;
+         guest_sizeB            = sizeof(VexGuestPPC32State);
+         guest_word_type        = Ity_I32;
+         guest_layout           = &ppc32Guest_layout;
+         offB_TISTART           = offsetof(VexGuestPPC32State,guest_TISTART);
+         offB_TILEN             = offsetof(VexGuestPPC32State,guest_TILEN);
+         offB_GUEST_IP          = offsetof(VexGuestPPC32State,guest_CIA);
+         szB_GUEST_IP           = sizeof( ((VexGuestPPC32State*)0)->guest_CIA );
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestPPC32State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestPPC32State,host_EvC_FAILADDR);
          vassert(are_valid_hwcaps(VexArchPPC32, vta->archinfo_guest.hwcaps));
          vassert(0 == sizeof(VexGuestPPC32State) % 16);
          vassert(sizeof( ((VexGuestPPC32State*)0)->guest_TISTART ) == 4);
@@ -433,14 +462,18 @@
          break;
 
       case VexArchPPC64:
-         preciseMemExnsFn = guest_ppc64_state_requires_precise_mem_exns;
-         disInstrFn       = disInstr_PPC;
-         specHelper       = guest_ppc64_spechelper;
-         guest_sizeB      = sizeof(VexGuestPPC64State);
-         guest_word_type  = Ity_I64;
-         guest_layout     = &ppc64Guest_layout;
-         offB_TISTART     = offsetof(VexGuestPPC64State,guest_TISTART);
-         offB_TILEN       = offsetof(VexGuestPPC64State,guest_TILEN);
+         preciseMemExnsFn       = guest_ppc64_state_requires_precise_mem_exns;
+         disInstrFn             = disInstr_PPC;
+         specHelper             = guest_ppc64_spechelper;
+         guest_sizeB            = sizeof(VexGuestPPC64State);
+         guest_word_type        = Ity_I64;
+         guest_layout           = &ppc64Guest_layout;
+         offB_TISTART           = offsetof(VexGuestPPC64State,guest_TISTART);
+         offB_TILEN             = offsetof(VexGuestPPC64State,guest_TILEN);
+         offB_GUEST_IP          = offsetof(VexGuestPPC64State,guest_CIA);
+         szB_GUEST_IP           = sizeof( ((VexGuestPPC64State*)0)->guest_CIA );
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestPPC64State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestPPC64State,host_EvC_FAILADDR);
          vassert(are_valid_hwcaps(VexArchPPC64, vta->archinfo_guest.hwcaps));
          vassert(0 == sizeof(VexGuestPPC64State) % 16);
          vassert(sizeof( ((VexGuestPPC64State*)0)->guest_TISTART    ) == 8);
@@ -458,6 +491,10 @@
          guest_layout     = &s390xGuest_layout;
          offB_TISTART     = offsetof(VexGuestS390XState,guest_TISTART);
          offB_TILEN       = offsetof(VexGuestS390XState,guest_TILEN);
+         offB_GUEST_IP          = offsetof(VexGuestS390XState,guest_IA);
+         szB_GUEST_IP           = sizeof( ((VexGuestS390XState*)0)->guest_IA);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestS390XState,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestS390XState,host_EvC_FAILADDR);
          vassert(are_valid_hwcaps(VexArchS390X, vta->archinfo_guest.hwcaps));
          vassert(0 == sizeof(VexGuestS390XState) % 16);
          vassert(sizeof( ((VexGuestS390XState*)0)->guest_TISTART    ) == 8);
@@ -466,14 +503,18 @@
          break;
 
       case VexArchARM:
-         preciseMemExnsFn = guest_arm_state_requires_precise_mem_exns;
-         disInstrFn       = disInstr_ARM;
-         specHelper       = guest_arm_spechelper;
-         guest_sizeB      = sizeof(VexGuestARMState);
-         guest_word_type  = Ity_I32;
-         guest_layout     = &armGuest_layout;
-         offB_TISTART     = offsetof(VexGuestARMState,guest_TISTART);
-         offB_TILEN       = offsetof(VexGuestARMState,guest_TILEN);
+         preciseMemExnsFn       = guest_arm_state_requires_precise_mem_exns;
+         disInstrFn             = disInstr_ARM;
+         specHelper             = guest_arm_spechelper;
+         guest_sizeB            = sizeof(VexGuestARMState);
+         guest_word_type        = Ity_I32;
+         guest_layout           = &armGuest_layout;
+         offB_TISTART           = offsetof(VexGuestARMState,guest_TISTART);
+         offB_TILEN             = offsetof(VexGuestARMState,guest_TILEN);
+         offB_GUEST_IP          = offsetof(VexGuestARMState,guest_R15T);
+         szB_GUEST_IP           = sizeof( ((VexGuestARMState*)0)->guest_R15T );
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestARMState,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestARMState,host_EvC_FAILADDR);
          vassert(are_valid_hwcaps(VexArchARM, vta->archinfo_guest.hwcaps));
          vassert(0 == sizeof(VexGuestARMState) % 16);
          vassert(sizeof( ((VexGuestARMState*)0)->guest_TISTART) == 4);
@@ -489,6 +530,7 @@
    VexTranslateResult res;
    res.status       = VexTransOK;
    res.n_sc_extents = 0;
+   res.offs_profInc = -1;
 
    /* yet more sanity checks ... */
    if (vta->arch_guest == vta->arch_host) {
@@ -520,7 +562,9 @@
                      vta->needs_self_check,
                      vta->preamble_function,
                      offB_TISTART,
-                     offB_TILEN );
+                     offB_TILEN,
+                     offB_GUEST_IP,
+                     szB_GUEST_IP );
 
    vexAllocSanityCheck();
 
@@ -627,7 +671,7 @@
 
    /* Turn it into virtual-registerised code.  Build trees -- this
       also throws away any dead bindings. */
-   ado_treebuild_BB( irsb );
+   max_ga = ado_treebuild_BB( irsb );
 
    if (vta->finaltidy) {
       irsb = vta->finaltidy(irsb);
@@ -655,8 +699,19 @@
                    " Instruction selection "
                    "------------------------\n");
 
-   vcode = iselSB ( irsb, vta->arch_host, &vta->archinfo_host, 
-                                          &vta->abiinfo_both );
+   /* No guest has its IP field within the first 16 bytes of the
+      state; it would intersect with the host_EvC_* fields.  If this
+      fails it means some transformation pass somewhere failed to
+      update/copy irsb->offsIP properly. */
+   vassert(irsb->offsIP >= 16);
+
+   vcode = iselSB ( irsb, vta->arch_host,
+                    &vta->archinfo_host, 
+                    &vta->abiinfo_both,
+                    offB_HOST_EvC_COUNTER,
+                    offB_HOST_EvC_FAILADDR,
+                    chainingAllowed,
+                    vta->addProfInc,
+                    max_ga );
 
    vexAllocSanityCheck();
 
@@ -710,13 +765,19 @@
 
    out_used = 0; /* tracks along the host_bytes array */
    for (i = 0; i < rcode->arr_used; i++) {
-      if (vex_traceflags & VEX_TRACE_ASM) {
-         ppInstr(rcode->arr[i], mode64);
+      HInstr* hi           = rcode->arr[i];
+      Bool    hi_isProfInc = False;
+      if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) {
+         ppInstr(hi, mode64);
          vex_printf("\n");
       }
-      j = (*emit)( insn_bytes, sizeof insn_bytes, rcode->arr[i], mode64,
-                   vta->dispatch_unassisted, vta->dispatch_assisted );
-      if (vex_traceflags & VEX_TRACE_ASM) {
+      j = emit( &hi_isProfInc,
+                insn_bytes, sizeof insn_bytes, hi, mode64,
+                vta->disp_cp_chain_me_to_slowEP,
+                vta->disp_cp_chain_me_to_fastEP,
+                vta->disp_cp_xindir,
+                vta->disp_cp_xassisted );
+      if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) {
          for (k = 0; k < j; k++)
             if (insn_bytes[k] < 16)
                vex_printf("0%x ",  (UInt)insn_bytes[k]);
@@ -724,15 +785,23 @@
                vex_printf("%x ", (UInt)insn_bytes[k]);
          vex_printf("\n\n");
       }
-      if (out_used + j > vta->host_bytes_size) {
+      if (UNLIKELY(out_used + j > vta->host_bytes_size)) {
          vexSetAllocModeTEMP_and_clear();
          vex_traceflags = 0;
          res.status = VexTransOutputFull;
          return res;
       }
-      for (k = 0; k < j; k++) {
-         vta->host_bytes[out_used] = insn_bytes[k];
-         out_used++;
+      if (UNLIKELY(hi_isProfInc)) {
+         vassert(vta->addProfInc); /* else where did it come from? */
+         vassert(res.offs_profInc == -1); /* there can be only one (tm) */
+         vassert(out_used >= 0);
+         res.offs_profInc = out_used;
+      }
+      { UChar* dst = &vta->host_bytes[out_used];
+        for (k = 0; k < j; k++) {
+           dst[k] = insn_bytes[k];
+        }
+        out_used += j;
       }
       vassert(out_used <= vta->host_bytes_size);
    }
@@ -748,6 +817,127 @@
 }
 
 
+/* --------- Chain/Unchain XDirects. --------- */
+
+VexInvalRange LibVEX_Chain ( VexArch arch_host,
+                             void*   place_to_chain,
+                             void*   disp_cp_chain_me_EXPECTED,
+                             void*   place_to_jump_to )
+{
+   VexInvalRange (*chainXDirect)(void*, void*, void*) = NULL;
+   switch (arch_host) {
+      case VexArchX86:
+         chainXDirect = chainXDirect_X86; break;
+      case VexArchAMD64:
+         chainXDirect = chainXDirect_AMD64; break;
+      case VexArchARM:
+         chainXDirect = chainXDirect_ARM; break;
+      case VexArchS390X:
+         chainXDirect = chainXDirect_S390; break;
+      case VexArchPPC32:
+         return chainXDirect_PPC(place_to_chain,
+                                 disp_cp_chain_me_EXPECTED,
+                                 place_to_jump_to, False/*!mode64*/);
+      case VexArchPPC64:
+         return chainXDirect_PPC(place_to_chain,
+                                 disp_cp_chain_me_EXPECTED,
+                                 place_to_jump_to, True/*mode64*/);
+      default:
+         vassert(0);
+   }
+   vassert(chainXDirect);
+   VexInvalRange vir
+      = chainXDirect(place_to_chain, disp_cp_chain_me_EXPECTED,
+                     place_to_jump_to);
+   return vir;
+}
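+
+/* A sketch of client-side use (tc_entry, dest_translation and
+   flush_icache are hypothetical client names, not part of this
+   API): once the destination of an XDirect at tc_entry has itself
+   been translated, the chain-me stub can be redirected at it:
+
+      VexInvalRange vir
+         = LibVEX_Chain( VexArchAMD64,
+                         tc_entry,                    // place_to_chain
+                         disp_cp_chain_me_to_fastEP,  // expected current target
+                         dest_translation );          // new target
+      flush_icache( (void*)vir.start, vir.len );
+*/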
+
+VexInvalRange LibVEX_UnChain ( VexArch arch_host,
+                               void*   place_to_unchain,
+                               void*   place_to_jump_to_EXPECTED,
+                               void*   disp_cp_chain_me )
+{
+   VexInvalRange (*unchainXDirect)(void*, void*, void*) = NULL;
+   switch (arch_host) {
+      case VexArchX86:
+         unchainXDirect = unchainXDirect_X86; break;
+      case VexArchAMD64:
+         unchainXDirect = unchainXDirect_AMD64; break;
+      case VexArchARM:
+         unchainXDirect = unchainXDirect_ARM; break;
+      case VexArchS390X:
+         unchainXDirect = unchainXDirect_S390; break;
+      case VexArchPPC32:
+         return unchainXDirect_PPC(place_to_unchain,
+                                   place_to_jump_to_EXPECTED,
+                                   disp_cp_chain_me, False/*!mode64*/);
+      case VexArchPPC64:
+         return unchainXDirect_PPC(place_to_unchain,
+                                   place_to_jump_to_EXPECTED,
+                                   disp_cp_chain_me, True/*mode64*/);
+      default:
+         vassert(0);
+   }
+   vassert(unchainXDirect);
+   VexInvalRange vir
+      = unchainXDirect(place_to_unchain, place_to_jump_to_EXPECTED,
+                       disp_cp_chain_me);
+   return vir;
+}
+
+Int LibVEX_evCheckSzB ( VexArch arch_host )
+{
+   static Int cached = 0; /* DO NOT MAKE NON-STATIC */
+   if (UNLIKELY(cached == 0)) {
+      switch (arch_host) {
+         case VexArchX86:
+            cached = evCheckSzB_X86(); break;
+         case VexArchAMD64:
+            cached = evCheckSzB_AMD64(); break;
+         case VexArchARM:
+            cached = evCheckSzB_ARM(); break;
+         case VexArchS390X:
+            cached = evCheckSzB_S390(); break;
+         case VexArchPPC32:
+         case VexArchPPC64:
+            cached = evCheckSzB_PPC(); break;
+         default:
+            vassert(0);
+      }
+   }
+   return cached;
+}
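+
+/* A client needs this to compute a translation's two entry points:
+   the slow one at the very start, which runs the event check, and
+   the fast one just beyond it, which skips the check.  A sketch,
+   assuming tstart points at the start of the translation:
+
+      UChar* slowEP = tstart;
+      UChar* fastEP = tstart + LibVEX_evCheckSzB(arch_host);
+*/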
+
+VexInvalRange LibVEX_PatchProfInc ( VexArch arch_host,
+                                    void*   place_to_patch,
+                                    ULong*  location_of_counter )
+{
+   VexInvalRange (*patchProfInc)(void*,ULong*) = NULL;
+   switch (arch_host) {
+      case VexArchX86:
+         patchProfInc = patchProfInc_X86; break;
+      case VexArchAMD64:
+         patchProfInc = patchProfInc_AMD64; break;
+      case VexArchARM:
+         patchProfInc = patchProfInc_ARM; break;
+      case VexArchS390X:
+         patchProfInc = patchProfInc_S390; break;
+      case VexArchPPC32:
+         return patchProfInc_PPC(place_to_patch,
+                                 location_of_counter, False/*!mode64*/);
+      case VexArchPPC64:
+         return patchProfInc_PPC(place_to_patch,
+                                 location_of_counter, True/*mode64*/);
+      default:
+         vassert(0);
+   }
+   vassert(patchProfInc);
+   VexInvalRange vir
+      = patchProfInc(place_to_patch, location_of_counter);
+   return vir;
+}
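+
+/* A sketch of client-side use (host_code and counter are
+   hypothetical client names): for a translation made with
+   vta->addProfInc set, res.offs_profInc gives the offset of the
+   dummy increment in the emitted code; it must be patched before
+   the translation is first run:
+
+      if (res.offs_profInc >= 0) {
+         VexInvalRange vir
+            = LibVEX_PatchProfInc( arch_host,
+                                   host_code + res.offs_profInc,
+                                   &counter );
+         // then invalidate vir.start .. vir.start + vir.len
+      }
+*/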
+
+
 /* --------- Emulation warnings. --------- */
 
 HChar* LibVEX_EmWarn_string ( VexEmWarn ew )