Merge patches from JeremyF to do lazy eflags updating:

- The D flag is separated from the rest (OSZACP)

- Minimise transfers between real and simulated %eflags since these
  are very expensive.

61-special-d

Make the D flag special: store it separately in the baseblock rather
than in EFLAGS.  It is used quite differently from the other flags,
and keeping them together just makes maintaining eflags harder.
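
For illustration (not part of the patch): the separate baseblock slot
m_dflag holds +1 or -1 rather than the raw D bit.  The helper below is
a hypothetical sketch of that mapping; the emitted PUTF code computes
the same value branch-free with a NEG ; SBB ; SBB $-1 sequence on the
masked D bit.

   #define EFlagD (1 << 10)      /* direction flag bit in EFLAGS */

   /* D clear => strings step forwards (+1); D set => backwards (-1) */
   static int dflag_from_eflags ( unsigned int eflags )
   {
      return (eflags & EFlagD) ? -1 : 1;
   }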

62-lazy-eflags

Implements lazy eflags save and restore. Helps a lot.
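
Roughly, the emitter now tracks which copy of %EFLAGS is current and
only transfers between them when forced to.  A minimal sketch of the
bookkeeping (states and transitions as in the patch; the comments stand
in for the real pushfl/popfl emission):

   typedef enum { UPD_Simd, UPD_Real, UPD_Both } EflagsState;

   static EflagsState state = UPD_Simd;  /* simd copy good at BB entry */

   /* called before emitting each instruction */
   static void before_insn ( int upds_simd_flags, int uses, int sets )
   {
      if (upds_simd_flags) {
         if (uses && state == UPD_Simd) {
            /* emit: pushl m_eflags(%ebp) ; popfl */
            state = UPD_Both;
         }
         if (sets)
            state = UPD_Real;   /* CPU now holds the only good copy */
      } else if (sets) {
         if (state == UPD_Real) {
            /* emit: pushfl ; popl m_eflags(%ebp) */
         }
         state = UPD_Simd;      /* real flags are about to be trashed */
      }
   }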

Hopefully more documentation to follow.
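
One example of the payoff: conditional jumps at the end of a block can
often be evaluated straight from the stored flags word instead of
reloading the real %eflags via popfl.  A sketch (not the actual emitted
code) of the signed "less-or-equal" case handled in synth_jcond_lit,
using the usual EFLAGS bit positions (ZF=6, SF=7, OF=11):

   static int cond_LE_holds ( unsigned int saved_eflags )
   {
      unsigned int zf = (saved_eflags >> 6)  & 1;
      unsigned int sf = (saved_eflags >> 7)  & 1;
      unsigned int of = (saved_eflags >> 11) & 1;
      /* LE holds iff ZF=1 or SF != OF; the emitted sequence computes
         the same value in %eax with two shifts, an XOR, an OR and an
         AND, then tests the result. */
      return ((of ^ sf) | zf) != 0;
   }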


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1346 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 242de5a..55e4634 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -65,6 +65,7 @@
    them into sequences of legal instructions.  Finally, emitUInstr is
    phrased in terms of the synth_* abstraction layer.  */
 
+/* Static state for the current basic block */
 static UChar* emitted_code;
 static Int    emitted_code_used;
 static Int    emitted_code_size;
@@ -73,6 +74,23 @@
 static UShort jumps[VG_MAX_JUMPS];
 static Int    jumpidx;
 
+static enum eflags_state {
+	UPD_Simd,		/* baseblock copy is up to date */
+	UPD_Real,		/* CPU copy is up to date */
+	UPD_Both,		/* both are current */
+} eflags_state;
+
+/* single site for resetting state */
+static void reset_state(void)
+{
+   emitted_code_used = 0;
+   emitted_code_size = 500; /* reasonable initial size */
+   emitted_code = VG_(arena_malloc)(VG_AR_JITTER, emitted_code_size);
+   jumpidx = 0;
+   eflags_state = UPD_Simd;
+}
+
+
 /* Statistics about C functions called from generated code. */
 static UInt ccalls                 = 0;
 static UInt ccall_reg_saves        = 0;
@@ -196,8 +214,101 @@
    VG_(emitB) ( (l >> 24) & 0x000000FF );
 }
 
-__inline__ void VG_(new_emit) ( void )
+static void emit_get_eflags ( void )
 {
+   Int off = 4 * VGOFF_(m_eflags);
+   vg_assert(off >= 0 && off < 128);
+
+   if (dis)
+      VG_(printf)("\t       %4d: ", emitted_code_used );
+
+   VG_(emitB) ( 0xFF ); /* PUSHL off(%ebp) */
+   VG_(emitB) ( 0x75 );
+   VG_(emitB) ( off );
+   VG_(emitB) ( 0x9D ); /* POPFL */
+   if (dis)
+      VG_(printf)( "\n\t\tpushl %d(%%ebp) ; popfl\n", off );
+}
+
+static void emit_put_eflags ( void )
+{
+   Int off = 4 * VGOFF_(m_eflags);
+   vg_assert(off >= 0 && off < 128);
+
+   if (dis)
+      VG_(printf)("\t       %4d: ", emitted_code_used );
+
+   VG_(emitB) ( 0x9C ); /* PUSHFL */
+   VG_(emitB) ( 0x8F ); /* POPL vg_m_state.m_eflags */
+   VG_(emitB) ( 0x45 );
+   VG_(emitB) ( off );
+   if (dis)
+      VG_(printf)( "\n\t\tpushfl ; popl %d(%%ebp)\n", off );
+}
+
+static void maybe_emit_put_eflags( void )
+{
+   if (eflags_state == UPD_Real) {
+      eflags_state = UPD_Both;
+      emit_put_eflags();
+   }
+}
+
+static void maybe_emit_get_eflags( void )
+{
+   if (eflags_state == UPD_Simd) {
+      eflags_state = UPD_Both;
+      emit_get_eflags();
+   }
+}
+
+/* Call this before emitting each instruction.
+
+   Arguments are:
+   upds_simd_flags: 
+      if true, this instruction updates the simulated %EFLAGS state,
+      otherwise it doesn't
+   use_flags: set of (real) flags the instruction uses
+   set_flags: set of (real) flags the instruction sets
+ */
+__inline__ 
+void VG_(new_emit) ( Bool upds_simd_flags, 
+                     FlagSet use_flags, FlagSet set_flags )
+{
+   Bool use, set;
+
+   use = use_flags != FlagsEmpty 
+         || (set_flags != FlagsEmpty && set_flags != FlagsOSZACP);
+   set = set_flags != FlagsEmpty;
+
+   if (0)
+      VG_(printf)(
+         "new_emit: state=%d upds_simd_flags=%d use_flags=%x set_flags=%x\n",
+         eflags_state, upds_simd_flags, use_flags, set_flags);
+
+   if (upds_simd_flags) {
+      if (use && eflags_state == UPD_Simd) {
+	 /* we need the CPU flags set, but they're not already */
+	 eflags_state = UPD_Both;
+	 emit_get_eflags();
+      }
+      if (set) {
+	 /* if we're setting the flags, then the CPU will have the
+	    only good copy */
+	 eflags_state = UPD_Real;
+      }
+   } else {
+      /* presume that if non-simd code is using flags, it knows what
+	 it's doing (ie, it just set up the flags). */
+      if (set) {
+	 /* This instruction is going to trash the flags, so we'd
+	    better save them away and say that they're only in the
+	    simulated state. */
+	 maybe_emit_put_eflags();
+	 eflags_state = UPD_Simd;
+      }
+   }
+
    if (dis)
       VG_(printf)("\t       %4d: ", emitted_code_used );
 }
@@ -310,6 +421,42 @@
    }
 }
 
+static __inline__ FlagSet nonshiftop_use(Opcode opc)
+{
+   switch(opc) {
+   case ADC:
+   case SBB:
+      return FlagC;
+
+   case ADD:
+   case OR:
+   case AND:
+   case SUB:
+   case XOR:
+      return FlagsEmpty;
+
+   default:
+      VG_(core_panic)("nonshiftop_use");
+   }
+}
+
+static __inline__ FlagSet nonshiftop_set(Opcode opc)
+{
+   switch(opc) {
+   case ADC:
+   case SBB:
+   case ADD:
+   case OR:
+   case AND:
+   case SUB:
+   case XOR:
+      return FlagsOSZACP;
+
+   default:
+      VG_(core_panic)("nonshiftop_set");
+   }
+}
+
 static __inline__ Int mkGrp2opcode ( Opcode opc )
 {
    switch (opc) {
@@ -324,6 +471,44 @@
    }
 }
 
+static __inline__ FlagSet shiftop_use(Opcode opc)
+{
+   switch(opc) {
+   case ROR:
+   case ROL:
+   case SHL:
+   case SHR:
+   case SAR:
+      return FlagsEmpty;
+
+   case RCL:
+   case RCR:
+      return FlagC;
+
+   default:
+      VG_(core_panic)("shiftop_use");
+   }
+}
+
+static __inline__ FlagSet shiftop_set(Opcode opc)
+{
+   switch(opc) {
+   case ROR:
+   case ROL:
+   case RCL:
+   case RCR:
+      return FlagsOC;
+
+   case SHL:
+   case SHR:
+   case SAR:
+      return FlagsOSZACP;
+
+   default:
+      VG_(core_panic)("shiftop_set");
+   }
+}
+
 static __inline__ Int mkGrp3opcode ( Opcode opc )
 {
    switch (opc) {
@@ -371,7 +556,7 @@
 
 void VG_(emit_movv_offregmem_reg) ( Int sz, Int off, Int areg, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 0x8B ); /* MOV Ev, Gv */
    VG_(emit_amode_offregmem_reg) ( off, areg, reg );
@@ -382,7 +567,7 @@
 
 void VG_(emit_movv_reg_offregmem) ( Int sz, Int reg, Int off, Int areg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 0x89 ); /* MOV Gv, Ev */
    VG_(emit_amode_offregmem_reg) ( off, areg, reg );
@@ -393,7 +578,7 @@
 
 static void emit_movv_regmem_reg ( Int sz, Int reg1, Int reg2 )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 0x8B ); /* MOV Ev, Gv */
    emit_amode_regmem_reg ( reg1, reg2 );
@@ -404,7 +589,7 @@
 
 static void emit_movv_reg_regmem ( Int sz, Int reg1, Int reg2 )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 0x89 ); /* MOV Gv, Ev */
    emit_amode_regmem_reg ( reg2, reg1 );
@@ -415,7 +600,7 @@
 
 void VG_(emit_movv_reg_reg) ( Int sz, Int reg1, Int reg2 )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 0x89 ); /* MOV Gv, Ev */
    VG_(emit_amode_ereg_greg) ( reg2, reg1 );
@@ -424,9 +609,10 @@
                    nameISize(sz), nameIReg(sz,reg1), nameIReg(sz,reg2));
 }
 
-void VG_(emit_nonshiftopv_lit_reg) ( Int sz, Opcode opc, UInt lit, Int reg )
+void VG_(emit_nonshiftopv_lit_reg) ( Bool upd_cc, Int sz, Opcode opc, UInt lit, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
+
    if (sz == 2) VG_(emitB) ( 0x66 );
    if (lit == VG_(extend_s_8to32)(lit & 0x000000FF)) {
       /* short form OK */
@@ -444,9 +630,31 @@
                    lit, nameIReg(sz,reg));
 }
 
-void VG_(emit_shiftopv_lit_reg) ( Int sz, Opcode opc, UInt lit, Int reg )
+void VG_(emit_nonshiftopv_lit_offregmem) ( Bool upd_cc, Int sz, Opcode opc, UInt lit, 
+					   Int off, Int regmem )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
+   if (sz == 2) VG_(emitB) ( 0x66 );
+   if (lit == VG_(extend_s_8to32)(lit & 0x000000FF)) {
+      /* short form OK */
+      VG_(emitB) ( 0x83 ); /* Grp1 Ib,Ev */
+      VG_(emit_amode_offregmem_reg) ( off, regmem, mkGrp1opcode(opc) );
+      VG_(emitB) ( lit & 0x000000FF );
+   } else {
+      VG_(emitB) ( 0x81 ); /* Grp1 Iv,Ev */
+      VG_(emit_amode_offregmem_reg) ( off, regmem, mkGrp1opcode(opc) );
+      if (sz == 2) VG_(emitW) ( lit ); else VG_(emitL) ( lit );
+   }
+   if (dis)
+      VG_(printf)( "\n\t\t%s%c\t$0x%x, 0x%x(%s)\n", 
+                   VG_(name_UOpcode)(False,opc), nameISize(sz), 
+                   lit, off, nameIReg(sz,regmem));
+}
+
+void VG_(emit_shiftopv_lit_reg) ( Bool upd_cc, Int sz, Opcode opc, UInt lit, Int reg )
+{
+   VG_(new_emit)(upd_cc, shiftop_use(opc), shiftop_set(opc));
+
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 0xC1 ); /* Grp2 Ib,Ev */
    VG_(emit_amode_ereg_greg) ( reg, mkGrp2opcode(opc) );
@@ -457,9 +665,9 @@
                    lit, nameIReg(sz,reg));
 }
 
-static void emit_shiftopv_cl_stack0 ( Int sz, Opcode opc )
+static void emit_shiftopv_cl_stack0 ( Bool upd_cc, Int sz, Opcode opc )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, shiftop_use(opc), shiftop_set(opc));
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 0xD3 ); /* Grp2 CL,Ev */
    VG_(emitB) ( mkModRegRM ( 1, mkGrp2opcode(opc), 4 ) );
@@ -470,9 +678,9 @@
                   VG_(name_UOpcode)(False,opc), nameISize(sz) );
 }
 
-static void emit_shiftopb_cl_stack0 ( Opcode opc )
+static void emit_shiftopb_cl_stack0 ( Bool upd_cc, Opcode opc )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, shiftop_use(opc), shiftop_set(opc));
    VG_(emitB) ( 0xD2 ); /* Grp2 CL,Eb */
    VG_(emitB) ( mkModRegRM ( 1, mkGrp2opcode(opc), 4 ) );
    VG_(emitB) ( 0x24 ); /* a SIB, I think `d8(%esp)' */
@@ -482,10 +690,10 @@
                   VG_(name_UOpcode)(False,opc), nameISize(1) );
 }
 
-static void emit_nonshiftopv_offregmem_reg ( Int sz, Opcode opc, 
+static void emit_nonshiftopv_offregmem_reg ( Bool upd_cc, Int sz, Opcode opc, 
                                              Int off, Int areg, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 3 + mkPrimaryOpcode(opc) ); /* op Ev, Gv */
    VG_(emit_amode_offregmem_reg) ( off, areg, reg );
@@ -495,10 +703,23 @@
                    off, nameIReg(4,areg), nameIReg(sz,reg));
 }
 
-void VG_(emit_nonshiftopv_reg_reg) ( Int sz, Opcode opc, 
-                                       Int reg1, Int reg2 )
+static void emit_nonshiftopv_reg_offregmem ( Bool upd_cc, Int sz, Opcode opc, 
+                                             Int off, Int areg, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
+   if (sz == 2) VG_(emitB) ( 0x66 );
+   VG_(emitB) ( 1 + mkPrimaryOpcode(opc) ); /* op Gv, Ev */
+   VG_(emit_amode_offregmem_reg) ( off, areg, reg );
+   if (dis)
+      VG_(printf)( "\n\t\t%s%c\t0x%s, %x(%s),\n", 
+                   VG_(name_UOpcode)(False,opc), nameISize(sz),
+                   nameIReg(sz,reg), off, nameIReg(4,areg));
+}
+
+void VG_(emit_nonshiftopv_reg_reg) ( Bool upd_cc, Int sz, Opcode opc, 
+				     Int reg1, Int reg2 )
+{
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
    if (sz == 2) VG_(emitB) ( 0x66 );
 #  if 0
    /* Perfectly correct, but the GNU assembler uses the other form.
@@ -517,11 +738,12 @@
 
 void VG_(emit_movv_lit_reg) ( Int sz, UInt lit, Int reg )
 {
-   if (lit == 0) {
-      VG_(emit_nonshiftopv_reg_reg) ( sz, XOR, reg, reg );
+   if (lit == 0 && eflags_state != UPD_Real) {
+      /* Only emit this for zeroing if it won't stomp flags */
+      VG_(emit_nonshiftopv_reg_reg) ( False, sz, XOR, reg, reg );
       return;
    }
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) VG_(emitB) ( 0x66 );
    VG_(emitB) ( 0xB8+reg ); /* MOV imm, Gv */
    if (sz == 2) VG_(emitW) ( lit ); else VG_(emitL) ( lit );
@@ -530,12 +752,12 @@
                    nameISize(sz), lit, nameIReg(sz,reg));
 }
 
-void VG_(emit_unaryopv_reg) ( Int sz, Opcode opc, Int reg )
+void VG_(emit_unaryopv_reg) ( Bool upd_cc, Int sz, Opcode opc, Int reg )
 {
-   VG_(new_emit)();
-   if (sz == 2) VG_(emitB) ( 0x66 );
    switch (opc) {
       case NEG:
+	 VG_(new_emit)(upd_cc, FlagsEmpty, FlagsOSZACP);
+	 if (sz == 2) VG_(emitB) ( 0x66 );
          VG_(emitB) ( 0xF7 );
          VG_(emit_amode_ereg_greg) ( reg, mkGrp3opcode(NEG) );
          if (dis)
@@ -543,6 +765,8 @@
                          nameISize(sz), nameIReg(sz,reg));
          break;
       case NOT:
+	 VG_(new_emit)(upd_cc, FlagsEmpty, FlagsEmpty);
+	 if (sz == 2) VG_(emitB) ( 0x66 );
          VG_(emitB) ( 0xF7 );
          VG_(emit_amode_ereg_greg) ( reg, mkGrp3opcode(NOT) );
          if (dis)
@@ -550,12 +774,16 @@
                          nameISize(sz), nameIReg(sz,reg));
          break;
       case DEC:
+	 VG_(new_emit)(upd_cc, FlagsEmpty, FlagsOSZAP);
+	 if (sz == 2) VG_(emitB) ( 0x66 );
          VG_(emitB) ( 0x48 + reg );
          if (dis)
             VG_(printf)( "\n\t\tdec%c\t%s\n", 
                          nameISize(sz), nameIReg(sz,reg));
          break;
       case INC:
+	 VG_(new_emit)(upd_cc, FlagsEmpty, FlagsOSZAP);
+	 if (sz == 2) VG_(emitB) ( 0x66 );
          VG_(emitB) ( 0x40 + reg );
          if (dis)
             VG_(printf)( "\n\t\tinc%c\t%s\n", 
@@ -568,7 +796,7 @@
 
 void VG_(emit_pushv_reg) ( Int sz, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) {
       VG_(emitB) ( 0x66 ); 
    } else {
@@ -581,7 +809,7 @@
 
 void VG_(emit_popv_reg) ( Int sz, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) {
       VG_(emitB) ( 0x66 ); 
    } else {
@@ -594,7 +822,7 @@
 
 void VG_(emit_pushl_lit32) ( UInt int32 )
 {  
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x68 );
    VG_(emitL) ( int32 );
    if (dis)
@@ -604,16 +832,16 @@
 void VG_(emit_pushl_lit8) ( Int lit8 )
 {
    vg_assert(lit8 >= -128 && lit8 < 128);
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x6A );
    VG_(emitB) ( (UChar)((UInt)lit8) );
    if (dis)
       VG_(printf)("\n\t\tpushl $%d\n", lit8 );
 }
 
-void VG_(emit_cmpl_zero_reg) ( Int reg )
+void VG_(emit_cmpl_zero_reg) ( Bool upd_cc, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, False, FlagsOSZACP);
    VG_(emitB) ( 0x83 );
    VG_(emit_amode_ereg_greg) ( reg, 7 /* Grp 3 opcode for CMP */ );
    VG_(emitB) ( 0x00 );
@@ -623,7 +851,7 @@
 
 static void emit_swapl_reg_ECX ( Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x87 ); /* XCHG Gv,Ev */
    VG_(emit_amode_ereg_greg) ( reg, R_ECX );
    if (dis) 
@@ -632,7 +860,7 @@
 
 void VG_(emit_swapl_reg_EAX) ( Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x90 + reg ); /* XCHG Gv,eAX */
    if (dis) 
       VG_(printf)("\n\t\txchgl %%eax, %s\n", nameIReg(4,reg));
@@ -640,7 +868,7 @@
 
 static void emit_swapl_reg_reg ( Int reg1, Int reg2 )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x87 ); /* XCHG Gv,Ev */
    VG_(emit_amode_ereg_greg) ( reg1, reg2 );
    if (dis) 
@@ -650,7 +878,7 @@
 
 static void emit_bswapl_reg ( Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x0F );
    VG_(emitB) ( 0xC8 + reg ); /* BSWAP r32 */
    if (dis) 
@@ -659,7 +887,7 @@
 
 static void emit_movl_reg_reg ( Int regs, Int regd )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x89 ); /* MOV Gv,Ev */
    VG_(emit_amode_ereg_greg) ( regd, regs );
    if (dis) 
@@ -668,7 +896,7 @@
 
 void VG_(emit_movv_lit_offregmem) ( Int sz, UInt lit, Int off, Int memreg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    if (sz == 2) {
       VG_(emitB) ( 0x66 );
    } else {
@@ -691,7 +919,7 @@
    486 insn set.  ToDo: investigate. */
 void VG_(emit_movb_lit_offregmem) ( UInt lit, Int off, Int memreg )
 {                                     
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0xC6 ); /* Grp11 Eb */
    VG_(emit_amode_offregmem_reg) ( off, memreg, 0 /* Grp11 subopcode for MOV */ );
    VG_(emitB) ( lit ); 
@@ -700,10 +928,10 @@
                    lit, off, nameIReg(4,memreg) );
 }              
               
-static void emit_nonshiftopb_offregmem_reg ( Opcode opc, 
+static void emit_nonshiftopb_offregmem_reg ( Bool upd_cc, Opcode opc, 
                                              Int off, Int areg, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, (opc == ADC || opc == SBB) ? FlagC : FlagsEmpty, True);
    VG_(emitB) ( 2 + mkPrimaryOpcode(opc) ); /* op Eb, Gb */
    VG_(emit_amode_offregmem_reg) ( off, areg, reg );
    if (dis)
@@ -712,10 +940,35 @@
                    nameIReg(1,reg));
 }
 
+static void emit_nonshiftopb_lit_offregmem ( Bool upd_cc, Opcode opc, 
+                                             UInt lit, Int off, Int areg )
+{
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
+   VG_(emitB) ( 0x80 );
+   VG_(emit_amode_offregmem_reg) ( off, areg, mkGrp1opcode(opc) );
+   VG_(emitB) ( lit );
+   if (dis)
+      VG_(printf)( "\n\t\t%sb\t$0x%x, 0x%x(%s)\n", 
+                   VG_(name_UOpcode)(False,opc), lit, off, nameIReg(4,areg));
+}
+
+static void emit_nonshiftopb_reg_offregmem ( Bool upd_cc, Opcode opc, 
+                                             Int off, Int areg, Int reg )
+{
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
+   VG_(emitB) ( 0 + mkPrimaryOpcode(opc) ); /* op Gb, Eb */
+   VG_(emit_amode_offregmem_reg) ( off, areg, reg );
+   if (dis)
+      VG_(printf)( "\n\t\t%sb\t0x%s , %x(%s)\n", 
+                   VG_(name_UOpcode)(False,opc), 
+		   nameIReg(1,reg),
+		   off, nameIReg(4,areg));
+}
+
 void VG_(emit_movb_reg_offregmem) ( Int reg, Int off, Int areg )
 {
    /* Could do better when reg == %al. */
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x88 ); /* MOV G1, E1 */
    VG_(emit_amode_offregmem_reg) ( off, areg, reg );
    if (dis)
@@ -723,9 +976,9 @@
                    nameIReg(1,reg), off, nameIReg(4,areg));
 }
 
-static void emit_nonshiftopb_reg_reg ( Opcode opc, Int reg1, Int reg2 )
+static void emit_nonshiftopb_reg_reg ( Bool upd_cc, Opcode opc, Int reg1, Int reg2 )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
    VG_(emitB) ( 2 + mkPrimaryOpcode(opc) ); /* op Eb, Gb */
    VG_(emit_amode_ereg_greg) ( reg1, reg2 );
    if (dis)
@@ -736,7 +989,7 @@
 
 static void emit_movb_reg_regmem ( Int reg1, Int reg2 )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x88 ); /* MOV G1, E1 */
    emit_amode_regmem_reg ( reg2, reg1 );
    if (dis)
@@ -744,9 +997,9 @@
                                              nameIReg(4,reg2));
 }
 
-static void emit_nonshiftopb_lit_reg ( Opcode opc, UInt lit, Int reg )
+static void emit_nonshiftopb_lit_reg ( Bool upd_cc, Opcode opc, UInt lit, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, nonshiftop_use(opc), nonshiftop_set(opc));
    VG_(emitB) ( 0x80 ); /* Grp1 Ib,Eb */
    VG_(emit_amode_ereg_greg) ( reg, mkGrp1opcode(opc) );
    VG_(emitB) ( lit & 0x000000FF );
@@ -755,9 +1008,9 @@
                                              lit, nameIReg(1,reg));
 }
 
-static void emit_shiftopb_lit_reg ( Opcode opc, UInt lit, Int reg )
+static void emit_shiftopb_lit_reg ( Bool upd_cc, Opcode opc, UInt lit, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, shiftop_use(opc), shiftop_set(opc));
    VG_(emitB) ( 0xC0 ); /* Grp2 Ib,Eb */
    VG_(emit_amode_ereg_greg) ( reg, mkGrp2opcode(opc) );
    VG_(emitB) ( lit );
@@ -767,29 +1020,32 @@
                    lit, nameIReg(1,reg));
 }
 
-void VG_(emit_unaryopb_reg) ( Opcode opc, Int reg )
+void VG_(emit_unaryopb_reg) ( Bool upd_cc, Opcode opc, Int reg )
 {
-   VG_(new_emit)();
    switch (opc) {
       case INC:
+	 VG_(new_emit)(upd_cc, FlagsEmpty, FlagsOSZAP);
          VG_(emitB) ( 0xFE );
          VG_(emit_amode_ereg_greg) ( reg, mkGrp4opcode(INC) );
          if (dis)
             VG_(printf)( "\n\t\tincb\t%s\n", nameIReg(1,reg));
          break;
       case DEC:
+	 VG_(new_emit)(upd_cc, FlagsEmpty, FlagsOSZAP);
          VG_(emitB) ( 0xFE );
          VG_(emit_amode_ereg_greg) ( reg, mkGrp4opcode(DEC) );
          if (dis)
             VG_(printf)( "\n\t\tdecb\t%s\n", nameIReg(1,reg));
          break;
       case NOT:
+	 VG_(new_emit)(upd_cc, FlagsEmpty, FlagsEmpty);
          VG_(emitB) ( 0xF6 );
          VG_(emit_amode_ereg_greg) ( reg, mkGrp3opcode(NOT) );
          if (dis)
             VG_(printf)( "\n\t\tnotb\t%s\n", nameIReg(1,reg));
          break;
       case NEG:
+	 VG_(new_emit)(upd_cc, FlagsEmpty, FlagsOSZACP);
          VG_(emitB) ( 0xF6 );
          VG_(emit_amode_ereg_greg) ( reg, mkGrp3opcode(NEG) );
          if (dis)
@@ -800,9 +1056,9 @@
    }
 }
 
-void VG_(emit_testb_lit_reg) ( UInt lit, Int reg )
+void VG_(emit_testb_lit_reg) ( Bool upd_cc, UInt lit, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(upd_cc, FlagsEmpty, FlagsOSZACP);
    VG_(emitB) ( 0xF6 ); /* Grp3 Eb */
    VG_(emit_amode_ereg_greg) ( reg, 0 /* Grp3 subopcode for TEST */ );
    VG_(emitB) ( lit );
@@ -816,7 +1072,7 @@
 
 void VG_(emit_movzbl_offregmem_reg) ( Int off, Int regmem, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x0F ); VG_(emitB) ( 0xB6 ); /* MOVZBL */
    VG_(emit_amode_offregmem_reg) ( off, regmem, reg );
    if (dis)
@@ -826,7 +1082,7 @@
 
 static void emit_movzbl_regmem_reg ( Int reg1, Int reg2 )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x0F ); VG_(emitB) ( 0xB6 ); /* MOVZBL */
    emit_amode_regmem_reg ( reg1, reg2 );
    if (dis)
@@ -836,7 +1092,7 @@
 
 void VG_(emit_movzwl_offregmem_reg) ( Int off, Int areg, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x0F ); VG_(emitB) ( 0xB7 ); /* MOVZWL */
    VG_(emit_amode_offregmem_reg) ( off, areg, reg );
    if (dis)
@@ -846,7 +1102,7 @@
 
 static void emit_movzwl_regmem_reg ( Int reg1, Int reg2 )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x0F ); VG_(emitB) ( 0xB7 ); /* MOVZWL */
    emit_amode_regmem_reg ( reg1, reg2 );
    if (dis)
@@ -861,7 +1117,7 @@
 static void emit_get_fpu_state ( void )
 {
    Int off = 4 * VGOFF_(m_fpustate);
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0xDD ); VG_(emitB) ( 0xA5 ); /* frstor d32(%ebp) */
    VG_(emitL) ( off );
    if (dis)
@@ -871,17 +1127,18 @@
 static void emit_put_fpu_state ( void )
 {
    Int off = 4 * VGOFF_(m_fpustate);
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0xDD ); VG_(emitB) ( 0xB5 ); /* fnsave d32(%ebp) */
    VG_(emitL) ( off );
    if (dis)
       VG_(printf)("\n\t\tfnsave\t%d(%%ebp)\n", off );
 }
 
-static void emit_fpu_no_mem ( UChar first_byte, 
+static void emit_fpu_no_mem ( FlagSet uses_flags, FlagSet sets_flags,
+			      UChar first_byte, 
                               UChar second_byte )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, uses_flags, sets_flags);
    VG_(emitB) ( first_byte );
    VG_(emitB) ( second_byte );
    if (dis)
@@ -889,11 +1146,12 @@
                   (UInt)first_byte, (UInt)second_byte );
 }
 
-static void emit_fpu_regmem ( UChar first_byte, 
+static void emit_fpu_regmem ( FlagSet uses_flags, FlagSet sets_flags,
+			      UChar first_byte, 
                               UChar second_byte_masked, 
                               Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, uses_flags, sets_flags);
    VG_(emitB) ( first_byte );
    emit_amode_regmem_reg ( reg, second_byte_masked >> 3 );
    if (dis)
@@ -909,34 +1167,36 @@
 
 void VG_(emit_call_reg) ( Int reg )
 {           
-   VG_(new_emit)();
+   VG_(new_emit)(False, FlagsEmpty, FlagsOSZACP); /* XXX */
    VG_(emitB) ( 0xFF ); /* Grp5 */
    VG_(emit_amode_ereg_greg) ( reg, mkGrp5opcode(CALLM) );
    if (dis) 
       VG_(printf)( "\n\t\tcall\t*%s\n", nameIReg(4,reg) );
 }              
          
-static void emit_call_star_EBP_off ( Int byte_off )
+static void emit_call_star_EBP_off ( Bool upd_cc, Int byte_off, FlagSet use_flag, FlagSet set_flag )
 {
-  VG_(new_emit)();
-  if (byte_off < -128 || byte_off > 127) {
-     VG_(emitB) ( 0xFF );
-     VG_(emitB) ( 0x95 );
-     VG_(emitL) ( byte_off );
-  } else {
-     VG_(emitB) ( 0xFF );
-     VG_(emitB) ( 0x55 );
-     VG_(emitB) ( byte_off );
-  }
-  if (dis)
-     VG_(printf)( "\n\t\tcall * %d(%%ebp)\n", byte_off );
+   /* Used for helpers which expect to see Simd flags in Real flags */
+   VG_(new_emit)(upd_cc, use_flag, set_flag);
+
+   if (byte_off < -128 || byte_off > 127) {
+      VG_(emitB) ( 0xFF );
+      VG_(emitB) ( 0x95 );
+      VG_(emitL) ( byte_off );
+   } else {
+      VG_(emitB) ( 0xFF );
+      VG_(emitB) ( 0x55 );
+      VG_(emitB) ( byte_off );
+   }
+   if (dis)
+      VG_(printf)( "\n\t\tcall * %d(%%ebp)\n", byte_off );
 }
 
 
 static void emit_addlit8_offregmem ( Int lit8, Int regmem, Int off )
 {
    vg_assert(lit8 >= -128 && lit8 < 128);
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsOSZACP);
    VG_(emitB) ( 0x83 ); /* Grp1 Ib,Ev */
    VG_(emit_amode_offregmem_reg) ( off, regmem, 
                               0 /* Grp1 subopcode for ADD */ );
@@ -950,12 +1210,13 @@
 void VG_(emit_add_lit_to_esp) ( Int lit )
 {
    if (lit < -128 || lit > 127) VG_(core_panic)("VG_(emit_add_lit_to_esp)");
-   VG_(new_emit)();
-   VG_(emitB) ( 0x83 );
-   VG_(emitB) ( 0xC4 );
+   VG_(new_emit)(False, FlagsEmpty, FlagsEmpty);
+   VG_(emitB) ( 0x8D );
+   VG_(emitB) ( 0x64 );
+   VG_(emitB) ( 0x24 );
    VG_(emitB) ( lit & 0xFF );
    if (dis)
-      VG_(printf)( "\n\t\taddl $%d, %%esp\n", lit );
+      VG_(printf)( "\n\t\tlea\t%d(%%esp), %%esp\n", lit );
 }
 
 
@@ -963,7 +1224,7 @@
 {
    /* movb %al, 0(%esp) */
    /* 88442400              movb    %al, 0(%esp) */
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x88 );
    VG_(emitB) ( 0x44 );
    VG_(emitB) ( 0x24 );
@@ -976,7 +1237,7 @@
 {
    /* movb 0(%esp), %al */
    /* 8A442400              movb    0(%esp), %al */
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x8A );
    VG_(emitB) ( 0x44 );
    VG_(emitB) ( 0x24 );
@@ -989,10 +1250,10 @@
 /* Emit a jump short with an 8-bit signed offset.  Note that the
    offset is that which should be added to %eip once %eip has been
    advanced over this insn.  */
-void VG_(emit_jcondshort_delta) ( Condcode cond, Int delta )
+void VG_(emit_jcondshort_delta) ( Bool simd, Condcode cond, Int delta )
 {
    vg_assert(delta >= -128 && delta <= 127);
-   VG_(new_emit)();
+   VG_(new_emit)(simd, FlagsOSZCP, False);
    VG_(emitB) ( 0x70 + (UInt)cond );
    VG_(emitB) ( (UChar)delta );
    if (dis)
@@ -1000,35 +1261,9 @@
                    VG_(nameCondcode)(cond), delta );
 }
 
-static void emit_get_eflags ( void )
-{
-   Int off = 4 * VGOFF_(m_eflags);
-   vg_assert(off >= 0 && off < 128);
-   VG_(new_emit)();
-   VG_(emitB) ( 0xFF ); /* PUSHL off(%ebp) */
-   VG_(emitB) ( 0x75 );
-   VG_(emitB) ( off );
-   VG_(emitB) ( 0x9D ); /* POPFL */
-   if (dis)
-      VG_(printf)( "\n\t\tpushl %d(%%ebp) ; popfl\n", off );
-}
-
-static void emit_put_eflags ( void )
-{
-   Int off = 4 * VGOFF_(m_eflags);
-   vg_assert(off >= 0 && off < 128);
-   VG_(new_emit)();
-   VG_(emitB) ( 0x9C ); /* PUSHFL */
-   VG_(emitB) ( 0x8F ); /* POPL vg_m_state.m_eflags */
-   VG_(emitB) ( 0x45 );
-   VG_(emitB) ( off );
-   if (dis)
-      VG_(printf)( "\n\t\tpushfl ; popl %d(%%ebp)\n", off );
-}
-
 static void emit_setb_reg ( Int reg, Condcode cond )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsOSZCP, False);
    VG_(emitB) ( 0x0F ); VG_(emitB) ( 0x90 + (UChar)cond );
    VG_(emit_amode_ereg_greg) ( reg, 0 );
    if (dis)
@@ -1038,7 +1273,8 @@
 
 static void emit_ret ( void )
 {
-   VG_(new_emit)();
+   maybe_emit_put_eflags(); /* make sure flags are stored */
+   VG_(new_emit)(False, False, False);
    VG_(emitB) ( 0xC3 ); /* RET */
    if (dis)
       VG_(printf)("\n\t\tret\n");
@@ -1130,7 +1366,8 @@
 {
    vg_assert(VG_PATCHME_CALLSZ == 5);
 
-   VG_(new_emit)();
+   maybe_emit_put_eflags();		/* save flags before end of BB */
+   VG_(new_emit)(False, False, False);
 
    if (jumpidx >= VG_MAX_JUMPS) {
       /* If there too many jumps in this basic block, fall back to
@@ -1163,7 +1400,7 @@
 
 void VG_(emit_pushal) ( void )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x60 ); /* PUSHAL */
    if (dis)
       VG_(printf)("\n\t\tpushal\n");
@@ -1171,7 +1408,7 @@
 
 void VG_(emit_popal) ( void )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x61 ); /* POPAL */
    if (dis)
       VG_(printf)("\n\t\tpopal\n");
@@ -1179,7 +1416,7 @@
 
 static void emit_lea_litreg_reg ( UInt lit, Int regmem, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x8D ); /* LEA M,Gv */
    VG_(emit_amode_offregmem_reg) ( (Int)lit, regmem, reg );
    if (dis)
@@ -1190,7 +1427,7 @@
 static void emit_lea_sib_reg ( UInt lit, Int scale,
 			       Int regbase, Int regindex, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x8D ); /* LEA M,Gv */
    emit_amode_sib_reg ( (Int)lit, scale, regbase, regindex, reg );
    if (dis)
@@ -1202,7 +1439,7 @@
 
 void VG_(emit_AMD_prefetch_reg) ( Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(True, FlagsEmpty, FlagsEmpty);
    VG_(emitB) ( 0x0F );
    VG_(emitB) ( 0x0D );
    emit_amode_regmem_reg ( reg, 1 /* 0 is prefetch; 1 is prefetchw */ );
@@ -1257,13 +1494,15 @@
 /* Synthesise a call to *baseBlock[offset], ie,
    call * (4 x offset)(%ebp).
 */
-void VG_(synth_call) ( Bool ensure_shortform, Int word_offset )
+void VG_(synth_call) ( Bool ensure_shortform, Int word_offset, 
+		       Bool upd_cc, FlagSet use_flags, FlagSet set_flags )
 {
    vg_assert(word_offset >= 0);
    vg_assert(word_offset < VG_BASEBLOCK_WORDS);
-   if (ensure_shortform)
+   if (ensure_shortform) {
       vg_assert(word_offset < 32);
-   emit_call_star_EBP_off ( 4 * word_offset );
+   }
+   emit_call_star_EBP_off ( upd_cc, 4 * word_offset, use_flags, set_flags );
 }
 
 static void maybe_emit_movl_reg_reg ( UInt src, UInt dst )
@@ -1475,8 +1714,8 @@
       VG_(core_panic)("VG_(synth_call): regparms_n value not in range 0..3");
    }
    
-   /* Call the function */
-   VG_(synth_call) ( False, VG_(helper_offset) ( fn ) );
+   /* Call the function - may trash all flags */
+   VG_(synth_call) ( False, VG_(helper_offset) ( fn ), False, FlagsEmpty, FlagsOSZACP );
 
    /* Clear any args from stack */
    if (0 != stack_used) {
@@ -1525,6 +1764,7 @@
 */
 static void synth_jmp_reg ( Int reg, JmpKind jmpkind )
 {
+   maybe_emit_put_eflags();	/* save flags here */
    load_ebp_from_JmpKind ( jmpkind );
    if (reg != R_EAX)
       VG_(emit_movv_reg_reg) ( 4, reg, R_EAX );
@@ -1536,6 +1776,8 @@
 /* Same deal as synth_jmp_reg. */
 static void synth_jmp_lit ( Addr addr, JmpKind jmpkind )
 {
+   maybe_emit_put_eflags();	/* save flags here */
+
    VG_(emit_movv_lit_reg) ( 4, addr, R_EAX );
 
    if (VG_(clo_chain_bb) && (jmpkind == JmpBoring || jmpkind == JmpCall)) {
@@ -1553,76 +1795,169 @@
                                        Opcode opcode, Int size, 
                                        UInt lit, Int reg );
 
-static void synth_jcond_lit ( Condcode cond, Addr addr )
+static void synth_jcond_lit ( Condcode cond, 
+                              Addr addr,
+                              Bool eax_trashable )
 {
    UInt mask;
-   Int delta;
+   Int  delta;
+   Bool simd;
 
-  /* Do the following:
-        get eflags
-        jmp short if not cond to xyxyxy
-        addr -> eax
-        ret
-        xyxyxy
-
-   2 0000 750C                  jnz     xyxyxy
-   3 0002 B877665544            movl    $0x44556677, %eax
-   4 0007 C3                    ret
-   5 0008 FFE3                  jmp     *%ebx
-   6                    xyxyxy:
-  */
    if (VG_(clo_chain_bb)) {
       /* When using BB chaining, the jump sequence is:
-        jmp short if not cond to xyxyxy
-        addr -> eax
-	call VG_(patch_me)/jmp target
-        xyxyxy
-	 
-		je     1f
-		mov    $0x4000d190,%eax			// 5
-		mov    %eax, VGOFF_(m_eip)(%ebp)	// 3
-		call   0x40050f9a <vgPlain_patch_me>	// 5
-	1:	mov    $0x4000d042,%eax
-		call   0x40050f9a <vgPlain_patch_me>
+         ensure that simulated eflags are up-to-date
+         jmp short if not cond to xyxyxy, using the real
+            machine eflags if we can, synthesising a suitable sequence 
+            to examine the simulated ones otherwise
+         addr -> eax
+         call VG_(patch_me)/jmp target
+         xyxyxy
+         
+                 <possibly sequence to compute some condition>
+                 j<cond>    xyxyxy
+                 mov    $0x4000d190,%eax                     // 5
+                 mov    %eax, VGOFF_(m_eip)(%ebp)            // 3
+                 call   0x40050f9a <vgPlain_patch_me>        // 5
+         xyxyxy: mov    $0x4000d042,%eax
+                 call   0x40050f9a <vgPlain_patch_me>
       */
       delta = 5+3+5;
-   } else
+   } else {
+      /* When not using BB chaining:
+         ensure that simulated eflags are up-to-date
+         jmp short if not cond to xyxyxy, using the real
+            machine eflags if we can, synthesising a suitable sequence 
+            to examine the simulated ones otherwise
+         addr -> eax
+         ret
+         xyxyxy
+
+                 <possibly sequence to compute some condition>
+                 j<cond>     xyxyxy
+                 movl    $0x44556677, %eax    // 5
+                 ret                          // 1
+         xyxyxy:
+      */
       delta = 5+1;
+   }
    
-   if (!VG_(clo_fast_jcc)) {
-      /* We're forced to do it the slow way. */
-      emit_get_eflags();
+   /* Ensure simulated %EFLAGS are up-to-date, by copying back %eflags
+      if need be */
+   maybe_emit_put_eflags();
+   vg_assert(eflags_state == UPD_Both || eflags_state == UPD_Simd);
+
+   if (eflags_state == UPD_Both) {
+      /* The flags are already set up, so we just use them as is. */
+      simd = True;
       cond = invertCondition(cond);
    } else {
-      switch (cond & ~1) {
-         case CondB:  mask = EFlagC;          goto common;  /* C=1        */
-         case CondZ:  mask = EFlagZ;          goto common;  /* Z=1        */
-         case CondBE: mask = EFlagC | EFlagZ; goto common;  /* C=1 || Z=1 */
-         case CondS:  mask = EFlagS;          goto common;  /* S=1        */
-         case CondP:  mask = EFlagP;          goto common;  /* P=1        */
-         default:
-            /* Too complex .. we have to do it the slow way. */
-            emit_get_eflags();
-            cond = invertCondition(cond);
+
+      /* The simd state contains the most recent version, so we emit a
+         sequence to calculate the relevant condition directly out of
+         the simd flags.  This is much cheaper (on P3/P4/Athlon) than
+         copying them back to the real flags via popf.  Notice that
+         some of these sequences trash %eax, but that should be free
+         now since this is the end of a bb and therefore all regs are
+         dead. */
+      simd = False;
+
+      switch (cond) {
+
+         case CondLE: 
+         case CondNLE:
+            vg_assert(eax_trashable);
+
+            VG_(emit_movv_offregmem_reg)
+               ( 4, VGOFF_(m_eflags) * 4, R_EBP, R_EAX );
+            /* eax == %EFLAGS */
+
+            VG_(emit_shiftopv_lit_reg)( False, 4, SHR, 11-7, R_EAX );
+            /* eax has OF in SF's place */
+
+            emit_nonshiftopv_offregmem_reg 
+               ( False, 4, XOR, VGOFF_(m_eflags) * 4, R_EBP, R_EAX );
+            /* eax has (OF xor SF) in SF's place */
+
+            VG_(emit_shiftopv_lit_reg)( False, 4, SHR, 7-6, R_EAX );
+            /* eax has (OF xor SF) in ZF's place */
+
+            emit_nonshiftopv_offregmem_reg 
+               ( False, 4, OR, VGOFF_(m_eflags) * 4, R_EBP, R_EAX );
+            /* eax has ((OF xor SF) or ZF) in SF's place */
+        
+            VG_(emit_nonshiftopv_lit_reg)( False, 4, AND, 1 << 6, R_EAX );
+            /* Z is now set iff ((OF xor SF) or ZF) == 1 */
+
+            if (cond == CondLE) cond = CondZ; else cond = CondNZ;
             break;
 
-        common:
-            VG_(new_emit)();
+         case CondL: 
+         case CondNL:
+            vg_assert(eax_trashable);
+
+            VG_(emit_movv_offregmem_reg)
+               ( 4, VGOFF_(m_eflags) * 4, R_EBP, R_EAX );
+            /* eax == %EFLAGS */
+
+            VG_(emit_shiftopv_lit_reg)( False, 4, SHR, 11-7, R_EAX );
+            /* eax has OF in SF's place */
+
+            emit_nonshiftopv_offregmem_reg 
+               ( False, 4, XOR, VGOFF_(m_eflags) * 4, R_EBP, R_EAX );
+            /* eax has (OF xor SF) in SF's place */
+
+            VG_(emit_nonshiftopv_lit_reg)( False, 4, AND, 1 << 7, R_EAX );
+            /* Z is now set iff (OF xor SF) == 1 */
+
+            if (cond == CondL) cond = CondZ; else cond = CondNZ;
+            break;
+
+         case CondB: 
+         case CondNB: 
+            mask = EFlagC; goto simple; /* C=1        */
+
+         case CondZ: 
+         case CondNZ: 
+            mask = EFlagZ; goto simple; /* Z=1        */
+
+         case CondBE: 
+         case CondNBE: 
+            mask = EFlagC | EFlagZ; goto simple; /* C=1 || Z=1 */
+
+         case CondS:  
+         case CondNS:
+            mask = EFlagS; goto simple; /* S=1        */
+
+         case CondP:
+         //case CondNP: 
+            mask = EFlagP; goto simple; /* P=1        */
+
+         default: 
+            VG_(printf)("synth_jcond_lit: unhandled simd case %d (%s)\n", 
+                        (Int)cond, VG_(nameCondcode)(cond) );
+            VG_(core_panic)("synth_jcond_lit: unhandled simd case");
+
+          simple:
+            VG_(new_emit)(False, False, FlagsOSZACP);
             if ((mask & 0xff) == mask) {
                VG_(emitB) ( 0xF6 );   /* Grp3 */
                VG_(emit_amode_offregmem_reg)(
                   VGOFF_(m_eflags) * 4, R_EBP, 0 /* subcode for TEST */);
                VG_(emitB) (mask);
                if (dis)
-                  VG_(printf)("\n\t\ttestb $%x, %d(%%ebp)\n", 
+                  VG_(printf)("\n\t\ttestb $0x%x, %d(%%ebp)\n", 
                               mask, VGOFF_(m_eflags) * 4);
             } else {
+               /* all cond codes are in lower 16 bits */
+               vg_assert((mask & 0xffff) == mask);
+
+               VG_(emitB) ( 0x66 );
                VG_(emitB) ( 0xF7 );
                VG_(emit_amode_offregmem_reg)(
                   VGOFF_(m_eflags) * 4, R_EBP, 0 /* subcode for TEST */);
-               VG_(emitB) (mask);
+               VG_(emitW) (mask);
                if (dis)
-                  VG_(printf)("\n\t\ttestx $%x, %d(%%ebp)\n", 
+                  VG_(printf)("\n\t\ttestl $0x%x, %d(%%ebp)\n", 
                               mask, VGOFF_(m_eflags) * 4);
             }
 
@@ -1634,7 +1969,7 @@
       }
    }
 
-   VG_(emit_jcondshort_delta) ( cond, delta );
+   VG_(emit_jcondshort_delta) ( simd, cond, delta );
    synth_jmp_lit ( addr, JmpBoring );
 }
 
@@ -1648,11 +1983,11 @@
       000a C3                    ret
       next:
    */
-   VG_(emit_cmpl_zero_reg) ( reg );
+   VG_(emit_cmpl_zero_reg) ( False, reg );
    if (VG_(clo_chain_bb))
-      VG_(emit_jcondshort_delta) ( CondNZ, 5+3+5 );
+      VG_(emit_jcondshort_delta) ( False, CondNZ, 5+3+5 );
    else
-      VG_(emit_jcondshort_delta) ( CondNZ, 5+1 );
+      VG_(emit_jcondshort_delta) ( False, CondNZ, 5+1 );
    synth_jmp_lit ( addr, JmpBoring );
 }
 
@@ -1735,23 +2070,15 @@
 {
    /* NB! opcode is a uinstr opcode, not an x86 one! */
    switch (size) {
-      case 4: if (upd_cc) emit_get_eflags();
-              VG_(emit_unaryopv_reg) ( 4, opcode, reg );
-              if (upd_cc) emit_put_eflags();
+      case 4: VG_(emit_unaryopv_reg) ( upd_cc, 4, opcode, reg );
               break;
-      case 2: if (upd_cc) emit_get_eflags();
-              VG_(emit_unaryopv_reg) ( 2, opcode, reg );
-              if (upd_cc) emit_put_eflags();
+      case 2: VG_(emit_unaryopv_reg) ( upd_cc, 2, opcode, reg );
               break;
       case 1: if (reg < 4) {
-                 if (upd_cc) emit_get_eflags();
-                 VG_(emit_unaryopb_reg) ( opcode, reg );
-                 if (upd_cc) emit_put_eflags();
+                 VG_(emit_unaryopb_reg) ( upd_cc, opcode, reg );
               } else {
                  VG_(emit_swapl_reg_EAX) ( reg );
-                 if (upd_cc) emit_get_eflags();
-                 VG_(emit_unaryopb_reg) ( opcode, R_AL );
-                 if (upd_cc) emit_put_eflags();
+                 VG_(emit_unaryopb_reg) ( upd_cc, opcode, R_AL );
                  VG_(emit_swapl_reg_EAX) ( reg );
               }
               break;
@@ -1767,13 +2094,9 @@
 {
    /* NB! opcode is a uinstr opcode, not an x86 one! */
    switch (size) {
-      case 4: if (upd_cc) emit_get_eflags();
-              VG_(emit_nonshiftopv_reg_reg) ( 4, opcode, reg1, reg2 );
-              if (upd_cc) emit_put_eflags();
+      case 4: VG_(emit_nonshiftopv_reg_reg) ( upd_cc, 4, opcode, reg1, reg2 );
               break;
-      case 2: if (upd_cc) emit_get_eflags();
-              VG_(emit_nonshiftopv_reg_reg) ( 2, opcode, reg1, reg2 );
-              if (upd_cc) emit_put_eflags();
+      case 2: VG_(emit_nonshiftopv_reg_reg) ( upd_cc, 2, opcode, reg1, reg2 );
               break;
       case 1: { /* Horrible ... */
          Int s1, s2;
@@ -1782,44 +2105,34 @@
             sure s1 != s2 and that neither of them equal either reg1 or
             reg2. Then use them as temporaries to make things work. */
          if (reg1 < 4 && reg2 < 4) {
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, reg1, reg2); 
-            if (upd_cc) emit_put_eflags();
+            emit_nonshiftopb_reg_reg(upd_cc, opcode, reg1, reg2); 
             break;
          }
          for (s1 = 0; s1 == reg1 || s1 == reg2; s1++) ;
          if (reg1 >= 4 && reg2 < 4) {
             emit_swapl_reg_reg ( reg1, s1 );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, s1, reg2);
-            if (upd_cc) emit_put_eflags();
+            emit_nonshiftopb_reg_reg(upd_cc, opcode, s1, reg2);
             emit_swapl_reg_reg ( reg1, s1 );
             break;
          }
          for (s2 = 0; s2 == reg1 || s2 == reg2 || s2 == s1; s2++) ;
          if (reg1 < 4 && reg2 >= 4) {
             emit_swapl_reg_reg ( reg2, s2 );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, reg1, s2);
-            if (upd_cc) emit_put_eflags();
+            emit_nonshiftopb_reg_reg(upd_cc, opcode, reg1, s2);
             emit_swapl_reg_reg ( reg2, s2 );
             break;
          }
          if (reg1 >= 4 && reg2 >= 4 && reg1 != reg2) {
             emit_swapl_reg_reg ( reg1, s1 );
             emit_swapl_reg_reg ( reg2, s2 );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, s1, s2);
-            if (upd_cc) emit_put_eflags();
+            emit_nonshiftopb_reg_reg(upd_cc, opcode, s1, s2);
             emit_swapl_reg_reg ( reg1, s1 );
             emit_swapl_reg_reg ( reg2, s2 );
             break;
          }
          if (reg1 >= 4 && reg2 >= 4 && reg1 == reg2) {
             emit_swapl_reg_reg ( reg1, s1 );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, s1, s1);
-            if (upd_cc) emit_put_eflags();
+            emit_nonshiftopb_reg_reg(upd_cc, opcode, s1, s1);
             emit_swapl_reg_reg ( reg1, s1 );
             break;
          }
@@ -1830,6 +2143,32 @@
 }
 
 
+static void synth_nonshiftop_reg_offregmem (
+   Bool upd_cc,
+   Opcode opcode, Int size, 
+   Int off, Int areg, Int reg )
+{
+   switch (size) {
+      case 4: 
+         emit_nonshiftopv_reg_offregmem ( upd_cc, 4, opcode, off, areg, reg ); 
+         break;
+      case 2: 
+         emit_nonshiftopv_reg_offregmem ( upd_cc, 2, opcode, off, areg, reg ); 
+         break;
+      case 1: 
+         if (reg < 4) {
+            emit_nonshiftopb_reg_offregmem ( upd_cc, opcode, off, areg, reg );
+         } else {
+            VG_(emit_swapl_reg_EAX) ( reg );
+            emit_nonshiftopb_reg_offregmem ( upd_cc, opcode, off, areg, R_AL );
+            VG_(emit_swapl_reg_EAX) ( reg );
+         }
+         break;
+      default: 
+         VG_(core_panic)("synth_nonshiftop_reg_offregmem");
+   }
+}
+
 static void synth_nonshiftop_offregmem_reg ( 
    Bool upd_cc,
    Opcode opcode, Int size, 
@@ -1837,25 +2176,17 @@
 {
    switch (size) {
       case 4: 
-         if (upd_cc) emit_get_eflags();
-         emit_nonshiftopv_offregmem_reg ( 4, opcode, off, areg, reg ); 
-         if (upd_cc) emit_put_eflags();
+         emit_nonshiftopv_offregmem_reg ( upd_cc, 4, opcode, off, areg, reg ); 
          break;
       case 2: 
-         if (upd_cc) emit_get_eflags();
-         emit_nonshiftopv_offregmem_reg ( 2, opcode, off, areg, reg ); 
-         if (upd_cc) emit_put_eflags();
+         emit_nonshiftopv_offregmem_reg ( upd_cc, 2, opcode, off, areg, reg ); 
          break;
       case 1: 
          if (reg < 4) {
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_offregmem_reg ( opcode, off, areg, reg );
-            if (upd_cc) emit_put_eflags();
+            emit_nonshiftopb_offregmem_reg ( upd_cc, opcode, off, areg, reg );
          } else {
             VG_(emit_swapl_reg_EAX) ( reg );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_offregmem_reg ( opcode, off, areg, R_AL );
-            if (upd_cc) emit_put_eflags();
+            emit_nonshiftopb_offregmem_reg ( upd_cc, opcode, off, areg, R_AL );
             VG_(emit_swapl_reg_EAX) ( reg );
          }
          break;
@@ -1870,23 +2201,15 @@
                                        UInt lit, Int reg )
 {
    switch (size) {
-      case 4: if (upd_cc) emit_get_eflags();
-              VG_(emit_nonshiftopv_lit_reg) ( 4, opcode, lit, reg );
-              if (upd_cc) emit_put_eflags();
+      case 4: VG_(emit_nonshiftopv_lit_reg) ( upd_cc, 4, opcode, lit, reg );
               break;
-      case 2: if (upd_cc) emit_get_eflags();
-              VG_(emit_nonshiftopv_lit_reg) ( 2, opcode, lit, reg );
-              if (upd_cc) emit_put_eflags();
+      case 2: VG_(emit_nonshiftopv_lit_reg) ( upd_cc, 2, opcode, lit, reg );
               break;
       case 1: if (reg < 4) {
-                 if (upd_cc) emit_get_eflags();
-                 emit_nonshiftopb_lit_reg ( opcode, lit, reg );
-                 if (upd_cc) emit_put_eflags();
+                 emit_nonshiftopb_lit_reg ( upd_cc, opcode, lit, reg );
               } else {
                  VG_(emit_swapl_reg_EAX) ( reg );
-                 if (upd_cc) emit_get_eflags();
-                 emit_nonshiftopb_lit_reg ( opcode, lit, R_AL );
-                 if (upd_cc) emit_put_eflags();
+                 emit_nonshiftopb_lit_reg ( upd_cc, opcode, lit, R_AL );
                  VG_(emit_swapl_reg_EAX) ( reg );
               }
               break;
@@ -1894,6 +2217,21 @@
    }
 }
 
+static void synth_nonshiftop_lit_offregmem ( Bool upd_cc,
+					     Opcode opcode, Int size, 
+					     UInt lit, Int off, Int regmem )
+{
+   switch (size) {
+      case 4: VG_(emit_nonshiftopv_lit_offregmem) ( upd_cc, 4, opcode, lit, off, regmem );
+              break;
+      case 2: VG_(emit_nonshiftopv_lit_offregmem) ( upd_cc, 2, opcode, lit, off, regmem );
+              break;
+      case 1: emit_nonshiftopb_lit_offregmem ( upd_cc, opcode, lit, off, regmem );
+              break;
+      default: VG_(core_panic)("synth_nonshiftop_lit_offregmem");
+   }
+}
+
 
 static void synth_push_reg ( Int size, Int reg )
 {
@@ -1947,14 +2285,12 @@
 {
    synth_push_reg ( size, regd );
    if (regs != R_ECX) emit_swapl_reg_ECX ( regs );
-   if (upd_cc) emit_get_eflags();
    switch (size) {
-      case 4: emit_shiftopv_cl_stack0 ( 4, opcode ); break;
-      case 2: emit_shiftopv_cl_stack0 ( 2, opcode ); break;
-      case 1: emit_shiftopb_cl_stack0 ( opcode ); break;
+      case 4: emit_shiftopv_cl_stack0 ( upd_cc, 4, opcode ); break;
+      case 2: emit_shiftopv_cl_stack0 ( upd_cc, 2, opcode ); break;
+      case 1: emit_shiftopb_cl_stack0 ( upd_cc, opcode ); break;
       default: VG_(core_panic)("synth_shiftop_reg_reg");
    }
-   if (upd_cc) emit_put_eflags();
    if (regs != R_ECX) emit_swapl_reg_ECX ( regs );
    synth_pop_reg ( size, regd );
 }
@@ -1965,23 +2301,15 @@
                                     UInt lit, Int reg )
 {
    switch (size) {
-      case 4: if (upd_cc) emit_get_eflags();
-              VG_(emit_shiftopv_lit_reg) ( 4, opcode, lit, reg );
-              if (upd_cc) emit_put_eflags();
+      case 4: VG_(emit_shiftopv_lit_reg) ( upd_cc, 4, opcode, lit, reg );
               break;
-      case 2: if (upd_cc) emit_get_eflags();
-              VG_(emit_shiftopv_lit_reg) ( 2, opcode, lit, reg );
-              if (upd_cc) emit_put_eflags();
+      case 2: VG_(emit_shiftopv_lit_reg) ( upd_cc, 2, opcode, lit, reg );
               break;
       case 1: if (reg < 4) {
-                 if (upd_cc) emit_get_eflags();
-                 emit_shiftopb_lit_reg ( opcode, lit, reg );
-                 if (upd_cc) emit_put_eflags();
+                 emit_shiftopb_lit_reg ( upd_cc, opcode, lit, reg );
               } else {
                  VG_(emit_swapl_reg_EAX) ( reg );
-                 if (upd_cc) emit_get_eflags();
-                 emit_shiftopb_lit_reg ( opcode, lit, R_AL );
-                 if (upd_cc) emit_put_eflags();
+                 emit_shiftopb_lit_reg ( upd_cc, opcode, lit, R_AL );
                  VG_(emit_swapl_reg_EAX) ( reg );
               }
               break;
@@ -1992,7 +2320,6 @@
 
 static void synth_setb_reg ( Int reg, Condcode cond )
 {
-   emit_get_eflags();
    if (reg < 4) {
       emit_setb_reg ( reg, cond );
    } else {
@@ -2003,18 +2330,20 @@
 }
 
 
-static void synth_fpu_regmem ( UChar first_byte,
+static void synth_fpu_regmem ( Bool uses_flags, Bool sets_flags,
+			       UChar first_byte,
                                UChar second_byte_masked, 
                                Int reg )
 {
-   emit_fpu_regmem ( first_byte, second_byte_masked, reg );
+   emit_fpu_regmem ( uses_flags, sets_flags, first_byte, second_byte_masked, reg );
 }
 
 
-static void synth_fpu_no_mem ( UChar first_byte,
+static void synth_fpu_no_mem ( Bool uses_flags, Bool sets_flags,
+			       UChar first_byte,
                                UChar second_byte )
 {
-   emit_fpu_no_mem ( first_byte, second_byte );
+   emit_fpu_no_mem ( uses_flags, sets_flags, first_byte, second_byte );
 }
 
 
@@ -2025,8 +2354,7 @@
 
 static void synth_cmovl_reg_reg ( Condcode cond, Int src, Int dst )
 {
-   emit_get_eflags();
-   VG_(emit_jcondshort_delta) ( invertCondition(cond), 
+   VG_(emit_jcondshort_delta) ( True, invertCondition(cond), 
                            2 /* length of the next insn */ );
    emit_movl_reg_reg ( src, dst );
 }
@@ -2114,16 +2442,16 @@
 static void synth_WIDEN_signed ( Int sz_src, Int sz_dst, Int reg )
 {
    if (sz_src == 1 && sz_dst == 4) {
-      VG_(emit_shiftopv_lit_reg) ( 4, SHL, 24, reg );
-      VG_(emit_shiftopv_lit_reg) ( 4, SAR, 24, reg );
+      VG_(emit_shiftopv_lit_reg) ( False, 4, SHL, 24, reg );
+      VG_(emit_shiftopv_lit_reg) ( False, 4, SAR, 24, reg );
    }
    else if (sz_src == 2 && sz_dst == 4) {
-      VG_(emit_shiftopv_lit_reg) ( 4, SHL, 16, reg );
-      VG_(emit_shiftopv_lit_reg) ( 4, SAR, 16, reg );
+      VG_(emit_shiftopv_lit_reg) ( False, 4, SHL, 16, reg );
+      VG_(emit_shiftopv_lit_reg) ( False, 4, SAR, 16, reg );
    }
    else if (sz_src == 1 && sz_dst == 2) {
-      VG_(emit_shiftopv_lit_reg) ( 2, SHL, 8, reg );
-      VG_(emit_shiftopv_lit_reg) ( 2, SAR, 8, reg );
+      VG_(emit_shiftopv_lit_reg) ( False, 2, SHL, 8, reg );
+      VG_(emit_shiftopv_lit_reg) ( False, 2, SAR, 8, reg );
    }
    else
       VG_(core_panic)("synth_WIDEN");
@@ -2152,10 +2480,21 @@
    return (u->flags_w != FlagsEmpty); 
 }
 
+static __inline__
+Bool readFlagUse ( UInstr* u )
+{
+   /* If the UInstr writes some flags but not all, then we still need
+      to consider it as reading flags so that the unchanged values are
+      passed through properly. (D is special) */
+   return
+      (u->flags_r != FlagsEmpty) || 
+      (u->flags_w != FlagsEmpty && u->flags_w != FlagsOSZACP) ; 
+}
+
 static __inline__ 
 Bool anyFlagUse ( UInstr* u )
 {
-   return (u->flags_r != FlagsEmpty || u->flags_w != FlagsEmpty);
+   return readFlagUse(u) || writeFlagUse(u);
 }
 
 
@@ -2310,24 +2649,64 @@
       case GETF: {
          vg_assert(u->size == 2 || u->size == 4);
          vg_assert(u->tag1 == RealReg);
-         synth_mov_offregmem_reg ( 
-            u->size, 
-            eflagsOffset(),
-            R_EBP,
-            u->val1
-         );
+	 
+	 /* This complexity is because the D(irection) flag is stored
+	    separately from the rest of EFLAGS.  */
+
+	 /* We're only fetching from the Simd state, so make sure it's
+	    up to date. */
+	 maybe_emit_put_eflags();
+
+	 /* get D in u->val1 (== 1 or -1) */
+         synth_mov_offregmem_reg (u->size, 4*VGOFF_(m_dflag), R_EBP, u->val1);
+
+	 /* u->val1 &= EFlagD (== 0 or EFlagD) */
+	 synth_nonshiftop_lit_reg(False, AND, u->size, EFlagD, u->val1);
+
+	 /* EFLAGS &= ~EFlagD (make sure there's no surprises) */
+	 synth_nonshiftop_lit_offregmem(False, AND, u->size, ~EFlagD,
+					eflagsOffset(), R_EBP);
+
+	 /* EFLAGS &= ~EFlagD (make sure there's no surprises) */
+	 synth_nonshiftop_lit_offregmem(False, AND, u->size, ~EFlagD,
+					eflagsOffset(), R_EBP);
+
+	 /* u->val1 |= EFLAGS (EFLAGS & EflagD == 0) */
+	 synth_nonshiftop_offregmem_reg(False, OR, u->size, 
+					eflagsOffset(), R_EBP, u->val1);
          break;
       }
             
       case PUTF: {
          vg_assert(u->size == 2 || u->size == 4);
          vg_assert(u->tag1 == RealReg);
-         synth_mov_reg_offregmem ( 
-            u->size, 
-            u->val1,
-            eflagsOffset(),
-            R_EBP
-         );
+
+	 /* When putting a value into EFLAGS, this generates the
+	    correct value for m_dflag (-1 or 1), and clears the D bit
+	    in EFLAGS. */
+
+	 /* We're updating the whole flag state, so the old state
+	    doesn't matter; make sure that the new simulated state
+	    will be fetched when needed. */
+	 eflags_state = UPD_Simd;
+
+	 /* store EFLAGS (with D) */
+         synth_mov_reg_offregmem (u->size, u->val1, eflagsOffset(), R_EBP);
+	 
+	 /* u->val1 &= EFlagD */
+	 synth_nonshiftop_lit_reg(False, AND, u->size, EFlagD, u->val1);
+
+	 /* computes: u->val1 = (u->val1 == 0) ? 1 : -1 */
+	 synth_unaryop_reg(False, NEG, u->size, u->val1);
+	 synth_nonshiftop_reg_reg(False, SBB, u->size, u->val1, u->val1);
+	 synth_nonshiftop_lit_reg(False, SBB, u->size, -1, u->val1);
+
+	 /* save D */
+	 synth_mov_reg_offregmem(u->size, u->val1, 4*VGOFF_(m_dflag), R_EBP);
+	 
+	 /* EFLAGS &= ~EFlagD */
+	 synth_nonshiftop_lit_offregmem(False, AND, u->size, ~EFlagD,
+					eflagsOffset(), R_EBP);
          break;
       }
             
@@ -2471,7 +2850,13 @@
                   break;
                case Literal:
                   vg_assert(u->jmpkind == JmpBoring);
-                  synth_jcond_lit ( u->cond, u->lit32 );
+                  /* %eax had better not be live since synth_jcond_lit
+                     trashes it in some circumstances.  If that turns
+                     out to be a problem we can get synth_jcond_lit to
+                     push/pop it when it is live. */
+                  vg_assert(! IS_RREG_LIVE(VG_(realreg_to_rank)(R_EAX), 
+                                           u->regs_live_after));
+                  synth_jcond_lit ( u->cond, u->lit32, True );
                   break;
                default: 
                   VG_(core_panic)("emitUInstr(JMP, conditional, default)");
@@ -2512,11 +2897,11 @@
 	    emit_put_fpu_state();
 	    *fplive = False;
 	 }
-         if (anyFlagUse ( u )) 
-            emit_get_eflags();
-         VG_(synth_call) ( False, u->val1 );
-         if (writeFlagUse ( u )) 
-            emit_put_eflags();
+	 /* Call to a helper which is pretending to be a real CPU
+	    instruction (and therefore operates on Real flags and
+	    registers) */
+         VG_(synth_call) ( False, u->val1, 
+			   True, u->flags_r, u->flags_w );
          break;
 
       case CCALL: {
@@ -2566,7 +2951,8 @@
 	    emit_get_fpu_state();
 	    *fplive = True;
 	 }
-         synth_fpu_regmem ( (u->val1 >> 8) & 0xFF,
+         synth_fpu_regmem ( u->flags_r, u->flags_w,
+			    (u->val1 >> 8) & 0xFF,
                             u->val1 & 0xFF,
                             u->val2 );
          break;
@@ -2580,10 +2966,9 @@
 	    emit_get_fpu_state();
 	    *fplive = True;
 	 }
-         synth_fpu_no_mem ( (u->val1 >> 8) & 0xFF,
+         synth_fpu_no_mem ( u->flags_r, u->flags_w,
+			    (u->val1 >> 8) & 0xFF,
                             u->val1 & 0xFF );
-         if (writeFlagUse ( u )) 
-            emit_put_eflags();
          break;
 
       default: 
@@ -2623,36 +3008,31 @@
 {
    Int i;
    UChar regs_live_before = 0;   /* No regs live at BB start */
- 
    Bool fplive;
    Addr orig_eip, curr_eip;
   
-   emitted_code_used = 0;
-   emitted_code_size = 500; /* reasonable initial size */
-   emitted_code = VG_(arena_malloc)(VG_AR_JITTER, emitted_code_size);
-   jumpidx = 0;
+   reset_state();
 
    if (dis) VG_(printf)("Generated x86 code:\n");
 
    /* Generate decl VG_(dispatch_ctr) and drop into dispatch if we hit
       zero.  We have to do this regardless of whether we're t-chaining
       or not. */
-   VG_(new_emit)();
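+   /* decl updates O,S,Z,A,P but not C (hence FlagsOSZAP); False because
+      this is dispatcher bookkeeping, not a simulated-flags update. */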
+   VG_(new_emit)(False, FlagsEmpty, FlagsOSZAP);
    VG_(emitB) (0xFF);	/* decl */
    emit_amode_litmem_reg((Addr)&VG_(dispatch_ctr), 1);
    if (dis)
       VG_(printf)("\n\t\tdecl (%p)\n", &VG_(dispatch_ctr));
-   VG_(emit_jcondshort_delta)(CondNZ, 5+1);
+   VG_(emit_jcondshort_delta)(False, CondNZ, 5+1);
    VG_(emit_movv_lit_reg) ( 4, VG_TRC_INNER_COUNTERZERO, R_EBP );
    emit_ret();
 
    /* Set up running state. */
    fplive   = False;
-   orig_eip = 0;
+   orig_eip = cb->orig_eip;	/* we know EIP is up to date on BB entry */
    curr_eip = cb->orig_eip;
    vg_assert(curr_eip != 0); /* otherwise the incremental updating
                                 algorithm gets messed up. */
-
    /* for each uinstr ... */
    for (i = 0; i < cb->used; i++) {
       UInstr* u = &cb->instrs[i];
@@ -2671,7 +3051,8 @@
       regs_live_before = u->regs_live_after;
    }
    if (dis) VG_(printf)("\n");
-   vg_assert(!fplive);	/* FPU state must be saved by end of BB */
+   vg_assert(!fplive);			/* FPU state must be saved by end of BB */
+   vg_assert(eflags_state != UPD_Real);	/* flags must not live only in the real CPU at BB end */
 
    if (j != NULL) {
       vg_assert(jumpidx <= VG_MAX_JUMPS);
diff --git a/coregrind/vg_helpers.S b/coregrind/vg_helpers.S
index 9d7356d..3861abe 100644
--- a/coregrind/vg_helpers.S
+++ b/coregrind/vg_helpers.S
@@ -333,33 +333,34 @@
 /* Get the direction flag, and return either 1 or -1. */
 .global VG_(helper_get_dirflag)
 VG_(helper_get_dirflag):
-	pushfl
 	pushl	%eax
 
-	pushfl
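+	/* load baseBlock[VGOFF_(m_dflag)] (1 or -1) and store it in the
+	   result word above the return address -- 8(%esp) here, since
+	   only %eax has been pushed. */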
+	movl	VGOFF_(m_dflag), %eax
+	movl	(%ebp, %eax, 4), %eax
+	movl	%eax, 8(%esp)
+
 	popl	%eax
-	shrl	$10, %eax
-	andl	$1, %eax
-	jnz	L1
-	movl	$1, %eax
-	jmp	L2
-L1:	movl	$-1, %eax
-L2:	movl	%eax, 12(%esp)
-
-	popl %eax
-	popfl
 	ret
 
-
 /* Clear/set the direction flag. */
 .global VG_(helper_CLD)
 VG_(helper_CLD):
-	cld
+	pushl	%eax
+
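+	/* direction "forward" is represented as m_dflag == 1
+	   (VG_(helper_STD) below stores -1) */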
+	movl	VGOFF_(m_dflag), %eax
+	movl	$1, (%ebp, %eax, 4)
+
+	popl	%eax
 	ret
 
 .global VG_(helper_STD)
 VG_(helper_STD):
-	std
+	pushl	%eax
+
+	movl	VGOFF_(m_dflag), %eax
+	movl	$-1, (%ebp, %eax, 4)
+
+	popl	%eax
 	ret
 
 /* Clear/set the carry flag. */
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index 07b6ea5..b637e41 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -1413,6 +1413,9 @@
 extern UInt VG_(num_scheduling_events_MINOR);
 extern UInt VG_(num_scheduling_events_MAJOR);
 
+/* Insert and extract the D flag from eflags */
+extern UInt VG_(insertDflag)  ( UInt eflags, Int d );
+extern Int  VG_(extractDflag) ( UInt eflags );
 
 /* ---------------------------------------------------------------------
    Exports of vg_memory.c
@@ -1601,6 +1604,8 @@
 extern Int VGOFF_(m_fpustate);
 extern Int VGOFF_(m_eip);
 
+extern Int VGOFF_(m_dflag);	/* D flag is handled specially */
+
 extern Int VGOFF_(m_cs);
 extern Int VGOFF_(m_ss);
 extern Int VGOFF_(m_ds);
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index 2c524fb..d953509 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -48,6 +48,7 @@
 Int VGOFF_(m_esi) = INVALID_OFFSET;
 Int VGOFF_(m_edi) = INVALID_OFFSET;
 Int VGOFF_(m_eflags) = INVALID_OFFSET;
+Int VGOFF_(m_dflag)  = INVALID_OFFSET;
 Int VGOFF_(m_fpustate) = INVALID_OFFSET;
 Int VGOFF_(ldt)   = INVALID_OFFSET;
 Int VGOFF_(m_cs)  = INVALID_OFFSET;
@@ -235,6 +236,8 @@
    /* I gave up counting at this point.  Since they're above the
       short-amode-boundary, there's no point. */
 
+   VGOFF_(m_dflag) = alloc_BaB(1);
+
    VGOFF_(m_fpustate) = alloc_BaB(VG_SIZE_OF_FPUSTATE_W);
 
    /* This thread's LDT pointer, and segment registers. */
@@ -1098,6 +1101,29 @@
                           + VG_SIZE_OF_FPUSTATE_W /* FPU state */
                          ];
 
+UInt VG_(insertDflag)(UInt eflags, Int d)
+{
+   vg_assert(d == 1 || d == -1);
+   eflags &= ~EFlagD;
+
+   if (d < 0)
+      eflags |= EFlagD;
+
+   return eflags;
+}
+
+Int VG_(extractDflag)(UInt eflags)
+{
+   Int ret;
+
+   if (eflags & EFlagD)
+      ret = -1;
+   else
+      ret = 1;
+
+   return ret;
+}
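+
+/* Note that insertDflag(e, extractDflag(e)) == e for any e, and
+   extractDflag(insertDflag(e, d)) == d for d == 1 or -1. */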
+
 void VG_(copy_baseBlock_to_m_state_static) ( void )
 {
    Int i;
@@ -1117,7 +1143,8 @@
    VG_(m_state_static)[48/4] = VG_(baseBlock)[VGOFF_(m_esi)];
    VG_(m_state_static)[52/4] = VG_(baseBlock)[VGOFF_(m_edi)];
 
-   VG_(m_state_static)[56/4] = VG_(baseBlock)[VGOFF_(m_eflags)];
+   VG_(m_state_static)[56/4] = VG_(insertDflag)(VG_(baseBlock)[VGOFF_(m_eflags)],
+						VG_(baseBlock)[VGOFF_(m_dflag)]);
    VG_(m_state_static)[60/4] = VG_(baseBlock)[VGOFF_(m_eip)];
 
    for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
@@ -1145,7 +1172,9 @@
    VG_(baseBlock)[VGOFF_(m_esi)] = VG_(m_state_static)[48/4];
    VG_(baseBlock)[VGOFF_(m_edi)] = VG_(m_state_static)[52/4];
 
-   VG_(baseBlock)[VGOFF_(m_eflags)] = VG_(m_state_static)[56/4];
+   VG_(baseBlock)[VGOFF_(m_eflags)] = VG_(m_state_static)[56/4] & ~EFlagD;
+   VG_(baseBlock)[VGOFF_(m_dflag)] = VG_(extractDflag)(VG_(m_state_static)[56/4]);
+
    VG_(baseBlock)[VGOFF_(m_eip)] = VG_(m_state_static)[60/4];
 
    for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
@@ -1299,7 +1328,7 @@
    if (0) { 
       Int p, q;
       VG_(printf)("pid=%d\n", VG_(getpid)());
-      for (p = 0; p < 50000; p++)
+      for (p = 0; p < 5000; p++)
          for (q = 0; q < 50000; q++) ;
    }
 
diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c
index 4e6561b..c549cc4 100644
--- a/coregrind/vg_scheduler.c
+++ b/coregrind/vg_scheduler.c
@@ -391,7 +391,6 @@
    return tst->tid;
 }
 
-
 /* Copy the saved state of a thread into VG_(baseBlock), ready for it
    to be run. */
 __inline__
@@ -416,7 +415,8 @@
    VG_(baseBlock)[VGOFF_(m_edi)] = VG_(threads)[tid].m_edi;
    VG_(baseBlock)[VGOFF_(m_ebp)] = VG_(threads)[tid].m_ebp;
    VG_(baseBlock)[VGOFF_(m_esp)] = VG_(threads)[tid].m_esp;
-   VG_(baseBlock)[VGOFF_(m_eflags)] = VG_(threads)[tid].m_eflags;
+   VG_(baseBlock)[VGOFF_(m_eflags)] = VG_(threads)[tid].m_eflags & ~EFlagD;
+   VG_(baseBlock)[VGOFF_(m_dflag)] = VG_(extractDflag)(VG_(threads)[tid].m_eflags);
    VG_(baseBlock)[VGOFF_(m_eip)] = VG_(threads)[tid].m_eip;
 
    for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
@@ -490,7 +490,8 @@
    VG_(threads)[tid].m_edi = VG_(baseBlock)[VGOFF_(m_edi)];
    VG_(threads)[tid].m_ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
    VG_(threads)[tid].m_esp = VG_(baseBlock)[VGOFF_(m_esp)];
-   VG_(threads)[tid].m_eflags = VG_(baseBlock)[VGOFF_(m_eflags)];
+   VG_(threads)[tid].m_eflags = VG_(insertDflag)(VG_(baseBlock)[VGOFF_(m_eflags)],
+						 VG_(baseBlock)[VGOFF_(m_dflag)]);
    VG_(threads)[tid].m_eip = VG_(baseBlock)[VGOFF_(m_eip)];
 
    for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
diff --git a/include/vg_skin.h b/include/vg_skin.h
index e56c8eb..db21f28 100644
--- a/include/vg_skin.h
+++ b/include/vg_skin.h
@@ -640,11 +640,12 @@
 /* flag positions in eflags */
 #define EFlagC  (1 <<  0)       /* carry */
 #define EFlagP  (1 <<  2)       /* parity */
+#define EFlagA  (1 <<  4)       /* aux carry */
 #define EFlagZ  (1 <<  6)       /* zero */
 #define EFlagS  (1 <<  7)       /* sign */
+#define EFlagD  (1 << 10)       /* direction */
 #define EFlagO  (1 << 11)       /* overflow */
 
-
 /* Liveness of general purpose registers, useful for code generation.
    Reg rank order 0..N-1 corresponds to bits 0..N-1, ie. first
    reg's liveness in bit 0, last reg's in bit N-1.  Note that
@@ -886,7 +887,7 @@
 extern void VG_(emitB)    ( UInt b );
 extern void VG_(emitW)    ( UInt w );
 extern void VG_(emitL)    ( UInt l );
-extern void VG_(new_emit) ( void );
+extern void VG_(new_emit) ( Bool upd_cc, FlagSet uses_flags, FlagSet sets_flags );
 
 /* Finding offsets */
 extern Int  VG_(helper_offset)       ( Addr a );
@@ -899,7 +900,8 @@
 extern Int VG_(rank_to_realreg) ( Int rank    );
 
 /* Call a subroutine.  Does no argument passing, stack manipulations, etc. */
-extern void VG_(synth_call) ( Bool ensure_shortform, Int word_offset );
+extern void VG_(synth_call) ( Bool ensure_shortform, Int word_offset, 
+			      Bool upd_cc, FlagSet use_flags, FlagSet set_flags );
 
 /* For calling C functions -- saves caller save regs, pushes args, calls,
    clears the stack, restores caller save regs.  `fn' must be registered in
@@ -922,20 +924,20 @@
 extern void VG_(emit_movv_offregmem_reg) ( Int sz, Int off, Int areg, Int reg );
 extern void VG_(emit_movv_reg_offregmem) ( Int sz, Int reg, Int off, Int areg );
 extern void VG_(emit_movv_reg_reg)       ( Int sz, Int reg1, Int reg2 );
-extern void VG_(emit_nonshiftopv_lit_reg)( Int sz, Opcode opc, UInt lit,
+extern void VG_(emit_nonshiftopv_lit_reg)( Bool upd_cc, Int sz, Opcode opc, UInt lit,
                                            Int reg );
-extern void VG_(emit_shiftopv_lit_reg)   ( Int sz, Opcode opc, UInt lit,
+extern void VG_(emit_shiftopv_lit_reg)   ( Bool upd_cc, Int sz, Opcode opc, UInt lit,
                                            Int reg );
-extern void VG_(emit_nonshiftopv_reg_reg)( Int sz, Opcode opc,
+extern void VG_(emit_nonshiftopv_reg_reg)( Bool upd_cc, Int sz, Opcode opc,
                                            Int reg1, Int reg2 );
 extern void VG_(emit_movv_lit_reg)       ( Int sz, UInt lit, Int reg );
-extern void VG_(emit_unaryopv_reg)       ( Int sz, Opcode opc, Int reg );
+extern void VG_(emit_unaryopv_reg)       ( Bool upd_cc, Int sz, Opcode opc, Int reg );
 extern void VG_(emit_pushv_reg)          ( Int sz, Int reg );
 extern void VG_(emit_popv_reg)           ( Int sz, Int reg );
 
 extern void VG_(emit_pushl_lit32)        ( UInt int32 );
 extern void VG_(emit_pushl_lit8)         ( Int lit8 );
-extern void VG_(emit_cmpl_zero_reg)      ( Int reg );
+extern void VG_(emit_cmpl_zero_reg)      ( Bool upd_cc, Int reg );
 extern void VG_(emit_swapl_reg_EAX)      ( Int reg );
 extern void VG_(emit_movv_lit_offregmem) ( Int sz, UInt lit, Int off,
                                            Int memreg );
@@ -943,8 +945,8 @@
 /* b-size (1 byte) instruction emitters */
 extern void VG_(emit_movb_lit_offregmem) ( UInt lit, Int off, Int memreg );
 extern void VG_(emit_movb_reg_offregmem) ( Int reg, Int off, Int areg );
-extern void VG_(emit_unaryopb_reg)       ( Opcode opc, Int reg );
-extern void VG_(emit_testb_lit_reg)      ( UInt lit, Int reg );
+extern void VG_(emit_unaryopb_reg)       ( Bool upd_cc, Opcode opc, Int reg );
+extern void VG_(emit_testb_lit_reg)      ( Bool upd_cc, UInt lit, Int reg );
 
 /* zero-extended load emitters */
 extern void VG_(emit_movzbl_offregmem_reg) ( Int off, Int regmem, Int reg );
@@ -953,7 +955,7 @@
 /* misc instruction emitters */
 extern void VG_(emit_call_reg)         ( Int reg );
 extern void VG_(emit_add_lit_to_esp)   ( Int lit );
-extern void VG_(emit_jcondshort_delta) ( Condcode cond, Int delta );
+extern void VG_(emit_jcondshort_delta) ( Bool simd_cc, Condcode cond, Int delta );
 extern void VG_(emit_pushal)           ( void );
 extern void VG_(emit_popal)            ( void );
 extern void VG_(emit_AMD_prefetch_reg) ( Int reg );
diff --git a/memcheck/mc_from_ucode.c b/memcheck/mc_from_ucode.c
index 2185ac0..d47131c 100644
--- a/memcheck/mc_from_ucode.c
+++ b/memcheck/mc_from_ucode.c
@@ -51,7 +51,7 @@
 
 static void emit_testv_lit_reg ( Int sz, UInt lit, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(False, FlagsEmpty, FlagsOSZACP);
    if (sz == 2) {
       VG_(emitB) ( 0x66 );
    } else {
@@ -67,7 +67,7 @@
 
 static void emit_testv_lit_offregmem ( Int sz, UInt lit, Int off, Int reg )
 {
-   VG_(new_emit)();
+   VG_(new_emit)(False, FlagsEmpty, FlagsOSZACP);
    if (sz == 2) {
       VG_(emitB) ( 0x66 );
    } else {
@@ -94,7 +94,7 @@
 {
    if ((lit & 0xFFFFFF00) == 0 && reg32 < 4) {
       /* We can get away with a byte insn. */
-      VG_(emit_testb_lit_reg) ( lit, reg32 );
+      VG_(emit_testb_lit_reg) ( False, lit, reg32 );
    }
    else 
    if ((lit & 0xFFFF0000) == 0) {
@@ -207,7 +207,7 @@
                we might as well compare it against zero, which can be
                done with a shorter insn. */
             /* synth_minimal_test_lit_reg ( 0xFFFFFFFF, val ); */
-            VG_(emit_cmpl_zero_reg) ( val );
+            VG_(emit_cmpl_zero_reg) ( False, val );
             break;
          case 2:
             synth_minimal_test_lit_reg ( 0x0000FFFF, val );
@@ -222,7 +222,7 @@
             VG_(skin_panic)("synth_TESTV(RealReg)");
       }
    }
-   VG_(emit_jcondshort_delta) ( CondZ, 3 );
+   VG_(emit_jcondshort_delta) ( False, CondZ, 3 );
    VG_(synth_call) (
       True, /* needed to guarantee that this insn is indeed 3 bytes long */
       ( sz==4 
@@ -231,7 +231,8 @@
         ? VG_(helper_offset)((Addr) & MC_(helper_value_check2_fail))
         : ( sz==1 
           ? VG_(helper_offset)((Addr) & MC_(helper_value_check1_fail))
-          : VG_(helper_offset)((Addr) & MC_(helper_value_check0_fail)))))
+	  : VG_(helper_offset)((Addr) & MC_(helper_value_check0_fail))))),
+      False, FlagsEmpty, FlagsOSZACP /* helpers don't preserve flags */
    );
 }
 
@@ -247,7 +248,7 @@
       case 2: 
          VG_(emit_movzwl_offregmem_reg) ( VG_(shadow_reg_offset)(arch),
                                           R_EBP, reg );
-         VG_(emit_nonshiftopv_lit_reg) ( 4, OR, 0xFFFF0000, reg );
+         VG_(emit_nonshiftopv_lit_reg) ( False, 4, OR, 0xFFFF0000, reg );
          break;
       case 1: 
          if (arch < 4) {
@@ -257,7 +258,7 @@
             VG_(emit_movzbl_offregmem_reg) ( VG_(shadow_reg_offset)(arch-4)+1,
                                              R_EBP, reg );
          }
-         VG_(emit_nonshiftopv_lit_reg) ( 4, OR, 0xFFFFFF00, reg );
+         VG_(emit_nonshiftopv_lit_reg) ( False, 4, OR, 0xFFFFFF00, reg );
          break;
       default: 
          VG_(skin_panic)("synth_GETV");
@@ -364,25 +365,25 @@
             or 0xFFFFFFFE, %reg   -- invalidate all bits except lowest
       */
       case Tag_PCast40:
-         VG_(emit_unaryopv_reg)(4, NEG, reg);
-         VG_(emit_nonshiftopv_reg_reg)(4, SBB, reg, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFFFFFE, reg);
+         VG_(emit_unaryopv_reg)(False, 4, NEG, reg);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, SBB, reg, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFFFFFE, reg);
          break;
       case Tag_PCast20:
-         VG_(emit_unaryopv_reg)(2, NEG, reg);
-         VG_(emit_nonshiftopv_reg_reg)(4, SBB, reg, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFFFFFE, reg);
+         VG_(emit_unaryopv_reg)(False, 2, NEG, reg);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, SBB, reg, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFFFFFE, reg);
          break;
       case Tag_PCast10:
          if (reg >= 4) {
             VG_(emit_swapl_reg_EAX)(reg);
-            VG_(emit_unaryopb_reg)(NEG, R_EAX);
+            VG_(emit_unaryopb_reg)(False, NEG, R_EAX);
             VG_(emit_swapl_reg_EAX)(reg);
          } else {
-            VG_(emit_unaryopb_reg)(NEG, reg);
+            VG_(emit_unaryopb_reg)(False, NEG, reg);
          }
-         VG_(emit_nonshiftopv_reg_reg)(4, SBB, reg, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFFFFFE, reg);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, SBB, reg, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFFFFFE, reg);
          break;
 
       /* Scheme is
@@ -391,18 +392,18 @@
             and possibly an OR to invalidate unused bits.
       */
       case Tag_PCast04:
-         VG_(emit_nonshiftopv_lit_reg)(4, AND, 0x00000001, reg);
-         VG_(emit_unaryopv_reg)(4, NEG, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, AND, 0x00000001, reg);
+         VG_(emit_unaryopv_reg)(False, 4, NEG, reg);
          break;
       case Tag_PCast02:
-         VG_(emit_nonshiftopv_lit_reg)(4, AND, 0x00000001, reg);
-         VG_(emit_unaryopv_reg)(4, NEG, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFF0000, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, AND, 0x00000001, reg);
+         VG_(emit_unaryopv_reg)(False, 4, NEG, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFF0000, reg);
          break;
       case Tag_PCast01:
-         VG_(emit_nonshiftopv_lit_reg)(4, AND, 0x00000001, reg);
-         VG_(emit_unaryopv_reg)(4, NEG, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFFFF00, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, AND, 0x00000001, reg);
+         VG_(emit_unaryopv_reg)(False, 4, NEG, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFFFF00, reg);
          break;
 
       /* Scheme is
@@ -412,38 +413,39 @@
             and possibly an OR to invalidate unused bits.
       */
       case Tag_PCast14:
-         VG_(emit_shiftopv_lit_reg)(4, SHL, 24, reg);
-         VG_(emit_unaryopv_reg)(4, NEG, reg);
-         VG_(emit_nonshiftopv_reg_reg)(4, SBB, reg, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SHL, 24, reg);
+         VG_(emit_unaryopv_reg)(False, 4, NEG, reg);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, SBB, reg, reg);
          break;
       case Tag_PCast12:
-         VG_(emit_shiftopv_lit_reg)(4, SHL, 24, reg);
-         VG_(emit_unaryopv_reg)(4, NEG, reg);
-         VG_(emit_nonshiftopv_reg_reg)(4, SBB, reg, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFF0000, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SHL, 24, reg);
+         VG_(emit_unaryopv_reg)(False, 4, NEG, reg);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, SBB, reg, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFF0000, reg);
          break;
       case Tag_PCast11:
-         VG_(emit_shiftopv_lit_reg)(4, SHL, 24, reg);
-         VG_(emit_unaryopv_reg)(4, NEG, reg);
-         VG_(emit_nonshiftopv_reg_reg)(4, SBB, reg, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFFFF00, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SHL, 24, reg);
+         VG_(emit_unaryopv_reg)(False, 4, NEG, reg);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, SBB, reg, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFFFF00, reg);
          break;
 
       /* We use any non-live reg (except %reg) as a temporary,
          or push/pop %ebp if none available:
-            (%dead_reg = any dead reg, else %ebp)
-            (pushl %ebp if all regs live)
+            (%dead_reg = any dead reg, else choose anything other than %reg)
+            (pushl %dead_reg first if no dead reg was available)
             movl %reg, %dead_reg
             negl %dead_reg
             orl %dead_reg, %reg
-            (popl %ebp if all regs live)
+            (popl %dead_reg if it was pushed)
          This sequence turns out to be correct regardless of the 
          operation width.
       */
       case Tag_Left4:
       case Tag_Left2:
       case Tag_Left1: {
-         UInt dead_reg = R_EBP;
+	 Bool push = True;
+         UInt dead_reg = R_ESP;
          Int  i, reg_of_i;
 
          for (i = 0; i < VG_MAX_REALREGS; i++) {
@@ -451,17 +453,22 @@
                reg_of_i = VG_(rank_to_realreg)(i);
                if (reg != reg_of_i) {
                   dead_reg = reg_of_i;
+		  push = False;
                   break;
                }
             }
          }
-
-         if (R_EBP == dead_reg)
+	 
+         if (push) {
+	    dead_reg = (reg != R_EAX) ? R_EAX : R_EBX;
             VG_(emit_pushv_reg)(4, dead_reg);
+	 }
+
          VG_(emit_movv_reg_reg)(4, reg, dead_reg);
-         VG_(emit_unaryopv_reg)(4, NEG, dead_reg);
-         VG_(emit_nonshiftopv_reg_reg)(4, OR, dead_reg, reg);
-         if (R_EBP == dead_reg)
+         VG_(emit_unaryopv_reg)(False, 4, NEG, dead_reg);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, OR, dead_reg, reg);
+
+         if (push)
             VG_(emit_popv_reg)(4, dead_reg);
          break;
       }
@@ -469,27 +476,27 @@
       /* These are all fairly obvious; do the op and then, if
          necessary, invalidate unused bits. */
       case Tag_SWiden14:
-         VG_(emit_shiftopv_lit_reg)(4, SHL, 24, reg);
-         VG_(emit_shiftopv_lit_reg)(4, SAR, 24, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SHL, 24, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SAR, 24, reg);
          break;
       case Tag_SWiden24:
-         VG_(emit_shiftopv_lit_reg)(4, SHL, 16, reg);
-         VG_(emit_shiftopv_lit_reg)(4, SAR, 16, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SHL, 16, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SAR, 16, reg);
          break;
       case Tag_SWiden12:
-         VG_(emit_shiftopv_lit_reg)(4, SHL, 24, reg);
-         VG_(emit_shiftopv_lit_reg)(4, SAR, 24, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFF0000, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SHL, 24, reg);
+         VG_(emit_shiftopv_lit_reg)(False, 4, SAR, 24, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFF0000, reg);
          break;
       case Tag_ZWiden14:
-         VG_(emit_nonshiftopv_lit_reg)(4, AND, 0x000000FF, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, AND, 0x000000FF, reg);
          break;
       case Tag_ZWiden24:
-         VG_(emit_nonshiftopv_lit_reg)(4, AND, 0x0000FFFF, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, AND, 0x0000FFFF, reg);
          break;
       case Tag_ZWiden12:
-         VG_(emit_nonshiftopv_lit_reg)(4, AND, 0x000000FF, reg);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFF0000, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, AND, 0x000000FF, reg);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFF0000, reg);
          break;
 
       default:
@@ -507,14 +514,14 @@
       case Tag_UifU2:
       case Tag_UifU1:
       case Tag_UifU0:
-         VG_(emit_nonshiftopv_reg_reg)(4, OR, regs, regd);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, OR, regs, regd);
          break;
 
       /* DifD is implemented by AND, since 0 means Defined. */
       case Tag_DifD4:
       case Tag_DifD2:
       case Tag_DifD1:
-         VG_(emit_nonshiftopv_reg_reg)(4, AND, regs, regd);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, AND, regs, regd);
          break;
 
       /* ImproveAND(value, tags) = value OR tags.
@@ -523,15 +530,15 @@
          Be paranoid and invalidate unused bits; I don't know whether 
          or not this is actually necessary. */
       case Tag_ImproveAND4_TQ:
-         VG_(emit_nonshiftopv_reg_reg)(4, OR, regs, regd);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, OR, regs, regd);
          break;
       case Tag_ImproveAND2_TQ:
-         VG_(emit_nonshiftopv_reg_reg)(4, OR, regs, regd);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFF0000, regd);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, OR, regs, regd);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFF0000, regd);
          break;
       case Tag_ImproveAND1_TQ:
-         VG_(emit_nonshiftopv_reg_reg)(4, OR, regs, regd);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFFFF00, regd);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, OR, regs, regd);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFFFF00, regd);
          break;
 
       /* ImproveOR(value, tags) = (not value) OR tags.
@@ -542,21 +549,21 @@
          Be paranoid and invalidate unused bits; I don't know whether 
          or not this is actually necessary. */
       case Tag_ImproveOR4_TQ:
-         VG_(emit_unaryopv_reg)(4, NOT, regd);
-         VG_(emit_nonshiftopv_reg_reg)(4, AND, regs, regd);
-         VG_(emit_unaryopv_reg)(4, NOT, regd);
+         VG_(emit_unaryopv_reg)(False, 4, NOT, regd);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, AND, regs, regd);
+         VG_(emit_unaryopv_reg)(False, 4, NOT, regd);
          break;
       case Tag_ImproveOR2_TQ:
-         VG_(emit_unaryopv_reg)(4, NOT, regd);
-         VG_(emit_nonshiftopv_reg_reg)(4, AND, regs, regd);
-         VG_(emit_unaryopv_reg)(4, NOT, regd);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFF0000, regd);
+         VG_(emit_unaryopv_reg)(False, 4, NOT, regd);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, AND, regs, regd);
+         VG_(emit_unaryopv_reg)(False, 4, NOT, regd);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFF0000, regd);
          break;
       case Tag_ImproveOR1_TQ:
-         VG_(emit_unaryopv_reg)(4, NOT, regd);
-         VG_(emit_nonshiftopv_reg_reg)(4, AND, regs, regd);
-         VG_(emit_unaryopv_reg)(4, NOT, regd);
-         VG_(emit_nonshiftopv_lit_reg)(4, OR, 0xFFFFFF00, regd);
+         VG_(emit_unaryopv_reg)(False, 4, NOT, regd);
+         VG_(emit_nonshiftopv_reg_reg)(False, 4, AND, regs, regd);
+         VG_(emit_unaryopv_reg)(False, 4, NOT, regd);
+         VG_(emit_nonshiftopv_lit_reg)(False, 4, OR, 0xFFFFFF00, regd);
          break;
 
       default: