Fill in many amd64 front end and back end cases.


git-svn-id: svn://svn.valgrind.org/vex/trunk@879 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest-amd64/gdefs.h b/priv/guest-amd64/gdefs.h
index 4763aa0..6a6824c 100644
--- a/priv/guest-amd64/gdefs.h
+++ b/priv/guest-amd64/gdefs.h
@@ -127,6 +127,8 @@
 
 //extern void  amd64g_storeF80le ( ULong/*addr*/, ULong );
 
+extern void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* st );
+
 //extern void  amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* );
 //extern void  amd64g_dirtyhelper_CPUID_sse1 ( VexGuestAMD64State* );
 //extern void  amd64g_dirtyhelper_CPUID_sse2 ( VexGuestAMD64State* );
diff --git a/priv/guest-amd64/ghelpers.c b/priv/guest-amd64/ghelpers.c
index b5ff928..48e375c 100644
--- a/priv/guest-amd64/ghelpers.c
+++ b/priv/guest-amd64/ghelpers.c
@@ -493,6 +493,7 @@
       case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
       case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
       case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
+      case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );
 
       case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
       case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
@@ -1062,6 +1063,85 @@
 }
 
 
+
+/*---------------------------------------------------------------*/
+/*--- Misc integer helpers, including rotates and CPUID.      ---*/
+/*---------------------------------------------------------------*/
+
+/* Claim to be the following CPU:
+   vendor_id       : AuthenticAMD
+   cpu family      : 15
+   model           : 12
+   model name      : AMD Athlon(tm) 64 Processor 3200+
+   stepping        : 0
+   cpu MHz         : 2202.917
+   cache size      : 512 KB
+   fpu             : yes
+   fpu_exception   : yes
+   cpuid level     : 1
+   wp              : yes
+   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr 
+                     pge mca cmov pat pse36 clflush mmx fxsr sse sse2 
+                     pni syscall nx mmxext lm 3dnowext 3dnow
+   bogomips        : 4308.99
+   TLB size        : 1088 4K pages
+   clflush size    : 64
+   cache_alignment : 64
+   address sizes   : 40 bits physical, 48 bits virtual
+   power management: ts fid vid ttp
+*/
+void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* st )
+{
+#  define SET_ABCD(_a,_b,_c,_d)                \
+      do { st->guest_RAX = (ULong)(_a);        \
+           st->guest_RBX = (ULong)(_b);        \
+           st->guest_RCX = (ULong)(_c);        \
+           st->guest_RDX = (ULong)(_d);        \
+      } while (0)
+
+   switch (0xFFFFFFFF & st->guest_RAX) {
+      case 0x0: 
+         SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65); 
+         break;
+      case 0x1: 
+         SET_ABCD(0x00000fc0, 0x00000800, 0x00000000, 0x078bfbff); 
+         break;
+      case 0x80000000: 
+         SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65); 
+         break;
+      case 0x80000001: 
+         SET_ABCD(0x00000fc0, 0x0000010a, 0x00000000, 0xe1d3fbff); 
+         break;
+      case 0x80000002: 
+         SET_ABCD(0x20444d41, 0x6c687441, 0x74286e6f, 0x3620296d); 
+         break;
+      case 0x80000003: 
+         SET_ABCD(0x72502034, 0x7365636f, 0x20726f73, 0x30303233); 
+         break;
+      case 0x80000004: 
+         SET_ABCD(0x0000002b, 0x00000000, 0x00000000, 0x00000000); 
+         break;
+      case 0x80000005: 
+         SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140); 
+         break;
+      case 0x80000006: 
+         SET_ABCD(0x00000000, 0x42004200, 0x02008140, 0x00000000); 
+         break;
+      case 0x80000007: 
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f); 
+         break;
+      case 0x80000008: 
+         SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000); 
+         break;
+      default:         
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); 
+         break;
+   }
+#  undef SET_ABCD
+}
+
+
+
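
As a cross-check on the leaf-0 constants above: CPUID returns the vendor string in EBX, EDX, ECX order, and the three words used here spell out "AuthenticAMD". A minimal standalone sketch (illustrative only, not part of the patch; assumes a little-endian host):

#include <stdio.h>
#include <string.h>

int main ( void )
{
   /* The words amd64g_dirtyhelper_CPUID returns for leaf 0, in the
      architectural EBX, EDX, ECX order. */
   unsigned int v[3] = { 0x68747541, 0x69746e65, 0x444d4163 };
   char vendor[13];
   memcpy(vendor, v, 12);
   vendor[12] = 0;
   printf("%s\n", vendor);   /* prints "AuthenticAMD" */
   return 0;
}
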
 /*---------------------------------------------------------------*/
 /*--- Helpers for dealing with, and describing,               ---*/
 /*--- guest state as a whole.                                 ---*/
@@ -1095,6 +1175,7 @@
    vex_state->guest_CC_DEP2 = 0;
    vex_state->guest_CC_NDEP = 0;
 
+   vex_state->guest_DFLAG   = 1; /* forwards */
    // XXX: add more here later, for D/ID flags
 
    vex_state->guest_RIP = 0;
diff --git a/priv/guest-amd64/toIR.c b/priv/guest-amd64/toIR.c
index 8b6944f..65c6de3 100644
--- a/priv/guest-amd64/toIR.c
+++ b/priv/guest-amd64/toIR.c
@@ -339,7 +339,7 @@
 
 //.. #define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
 //.. #define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
-//.. #define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
+#define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
 //.. #define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
 //.. #define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
 //.. #define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
@@ -3534,24 +3534,28 @@
 }
 
 
-//.. /*------------------------------------------------------------*/
-//.. /*--- Disassembling string ops (including REP prefixes)    ---*/
-//.. /*------------------------------------------------------------*/
-//.. 
-//.. /* Code shared by all the string ops */
-//.. static
-//.. void dis_string_op_increment(Int sz, Int t_inc)
-//.. {
-//..    if (sz == 4 || sz == 2) {
-//..       assign( t_inc, 
-//..               binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
-//..                                mkU8(sz/2) ) );
-//..    } else {
-//..       assign( t_inc, 
-//..               IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
-//..    }
-//.. }
-//.. 
+/*------------------------------------------------------------*/
+/*--- Disassembling string ops (including REP prefixes)    ---*/
+/*------------------------------------------------------------*/
+
+/* Code shared by all the string ops */
+static
+void dis_string_op_increment ( Int sz, IRTemp t_inc )
+{
+   UChar logSz;
+   if (sz == 8 || sz == 4 || sz == 2) {
+      logSz = 1;
+      if (sz == 4) logSz = 2;
+      if (sz == 8) logSz = 3;
+      assign( t_inc, 
+              binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
+                               mkU8(logSz) ) );
+   } else {
+      assign( t_inc, 
+              IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
+   }
+}
+
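
Since guest_DFLAG is kept as +1 (forwards) or -1 (backwards), shifting it left by log2(sz) yields the signed amount by which RSI/RDI must move per element. A plain-C restatement of what the IR above computes (illustrative only, not VEX code):

/* Illustrative only: the run-time value produced by the IR that
   dis_string_op_increment emits.  dflag is guest_DFLAG (+1 after CLD,
   -1 after STD); the Iop_Shl64 by log2(sz) amounts to a multiplication
   by the element size. */
static long long string_increment ( long long dflag, int sz )
{
   return dflag * (long long)sz;   /* e.g. (+1,8) -> +8, (-1,4) -> -4 */
}
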
 //.. static
 //.. void dis_string_op( void (*dis_OP)( Int, IRTemp ), 
 //..                     Int sz, Char* name, UChar sorb )
@@ -3623,44 +3627,31 @@
 //..    //uInstr2(cb, PUT,   4, TempReg, td,    ArchReg, R_EDI);
 //..    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
 //.. }
-//.. 
-//.. static 
-//.. void dis_CMPS ( Int sz, IRTemp t_inc )
-//.. {
-//..    IRType ty  = szToITy(sz);
-//..    IRTemp tdv = newTemp(ty);      /* (EDI) */
-//..    IRTemp tsv = newTemp(ty);      /* (ESI) */
-//..    //IRTemp res = newTemp(ty);
-//..    IRTemp td  = newTemp(Ity_I32); /*  EDI  */
-//..    IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
-//.. 
-//..    //uInstr2(cb, GET,   4, ArchReg, R_EDI, TempReg, td);
-//..    assign( td, getIReg(4, R_EDI) );
-//.. 
-//..    //uInstr2(cb, GET,   4, ArchReg, R_ESI, TempReg, ts);
-//..    assign( ts, getIReg(4, R_ESI) );
-//.. 
-//..    //uInstr2(cb, LOAD, sz, TempReg, td,    TempReg, tdv);
-//..    assign( tdv, loadLE(ty,mkexpr(td)) );
-//.. 
-//..    //uInstr2(cb, LOAD, sz, TempReg, ts,    TempReg, tsv);
-//..    assign( tsv, loadLE(ty,mkexpr(ts)) );
-//.. 
-//..    //uInstr2(cb, SUB,  sz, TempReg, tdv,   TempReg, tsv); 
-//..    //setFlagsFromUOpcode(cb, SUB);
-//..    //assign( res, binop(mkSizedOp(ty, Iop_Sub8), mkexpr(tsv), mkexpr(tdv)) );
-//..    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
-//.. 
-//..    //uInstr2(cb, ADD,   4, TempReg, t_inc, TempReg, td);
-//..    //uInstr2(cb, ADD,   4, TempReg, t_inc, TempReg, ts);
-//.. 
-//..    //uInstr2(cb, PUT,   4, TempReg, td,    ArchReg, R_EDI);
-//..    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
-//.. 
-//..    //uInstr2(cb, PUT,   4, TempReg, ts,    ArchReg, R_ESI);
-//..    putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
-//.. }
-//.. 
+
+static 
+void dis_CMPS ( Int sz, IRTemp t_inc )
+{
+   IRType ty  = szToITy(sz);
+   IRTemp tdv = newTemp(ty);      /* (RDI) */
+   IRTemp tsv = newTemp(ty);      /* (RSI) */
+   IRTemp td  = newTemp(Ity_I64); /*  RDI  */
+   IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
+
+   assign( td, getIReg64(R_RDI) );
+
+   assign( ts, getIReg64(R_RSI) );
+
+   assign( tdv, loadLE(ty,mkexpr(td)) );
+
+   assign( tsv, loadLE(ty,mkexpr(ts)) );
+
+   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
+
+   putIReg64(R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
+
+   putIReg64(R_RSI, binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc)) );
+}
+
 //.. static 
 //.. void dis_SCAS ( Int sz, IRTemp t_inc )
 //.. {
@@ -3688,45 +3679,40 @@
 //..    //uInstr2(cb, PUT,   4, TempReg, td,    ArchReg, R_EDI);
 //..    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
 //.. }
-//.. 
-//.. 
-//.. /* Wrap the appropriate string op inside a REP/REPE/REPNE.
-//..    We assume the insn is the last one in the basic block, and so emit a jump
-//..    to the next insn, rather than just falling through. */
-//.. static 
-//.. void dis_REP_op ( X86Condcode cond,
-//..                   void (*dis_OP)(Int, IRTemp),
-//..                   Int sz, Addr32 eip, Addr32 eip_next, Char* name )
-//.. {
-//..    IRTemp t_inc = newTemp(Ity_I32);
-//..    IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
-//.. 
-//..    //uInstr2 (cb, GET,   4, ArchReg, R_ECX, TempReg, tc);
-//..    assign( tc, getIReg(4,R_ECX) );
-//.. 
-//..    //uInstr2 (cb, JIFZ,  4, TempReg, tc,    Literal, 0);
-//..    //uLiteral(cb, eip_next);
-//..    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
-//..                       Ijk_Boring,
-//..                       IRConst_U32(eip_next) ) );
-//.. 
-//..    //uInstr1 (cb, DEC,   4, TempReg, tc);
-//..    //uInstr2 (cb, PUT,   4, TempReg, tc,    ArchReg, R_ECX);
-//..    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
-//.. 
-//..    dis_string_op_increment(sz, t_inc);
-//..    dis_OP (sz, t_inc);
-//.. 
-//..    if (cond == X86CondAlways) {
-//..       jmp_lit(Ijk_Boring,eip);
-//..    } else {
-//..       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
-//..                          Ijk_Boring,
-//..                          IRConst_U32(eip) ) );
-//..       jmp_lit(Ijk_Boring,eip_next);
-//..    }
-//..    DIP("%s%c\n", name, nameISize(sz));
-//.. }
+
+
+/* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
+   the insn is the last one in the basic block, and so emit a jump to
+   the next insn, rather than just falling through. */
+static 
+void dis_REP_op ( AMD64Condcode cond,
+                  void (*dis_OP)(Int, IRTemp),
+                  Int sz, Addr64 rip, Addr64 rip_next, HChar* name )
+{
+   IRTemp t_inc = newTemp(Ity_I64);
+   IRTemp tc    = newTemp(Ity_I64);  /*  RCX  */
+
+   assign( tc, getIReg64(R_RCX) );
+
+   stmt( IRStmt_Exit( binop(Iop_CmpEQ64,mkexpr(tc),mkU64(0)),
+                      Ijk_Boring,
+                      IRConst_U64(rip_next) ) );
+
+   putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
+
+   dis_string_op_increment(sz, t_inc);
+   dis_OP (sz, t_inc);
+
+   if (cond == AMD64CondAlways) {
+      jmp_lit(Ijk_Boring,rip);
+   } else {
+      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
+                         Ijk_Boring,
+                         IRConst_U64(rip) ) );
+      jmp_lit(Ijk_Boring,rip_next);
+   }
+   DIP("%s%c\n", name, nameISize(sz));
+}
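
The basic-block shape dis_REP_op builds is: exit if RCX is zero, decrement RCX, perform one step of the string op, then either jump back to the REP insn unconditionally or exit early when the REPE/REPNE condition fails. A plain-C model of that control flow (a sketch only; one_step stands in for dis_OP plus the condition test):

/* Illustrative only: guest-level behaviour of the block dis_REP_op
   emits.  one_step() performs a single sz-byte string-op step and
   reports whether the REPE/REPNE condition still holds afterwards;
   is_conditional is 0 for plain REP. */
static void rep_model ( unsigned long long *rcx,
                        int is_conditional,
                        int (*one_step)(void) )
{
   for (;;) {
      if (*rcx == 0) return;       /* IRStmt_Exit to rip_next          */
      *rcx -= 1;                   /* putIReg64(R_RCX, tc - 1)         */
      int ok = one_step();         /* dis_string_op_increment + dis_OP */
      if (is_conditional && !ok)   /* conditional IRStmt_Exit          */
         return;
      /* otherwise jump back to rip: the insn is re-decoded next trip  */
   }
}
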
 
 
 /*------------------------------------------------------------*/
@@ -11250,10 +11236,10 @@
       delta = dis_mov_G_E(pfx, sz, delta);
       break;
 
-//..    case 0x8A: /* MOV Eb,Gb */
-//..       delta = dis_mov_E_G(sorb, 1, delta);
-//..       break;
-//..  
+   case 0x8A: /* MOV Eb,Gb */
+      delta = dis_mov_E_G(pfx, 1, delta);
+      break;
+ 
    case 0x8B: /* MOV Ev,Gv */
       delta = dis_mov_E_G(pfx, sz, delta);
       break;
@@ -11915,13 +11901,13 @@
 //..    case 0xAF:
 //..       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
 //..       break;
-//.. 
-//.. 
-//..    case 0xFC: /* CLD */
-//..       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
-//..       DIP("cld\n");
-//..       break;
-//.. 
+
+
+   case 0xFC: /* CLD */
+      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
+      DIP("cld\n");
+      break;
+
 //..    case 0xFD: /* STD */
 //..       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
 //..       DIP("std\n");
@@ -11986,10 +11972,24 @@
 //..       }
 //..       break;
 //..    }
-//.. 
-//..    /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
-//..       for the rest, it means REP) */
-//..    case 0xF3: { 
+
+   /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
+      for the rest, it means REP) */
+
+   case 0xA6: /* F3 A6: repe cmpsb */
+   case 0xA7: /* F3 A7: repe cmps{w,l,q} */
+      if (haveF3(pfx) && !haveF2(pfx)) {
+         if (opc == 0xA6)
+            sz = 1;
+         dis_REP_op ( AMD64CondZ, dis_CMPS, sz, 
+                                  guest_rip_curr_instr,
+                                  guest_rip_bbstart+delta, "repe cmps" );
+         whatNext = Dis_StopHere;
+         break;
+      }
+      goto decode_failure;
+
+//..   case 0xF3: { 
 //..       Addr32 eip_orig = guest_eip_bbstart + delta - 1;
 //..       vassert(sorb == 0);
 //..       abyte = getUChar(delta); delta++;
@@ -12389,102 +12389,51 @@
 //.. //--          eip = dis_cmpxchg8b ( cb, sorb, eip );
 //.. //--          break;
 //.. //-- 
-//..       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
-//.. 
-//..       case 0xA2: { /* CPUID */
-//..          /* Uses dirty helper: 
-//..                void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
-//..             declared to mod eax, wr ebx, ecx, edx
-//..          */
-//..          IRDirty* d     = NULL;
-//..          HChar*   fName = NULL;
-//..          void*    fAddr = NULL;
-//..          switch (subarch) {
-//..             case VexSubArchX86_sse0:
-//..                fName = "x86g_dirtyhelper_CPUID_sse0";
-//..                fAddr = &x86g_dirtyhelper_CPUID_sse0; 
-//..                break;
-//..             case VexSubArchX86_sse1:
-//..                fName = "x86g_dirtyhelper_CPUID_sse1";
-//..                fAddr = &x86g_dirtyhelper_CPUID_sse1; 
-//..                break;
-//..             case VexSubArchX86_sse2:
-//..                fName = "x86g_dirtyhelper_CPUID_sse2";
-//..                fAddr = &x86g_dirtyhelper_CPUID_sse2; 
-//..                break;
-//..             default:
-//..                vpanic("disInstr(x86)(cpuid)");
-//..          }
-//..          vassert(fName); vassert(fAddr);
-//..          d = unsafeIRDirty_0_N ( 0/*regparms*/, 
-//..                                  fName, fAddr, mkIRExprVec_0() );
-//..          /* declare guest state effects */
-//..          d->needsBBP = True;
-//..          d->nFxState = 4;
-//..          d->fxState[0].fx     = Ifx_Modify;
-//..          d->fxState[0].offset = OFFB_EAX;
-//..          d->fxState[0].size   = 4;
-//..          d->fxState[1].fx     = Ifx_Write;
-//..          d->fxState[1].offset = OFFB_EBX;
-//..          d->fxState[1].size   = 4;
-//..          d->fxState[2].fx     = Ifx_Write;
-//..          d->fxState[2].offset = OFFB_ECX;
-//..          d->fxState[2].size   = 4;
-//..          d->fxState[3].fx     = Ifx_Write;
-//..          d->fxState[3].offset = OFFB_EDX;
-//..          d->fxState[3].size   = 4;
-//..          /* execute the dirty call, side-effecting guest state */
-//..          stmt( IRStmt_Dirty(d) );
-//..          /* CPUID is a serialising insn.  So, just in case someone is
-//..             using it as a memory fence ... */
-//..          stmt( IRStmt_MFence() );
-//..          DIP("cpuid\n");
-//..          break;
-//..       }
-//.. 
-//.. //--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
-//.. //--             goto decode_failure;
-//.. //-- 
-//.. //--          t1 = newTemp(cb);
-//.. //--          t2 = newTemp(cb);
-//.. //--          t3 = newTemp(cb);
-//.. //--          t4 = newTemp(cb);
-//.. //--          uInstr0(cb, CALLM_S, 0);
-//.. //-- 
-//.. //--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
-//.. //--          uInstr1(cb, PUSH,  4, TempReg, t1);
-//.. //-- 
-//.. //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
-//.. //--          uLiteral(cb, 0);
-//.. //--          uInstr1(cb, PUSH,  4, TempReg, t2);
-//.. //-- 
-//.. //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
-//.. //--          uLiteral(cb, 0);
-//.. //--          uInstr1(cb, PUSH,  4, TempReg, t3);
-//.. //-- 
-//.. //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
-//.. //--          uLiteral(cb, 0);
-//.. //--          uInstr1(cb, PUSH,  4, TempReg, t4);
-//.. //-- 
-//.. //--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
-//.. //--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
-//.. //-- 
-//.. //--          uInstr1(cb, POP,   4, TempReg, t4);
-//.. //--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
-//.. //-- 
-//.. //--          uInstr1(cb, POP,   4, TempReg, t3);
-//.. //--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
-//.. //-- 
-//.. //--          uInstr1(cb, POP,   4, TempReg, t2);
-//.. //--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
-//.. //-- 
-//.. //--          uInstr1(cb, POP,   4, TempReg, t1);
-//.. //--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
-//.. //-- 
-//.. //--          uInstr0(cb, CALLM_E, 0);
-//.. //--          DIP("cpuid\n");
-//.. //--          break;
-//.. //-- 
+      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
+
+      case 0xA2: { /* CPUID */
+         /* Uses dirty helper: 
+               void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
+            declared to mod rax, wr rbx, rcx, rdx
+         */
+         IRDirty* d     = NULL;
+         HChar*   fName = NULL;
+         void*    fAddr = NULL;
+         switch (subarch) {
+            case VexSubArch_NONE:
+               fName = "amd64g_dirtyhelper_CPUID";
+               fAddr = &amd64g_dirtyhelper_CPUID; 
+               break;
+            default:
+               vpanic("disInstr(amd64)(cpuid)");
+         }
+         vassert(fName); vassert(fAddr);
+         d = unsafeIRDirty_0_N ( 0/*regparms*/, 
+                                 fName, fAddr, mkIRExprVec_0() );
+         /* declare guest state effects */
+         d->needsBBP = True;
+         d->nFxState = 4;
+         d->fxState[0].fx     = Ifx_Modify;
+         d->fxState[0].offset = OFFB_RAX;
+         d->fxState[0].size   = 8;
+         d->fxState[1].fx     = Ifx_Write;
+         d->fxState[1].offset = OFFB_RBX;
+         d->fxState[1].size   = 8;
+         d->fxState[2].fx     = Ifx_Write;
+         d->fxState[2].offset = OFFB_RCX;
+         d->fxState[2].size   = 8;
+         d->fxState[3].fx     = Ifx_Write;
+         d->fxState[3].offset = OFFB_RDX;
+         d->fxState[3].size   = 8;
+         /* execute the dirty call, side-effecting guest state */
+         stmt( IRStmt_Dirty(d) );
+         /* CPUID is a serialising insn.  So, just in case someone is
+            using it as a memory fence ... */
+         stmt( IRStmt_MFence() );
+         DIP("cpuid\n");
+         break;
+      }
+
       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
 
       case 0xB6: /* MOVZXb Eb,Gv */
diff --git a/priv/host-amd64/hdefs.c b/priv/host-amd64/hdefs.c
index 00d1df7..cdb1a10 100644
--- a/priv/host-amd64/hdefs.c
+++ b/priv/host-amd64/hdefs.c
@@ -514,13 +514,13 @@
    }
 }
  
-//.. HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
-//..    switch (op) {
-//..       case Xun_NOT: return "not";
-//..       case Xun_NEG: return "neg";
-//..       default: vpanic("showAMD64UnaryOp");
-//..    }
-//.. }
+HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
+   switch (op) {
+      case Aun_NOT: return "not";
+      case Aun_NEG: return "neg";
+      default: vpanic("showAMD64UnaryOp");
+   }
+}
 
 HChar* showAMD64AluOp ( AMD64AluOp op ) {
    switch (op) {
@@ -685,13 +685,13 @@
    i->Ain.Test64.dst = dst;
    return i;
 }
-//.. AMD64Instr* AMD64Instr_Unary32  ( AMD64UnaryOp op, AMD64RM* dst ) {
-//..    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag             = Xin_Unary32;
-//..    i->Xin.Unary32.op  = op;
-//..    i->Xin.Unary32.dst = dst;
-//..    return i;
-//.. }
+AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, AMD64RM* dst ) {
+   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag             = Ain_Unary64;
+   i->Ain.Unary64.op  = op;
+   i->Ain.Unary64.dst = dst;
+   return i;
+}
 AMD64Instr* AMD64Instr_MulL ( Bool syned, Int sz, AMD64RM* src ) {
    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    i->tag            = Ain_MulL;
@@ -793,17 +793,13 @@
 //..    i->Xin.Bsfr32.dst    = dst;
 //..    return i;
 //.. }
-//.. AMD64Instr* AMD64Instr_MFence ( VexSubArch subarch )
-//.. {
-//..    AMD64Instr* i           = LibVEX_Alloc(sizeof(AMD64Instr));
-//..    i->tag                = Xin_MFence;
-//..    i->Xin.MFence.subarch = subarch;
-//..    vassert(subarch == VexSubArchAMD64_sse0
-//..            || subarch == VexSubArchAMD64_sse1
-//..            || subarch == VexSubArchAMD64_sse2);
-//..    return i;
-//.. }
-//.. 
+AMD64Instr* AMD64Instr_MFence ( void )
+{
+   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
+   i->tag                = Ain_MFence;
+   return i;
+}
+
 //.. AMD64Instr* AMD64Instr_FpUnary ( AMD64FpOp op, HReg src, HReg dst ) {
 //..    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
 //..    i->tag             = Xin_FpUnary;
@@ -1001,10 +997,10 @@
          vex_printf(",");
          ppAMD64RM(i->Ain.Test64.dst);
          return;
-//..       case Xin_Unary32:
-//..          vex_printf("%sl ", showAMD64UnaryOp(i->Xin.Unary32.op));
-//..          ppAMD64RM(i->Xin.Unary32.dst);
-//..          return;
+      case Ain_Unary64:
+         vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
+         ppAMD64RM(i->Ain.Unary64.dst);
+         return;
       case Ain_MulL:
          vex_printf("%cmul%s ",
                     i->Ain.MulL.syned ? 's' : 'u',
@@ -1101,10 +1097,9 @@
 //..          vex_printf(",");
 //..          ppHRegAMD64(i->Xin.Bsfr32.dst);
 //..          return;
-//..       case Xin_MFence:
-//..          vex_printf("mfence(%s)",
-//..                     LibVEX_ppVexSubArch(i->Xin.MFence.subarch));
-//..          return;
+      case Ain_MFence:
+         vex_printf("mfence" );
+         return;
 //..       case Xin_FpUnary:
 //..          vex_printf("g%sD ", showAMD64FpOp(i->Xin.FpUnary.op));
 //..          ppHRegAMD64(i->Xin.FpUnary.src);
@@ -1279,9 +1274,9 @@
          addRegUsage_AMD64RI(u, i->Ain.Test64.src);
          addRegUsage_AMD64RM(u, i->Ain.Test64.dst, HRmRead);
          return;
-//..       case Xin_Unary32:
-//..          addRegUsage_AMD64RM(u, i->Xin.Unary32.dst, HRmModify);
-//..          return;
+      case Ain_Unary64:
+         addRegUsage_AMD64RM(u, i->Ain.Unary64.dst, HRmModify);
+         return;
       case Ain_MulL:
          addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
          addHRegUse(u, HRmModify, hregAMD64_RAX());
@@ -1367,8 +1362,8 @@
 //..          addHRegUse(u, HRmRead, i->Xin.Bsfr32.src);
 //..          addHRegUse(u, HRmWrite, i->Xin.Bsfr32.dst);
 //..          return;
-//..       case Xin_MFence:
-//..          return;
+      case Ain_MFence:
+         return;
 //..       case Xin_FpUnary:
 //..          addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
 //..          addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
@@ -1510,9 +1505,9 @@
          mapRegs_AMD64RI(m, i->Ain.Test64.src);
          mapRegs_AMD64RM(m, i->Ain.Test64.dst);
          return;
-//..       case Xin_Unary32:
-//..          mapRegs_AMD64RM(m, i->Xin.Unary32.dst);
-//..          return;
+      case Ain_Unary64:
+         mapRegs_AMD64RM(m, i->Ain.Unary64.dst);
+         return;
       case Ain_MulL:
          mapRegs_AMD64RM(m, i->Ain.MulL.src);
          return;
@@ -1554,8 +1549,8 @@
 //..          mapReg(m, &i->Xin.Bsfr32.src);
 //..          mapReg(m, &i->Xin.Bsfr32.dst);
 //..          return;
-//..       case Xin_MFence:
-//..          return;
+      case Ain_MFence:
+         return;
 //..       case Xin_FpUnary:
 //..          mapReg(m, &i->Xin.FpUnary.src);
 //..          mapReg(m, &i->Xin.FpUnary.dst);
@@ -1676,42 +1671,42 @@
    condition codes. */
 
 AMD64Instr* genSpill_AMD64 ( HReg rreg, Int offsetB )
-{vassert(0);
-//..    AMD64AMode* am;
-//..    vassert(offsetB >= 0);
-//..    vassert(!hregIsVirtual(rreg));
-//..    am = AMD64AMode_IR(offsetB, hregAMD64_EBP());
-//.. 
-//..    switch (hregClass(rreg)) {
-//..       case HRcInt32:
-//..          return AMD64Instr_Alu32M ( Xalu_MOV, AMD64RI_Reg(rreg), am );
-//..       case HRcFlt64:
-//..          return AMD64Instr_FpLdSt ( False/*store*/, 8, rreg, am );
-//..       case HRcVec128:
-//..          return AMD64Instr_SseLdSt ( False/*store*/, rreg, am );
-//..       default: 
-//..          ppHRegClass(hregClass(rreg));
-//..          vpanic("genSpill_AMD64: unimplemented regclass");
-//..    }
+{
+   AMD64AMode* am;
+   vassert(offsetB >= 0);
+   vassert(!hregIsVirtual(rreg));
+   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
+
+   switch (hregClass(rreg)) {
+      case HRcInt64:
+         return AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
+      //case HRcFlt64:
+      //   return AMD64Instr_FpLdSt ( False/*store*/, 8, rreg, am );
+      //case HRcVec128:
+      //   return AMD64Instr_SseLdSt ( False/*store*/, rreg, am );
+      default: 
+         ppHRegClass(hregClass(rreg));
+         vpanic("genSpill_AMD64: unimplemented regclass");
+   }
 }
 
 AMD64Instr* genReload_AMD64 ( HReg rreg, Int offsetB )
-{vassert(0);
-//..    AMD64AMode* am;
-//..    vassert(offsetB >= 0);
-//..    vassert(!hregIsVirtual(rreg));
-//..    am = AMD64AMode_IR(offsetB, hregAMD64_EBP());
-//..    switch (hregClass(rreg)) {
-//..       case HRcInt32:
-//..          return AMD64Instr_Alu32R ( Xalu_MOV, AMD64RMI_Mem(am), rreg );
-//..       case HRcFlt64:
-//..          return AMD64Instr_FpLdSt ( True/*load*/, 8, rreg, am );
-//..       case HRcVec128:
-//..          return AMD64Instr_SseLdSt ( True/*load*/, rreg, am );
-//..       default: 
-//..          ppHRegClass(hregClass(rreg));
-//..          vpanic("genReload_AMD64: unimplemented regclass");
-//..    }
+{
+   AMD64AMode* am;
+   vassert(offsetB >= 0);
+   vassert(!hregIsVirtual(rreg));
+   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
+   switch (hregClass(rreg)) {
+      case HRcInt64:
+         return AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
+      //case HRcFlt64:
+      //   return AMD64Instr_FpLdSt ( True/*load*/, 8, rreg, am );
+      //case HRcVec128:
+      //   return AMD64Instr_SseLdSt ( True/*load*/, rreg, am );
+      default: 
+         ppHRegClass(hregClass(rreg));
+         vpanic("genReload_AMD64: unimplemented regclass");
+   }
 }
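
Concretely, these two helpers turn a spill or reload of a 64-bit register into an RBP-relative move. A sketch of the instructions produced for a hypothetical slot at offset 16 (register, offset and include path are assumptions made for the example):

/* Illustrative only -- not part of the patch.  Roughly what
   genSpill_AMD64/genReload_AMD64 above now produce for a 64-bit real
   register and a hypothetical spill slot at offset 16 from %rbp.
   The register (RAX) and offset are made up; the include path is
   assumed. */
#include "host-amd64/hdefs.h"

static void example_spill_reload ( void )
{
   AMD64AMode* slot   = AMD64AMode_IR(16, hregAMD64_RBP());
   /* spill:  movq %rax, 16(%rbp) */
   AMD64Instr* spill  = AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(hregAMD64_RAX()), slot);
   /* reload: movq 16(%rbp), %rax */
   AMD64Instr* reload = AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Mem(slot), hregAMD64_RAX());
   (void)spill; (void)reload;
}
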
 
 
@@ -2084,21 +2079,23 @@
                goto bad;
          }
       }
-//..       /* MUL */
-//..       if (i->Xin.Alu32R.op == Xalu_MUL) {
-//..          switch (i->Xin.Alu32R.src->tag) {
+      /* MUL */
+      if (i->Ain.Alu64R.op == Aalu_MUL) {
+         switch (i->Ain.Alu64R.src->tag) {
 //..             case Xrmi_Reg:
 //..                *p++ = 0x0F;
 //..                *p++ = 0xAF;
 //..                p = doAMode_R(p, i->Xin.Alu32R.dst,
 //..                                 i->Xin.Alu32R.src->Xrmi.Reg.reg);
 //..                goto done;
-//..             case Xrmi_Mem:
-//..                *p++ = 0x0F;
-//..                *p++ = 0xAF;
-//..                p = doAMode_M(p, i->Xin.Alu32R.dst,
-//..                                 i->Xin.Alu32R.src->Xrmi.Mem.am);
-//..                goto done;
+            case Armi_Mem:
+               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
+                                 i->Ain.Alu64R.src->Armi.Mem.am);
+               *p++ = 0x0F;
+               *p++ = 0xAF;
+               p = doAMode_M(p, i->Ain.Alu64R.dst,
+                                i->Ain.Alu64R.src->Armi.Mem.am);
+               goto done;
 //..             case Xrmi_Imm:
 //..                if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
 //..                   *p++ = 0x6B;
@@ -2110,10 +2107,10 @@
 //..                   p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
 //..                }
 //..                goto done;
-//..             default:
-//..                goto bad;
-//..          }
-//..       }
+            default:
+               goto bad;
+         }
+      }
       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
       opc = opc_rr = subopc_imm = opc_imma = 0;
       switch (i->Ain.Alu64R.op) {
@@ -2272,16 +2269,17 @@
       }
       break;
 
-//..    case Xin_Unary32:
-//..       if (i->Xin.Unary32.op == Xun_NOT) {
-//..          *p++ = 0xF7;
-//..          if (i->Xin.Unary32.dst->tag == Xrm_Reg) {
-//..             p = doAMode_R(p, fake(2), i->Xin.Unary32.dst->Xrm.Reg.reg);
-//..             goto done;
-//..          } else {
-//..             goto bad;
-//..          }
-//..       }
+   case Ain_Unary64:
+      if (i->Ain.Unary64.op == Aun_NOT) {
+         if (i->Ain.Unary64.dst->tag == Arm_Reg) {
+            *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst->Arm.Reg.reg);
+            *p++ = 0xF7;
+            p = doAMode_R(p, fake(2), i->Ain.Unary64.dst->Arm.Reg.reg);
+            goto done;
+         } else {
+            goto bad;
+         }
+      }
 //..       if (i->Xin.Unary32.op == Xun_NEG) {
 //..          *p++ = 0xF7;
 //..          if (i->Xin.Unary32.dst->tag == Xrm_Reg) {
@@ -2291,7 +2289,7 @@
 //..             goto bad;
 //..          }
 //..       }
-//..       break;
+      break;
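
For the register form handled above, the emitted bytes are a REX.W prefix, opcode 0xF7 and a ModRM byte with reg field /2. Worked out by hand for notq %rcx (illustrative only):

/* Illustrative only: what the Ain_Unary64/Aun_NOT case above emits for
   "notq %rcx" -- REX.W (0x48), opcode 0xF7, ModRM 11 010 001 = 0xD1
   (mod = register-direct, reg = /2 for NOT, rm = rcx). */
static const unsigned char notq_rcx_bytes[3] = { 0x48, 0xF7, 0xD1 };
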
 
    case Ain_MulL:
       subopc = i->Ain.MulL.syned ? 5 : 4;
@@ -2562,32 +2560,11 @@
 //..       }
 //..       p = doAMode_R(p, i->Xin.Bsfr32.dst, i->Xin.Bsfr32.src);
 //..       goto done;
-//.. 
-//..    case Xin_MFence:
-//..       /* see comment in hdefs.h re this insn */
-//..       if (0) vex_printf("EMIT FENCE\n");
-//..       switch (i->Xin.MFence.subarch) {
-//..          case VexSubArchAMD64_sse0:
-//..             vassert(0); /* awaiting test case */
-//..             /* lock addl $0,0(%esp) */
-//..             *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44; 
-//..             *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
-//..             goto done;
-//..          case VexSubArchAMD64_sse1:
-//..             /* sfence */
-//..             *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
-//..             /* lock addl $0,0(%esp) */
-//..             *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44; 
-//..             *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
-//..             goto done;
-//..          case VexSubArchAMD64_sse2:
-//..             /* mfence */
-//..             *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
-//..             goto done;
-//..          default: 
-//..             vpanic("emit_AMD64Instr:mfence:subarch");
-//..       }
-//..       break;
+
+   case Ain_MFence:
+      /* mfence */
+      *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
+      goto done;
 
    case Ain_Store:
       if (i->Ain.Store.sz == 2) {
diff --git a/priv/host-amd64/hdefs.h b/priv/host-amd64/hdefs.h
index e1e44f9..96026f2 100644
--- a/priv/host-amd64/hdefs.h
+++ b/priv/host-amd64/hdefs.h
@@ -252,17 +252,17 @@
 extern void ppAMD64RM ( AMD64RM* );
 
 
-//.. /* --------- Instructions. --------- */
-//.. 
-//.. /* --------- */
-//.. typedef
-//..    enum {
-//..       Xun_NEG,
-//..       Xun_NOT
-//..    }
-//..    X86UnaryOp;
-//.. 
-//.. extern HChar* showX86UnaryOp ( X86UnaryOp );
+/* --------- Instructions. --------- */
+
+/* --------- */
+typedef
+   enum {
+      Aun_NEG,
+      Aun_NOT
+   }
+   AMD64UnaryOp;
+
+extern HChar* showAMD64UnaryOp ( AMD64UnaryOp );
 
 
 /* --------- */
@@ -358,7 +358,7 @@
       Ain_Alu64M,    /* 64-bit mov/arith/logical, dst=MEM */
       Ain_Sh64,      /* 64-bit shift/rotate, dst=REG or MEM */
       Ain_Test64,    /* 64-bit test (AND, set flags, discard result) */
-//..       Xin_Unary32,   /* 32-bit not and neg */
+      Ain_Unary64,   /* 64-bit not and neg */
       Ain_MulL,      /* widening multiply */
 //..       Xin_Div,       /* div and mod */
 //..       Xin_Sh3232,    /* shldl or shrdl */
@@ -371,7 +371,7 @@
       Ain_Store,     /* store 32/16/8 bit value in memory */
 //..       Xin_Set32,     /* convert condition code to 32-bit value */
 //..       Xin_Bsfr32,    /* 32-bit bsf/bsr */
-//..       Xin_MFence,    /* mem fence (not just sse2, but sse0 and 1 too) */
+      Ain_MFence,    /* mem fence */
 //.. 
 //..       Xin_FpUnary,   /* FP fake unary op */
 //..       Xin_FpBinary,  /* FP fake binary op */
@@ -425,11 +425,11 @@
             AMD64RI* src;
             AMD64RM* dst;
          } Test64;
-//..          /* Not and Neg */
-//..          struct {
-//..             X86UnaryOp op;
-//..             X86RM*     dst;
-//..          } Unary32;
+         /* Not and Neg */
+         struct {
+            AMD64UnaryOp op;
+            AMD64RM*     dst;
+         } Unary64;
          /* DX:AX = AX *s/u r/m16, or EDX:EAX = EAX *s/u r/m32,
             or RDX:RAX = RAX *s/u r/m64 */
          struct {
@@ -503,17 +503,12 @@
 //..             HReg src;
 //..             HReg dst;
 //..          } Bsfr32;
-//..          /* Mem fence (not just sse2, but sse0 and 1 too).  In short,
-//..             an insn which flushes all preceding loads and stores as
-//..             much as possible before continuing.  On SSE2 we emit a
-//..             real "mfence", on SSE1 "sfence ; lock addl $0,0(%esp)" and
-//..             on SSE0 "lock addl $0,0(%esp)".  This insn therefore
-//..             carries the subarch so the assembler knows what to
-//..             emit. */
-//..          struct {
-//..             VexSubArch subarch;
-//..          } MFence;
-//.. 
+         /* Mem fence.  In short, an insn which flushes all preceding
+            loads and stores as much as possible before continuing.
+            On AMD64 we emit a real "mfence". */
+         struct {
+         } MFence;
+
 //..          /* X86 Floating point (fake 3-operand, "flat reg file" insns) */
 //..          struct {
 //..             X86FpOp op;
@@ -633,7 +628,7 @@
 extern AMD64Instr* AMD64Instr_Imm64     ( ULong imm64, HReg dst );
 extern AMD64Instr* AMD64Instr_Alu64R    ( AMD64AluOp, AMD64RMI*, HReg );
 extern AMD64Instr* AMD64Instr_Alu64M    ( AMD64AluOp, AMD64RI*,  AMD64AMode* );
-//.. extern AMD64Instr* AMD64Instr_Unary32   ( AMD64UnaryOp op, AMD64RM* dst );
+extern AMD64Instr* AMD64Instr_Unary64   ( AMD64UnaryOp op, AMD64RM* dst );
 extern AMD64Instr* AMD64Instr_Sh64      ( AMD64ShiftOp, UInt, AMD64RM* );
 extern AMD64Instr* AMD64Instr_Test64    ( AMD64RI* src, AMD64RM* dst );
 extern AMD64Instr* AMD64Instr_MulL      ( Bool syned, Int sz, AMD64RM* );
@@ -649,7 +644,7 @@
 extern AMD64Instr* AMD64Instr_Store     ( UChar sz, HReg src, AMD64AMode* dst );
 //.. extern AMD64Instr* AMD64Instr_Set32     ( AMD64CondCode cond, HReg dst );
 //.. extern AMD64Instr* AMD64Instr_Bsfr32    ( Bool isFwds, HReg src, HReg dst );
-//.. extern AMD64Instr* AMD64Instr_MFence    ( VexSubArch );
+extern AMD64Instr* AMD64Instr_MFence    ( void );
 //.. 
 //.. extern AMD64Instr* AMD64Instr_FpUnary   ( AMD64FpOp op, HReg src, HReg dst );
 //.. extern AMD64Instr* AMD64Instr_FpBinary  ( AMD64FpOp op, HReg srcL, HReg srcR, HReg dst );
diff --git a/priv/host-amd64/isel.c b/priv/host-amd64/isel.c
index b12998f..3c336a6 100644
--- a/priv/host-amd64/isel.c
+++ b/priv/host-amd64/isel.c
@@ -789,7 +789,8 @@
          case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64: 
             aluOp = Aalu_XOR; break;
 //..          case Iop_Mul16: case Iop_Mul32: 
-//..             aluOp = Xalu_MUL; break;
+         case Iop_Mul64:
+            aluOp = Aalu_MUL; break;
          default:
             aluOp = Aalu_INVALID; break;
       }
@@ -1104,13 +1105,14 @@
          }
 //.. 	 case Iop_Not8:
 //.. 	 case Iop_Not16:
-//..          case Iop_Not32: {
-//..             HReg dst = newVRegI(env);
-//..             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
-//..             addInstr(env, mk_iMOVsd_RR(src,dst) );
-//..             addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(dst)));
-//..             return dst;
-//..          }
+//..          case Iop_Not32:
+         case Iop_Not64: {
+            HReg dst = newVRegI(env);
+            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+            addInstr(env, mk_iMOVsd_RR(src,dst) );
+            addInstr(env, AMD64Instr_Unary64(Aun_NOT,AMD64RM_Reg(dst)));
+            return dst;
+         }
 //..          case Iop_64HIto32: {
 //..             HReg rHi, rLo;
 //..             iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
@@ -1661,29 +1663,30 @@
 //..          return iselCondCode(env, mi.bindee[0]);
 //..       }
 //..    }
-//.. 
-//..    /* Cmp*32*(x,y) */
-//..    if (e->tag == Iex_Binop 
-//..        && (e->Iex.Binop.op == Iop_CmpEQ32
-//..            || e->Iex.Binop.op == Iop_CmpNE32
-//..            || e->Iex.Binop.op == Iop_CmpLT32S
-//..            || e->Iex.Binop.op == Iop_CmpLT32U
-//..            || e->Iex.Binop.op == Iop_CmpLE32S
-//..            || e->Iex.Binop.op == Iop_CmpLE32U)) {
-//..       HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
-//..       X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
-//..       addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
-//..       switch (e->Iex.Binop.op) {
-//..          case Iop_CmpEQ32:  return Xcc_Z;
-//..          case Iop_CmpNE32:  return Xcc_NZ;
-//..          case Iop_CmpLT32S: return Xcc_L;
-//..          case Iop_CmpLT32U: return Xcc_B;
-//..          case Iop_CmpLE32S: return Xcc_LE;
-//..          case Iop_CmpLE32U: return Xcc_BE;
-//..          default: vpanic("iselCondCode(x86): CmpXX32");
-//..       }
-//..    }
-//.. 
+
+   /* Cmp*64*(x,y) */
+   if (e->tag == Iex_Binop 
+       && (e->Iex.Binop.op == Iop_CmpEQ64
+           || e->Iex.Binop.op == Iop_CmpNE64
+           //|| e->Iex.Binop.op == Iop_CmpLT64S
+           //|| e->Iex.Binop.op == Iop_CmpLT64U
+           //|| e->Iex.Binop.op == Iop_CmpLE64S
+           //|| e->Iex.Binop.op == Iop_CmpLE64U
+          )) {
+      HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
+      AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+      addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
+      switch (e->Iex.Binop.op) {
+         case Iop_CmpEQ64:  return Acc_Z;
+         case Iop_CmpNE64:  return Acc_NZ;
+         //case Iop_CmpLT64S: return Acc_L;
+         //case Iop_CmpLT64U: return Acc_B;
+         //case Iop_CmpLE64S: return Acc_LE;
+         //case Iop_CmpLE64U: return Acc_BE;
+         default: vpanic("iselCondCode(amd64): CmpXX64");
+      }
+   }
+
 //..    /* CmpNE64(1Sto64(b), 0) ==> b */
 //..    {
 //..       DECLARE_PATTERN(p_CmpNE64_1Sto64);
@@ -3419,48 +3422,39 @@
       break;
    }
 
-//..    /* --------- Call to DIRTY helper --------- */
-//..    case Ist_Dirty: {
-//..       IRType   retty;
-//..       IRDirty* d = stmt->Ist.Dirty.details;
-//..       Bool     passBBP = False;
-//.. 
-//..       if (d->nFxState == 0)
-//..          vassert(!d->needsBBP);
-//..       passBBP = d->nFxState > 0 && d->needsBBP;
-//.. 
-//..       /* Marshal args, do the call, clear stack. */
-//..       doHelperCall( env, passBBP, d->guard, d->cee, d->args );
-//.. 
-//..       /* Now figure out what to do with the returned value, if any. */
-//..       if (d->tmp == IRTemp_INVALID)
-//..          /* No return value.  Nothing to do. */
-//..          return;
-//.. 
-//..       retty = typeOfIRTemp(env->type_env, d->tmp);
-//..       if (retty == Ity_I64) {
-//..          HReg dstHi, dstLo;
-//..          /* The returned value is in %edx:%eax.  Park it in the
-//..             register-pair associated with tmp. */
-//..          lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
-//..          addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
-//..          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
-//..          return;
-//..       }
-//..       if (retty == Ity_I32 || retty == Ity_I16 || retty == Ity_I8) {
-//..          /* The returned value is in %eax.  Park it in the register
-//..             associated with tmp. */
-//..          HReg dst = lookupIRTemp(env, d->tmp);
-//..          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
-//..          return;
-//..       }
-//..       break;
-//..    }
-//.. 
-//..    /* --------- MEM FENCE --------- */
-//..    case Ist_MFence:
-//..       addInstr(env, X86Instr_MFence(env->subarch));
-//..       return;
+   /* --------- Call to DIRTY helper --------- */
+   case Ist_Dirty: {
+      IRType   retty;
+      IRDirty* d = stmt->Ist.Dirty.details;
+      Bool     passBBP = False;
+
+      if (d->nFxState == 0)
+         vassert(!d->needsBBP);
+      passBBP = d->nFxState > 0 && d->needsBBP;
+
+      /* Marshal args, do the call, clear stack. */
+      doHelperCall( env, passBBP, d->guard, d->cee, d->args );
+
+      /* Now figure out what to do with the returned value, if any. */
+      if (d->tmp == IRTemp_INVALID)
+         /* No return value.  Nothing to do. */
+         return;
+
+      retty = typeOfIRTemp(env->type_env, d->tmp);
+      if (retty == Ity_I64) {
+         /* The returned value is in %rax.  Park it in the register
+            associated with tmp. */
+         HReg dst = lookupIRTemp(env, d->tmp);
+         addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
+         return;
+      }
+      break;
+   }
+
+   /* --------- MEM FENCE --------- */
+   case Ist_MFence:
+      addInstr(env, AMD64Instr_MFence());
+      return;
 
    /* --------- EXIT --------- */
    case Ist_Exit: {
diff --git a/pub/libvex_guest_amd64.h b/pub/libvex_guest_amd64.h
index 01c9d86..ade65fe 100644
--- a/pub/libvex_guest_amd64.h
+++ b/pub/libvex_guest_amd64.h
@@ -67,7 +67,9 @@
       ULong  guest_CC_DEP1;
       ULong  guest_CC_DEP2;
       ULong  guest_CC_NDEP;
-      /* EIP */
+      /* The D flag is stored here, encoded as either -1 or +1 */
+      ULong  guest_DFLAG;       /* 48 */
+      /* RIP */
       ULong  guest_RIP;
       /* Probably a lot more stuff too. 
          D,ID flags