Implement support for the MMX instruction set.  The scheme used is
the same as that for FPU instructions.  That is, regard the MMX state
(which is the same as the FPU state) opaquely, and every time we
need to do an MMX instruction, move the simulated MMX state into the
real CPU, do the instruction, and move it back.  JeremyF's optimisation
to minimise FPU saves/restores applies automatically here.

So, this scheme is simple.  It will cause memcheck to complain bitterly
if uninitialised data is copied through the MMX registers, in the same
way that memcheck complains if you move uninit data through the FPU
registers.  Whether this turns out to be a problem remains to be seen.

Most instructions are done, and doing the rest is easy enough; I just
need people to send test cases so I can do them on demand.

(Core) UCode has been extended with 7 new uinstrs:

   MMX1 MMX2 MMX3
      -- 1/2/3 byte mmx insns, no references to
         integer regs or memory, copy exactly to the output stream.

   MMX2_MemRd  MMX2_MemWr
      -- 2 byte mmx insns which read/write memory and therefore need
         to have an address register patched in at code generation
         time.  These are the analogues to FPU_R / FPU_W.

   MMX2_RegRd  MMX2_RegWr
      -- These have no analogues in FPU land.  They hold 2 byte insns
         which move data to/from a normal integer register (%eax etc),
         and so this has to be made explicit so that (1) a suitable
         int reg can be patched in at codegen time, and (2) so that
         memcheck can do suitable magic with the V bits going into/
         out of the MMX regs.

Nulgrind (ok, this is a nop, but still ...) and AddrCheck's
instrumenters have been extended to cover these new UInstrs.  All
others (cachesim, memcheck, lackey, helgrind, did I forget any?)
abort when they see any of them.  This may be overkill but at least
it ensures we don't forget to implement it in those skins.
[A bad thing would be that some skin silently passes along
MMX uinstrs because of a default: case, when it should actually
do something with them.]

If this works out well, I propose to backport this to 2_0_BRANCH.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1483 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/addrcheck/ac_main.c b/addrcheck/ac_main.c
index 1bd842e..8de3290 100644
--- a/addrcheck/ac_main.c
+++ b/addrcheck/ac_main.c
@@ -1094,6 +1094,18 @@
             VG_(copy_UInstr)(cb, u_in);
             break;
 
+         case MMX2_MemRd:
+         case MMX2_MemWr:
+            sk_assert(u_in->size == 8);
+            t_addr = u_in->val2;
+            t_size = newTemp(cb);
+	    uInstr2(cb, MOV, 4, Literal, 0, TempReg, t_size);
+	    uLiteral(cb, 8);
+            uInstr2(cb, CCALL, 0, TempReg, t_addr, TempReg, t_size);
+            uCCall(cb, (Addr) & ac_fpu_ACCESS_check, 2, 2, False );
+            VG_(copy_UInstr)(cb, u_in);
+            break;
+
          default:
             VG_(copy_UInstr)(cb, u_in);
             break;
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index 2a91bc0..0f79ecb 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -959,6 +959,13 @@
             has_rep_prefix = False; 
             break;
 
+         case MMX1: case MMX2: case MMX3:
+         case MMX2_MemRd: case MMX2_MemWr: 
+         case MMX2_RegRd: case MMX2_RegWr:
+            VG_(skin_panic)(
+               "I don't know how to instrument MMXish stuff (yet)");
+            break;
+
          default:
             VG_(copy_UInstr)(cb, u_in);
             break;
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index a70f157..4a48bdd 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1382,6 +1382,49 @@
                   nameIReg(4,reg) );
 }
 
+static void emit_MMX2_regmem ( FlagSet uses_sflags, /* forwarded to VG_(new_emit) */
+                               FlagSet sets_sflags, /* forwarded to VG_(new_emit) */
+			       UChar first_byte,    /* byte emitted right after 0x0F */
+                               UChar second_byte,   /* only bits 5:3 are used */
+                               Int ireg )           /* register handed to the amode emitter */
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( 0x0F );         /* fixed leading byte */
+   VG_(emitB) ( first_byte );
+   second_byte &= 0x38; /* keep bits 5:3 only, i.e. mask out mod and rm fields */
+   emit_amode_regmem_reg ( ireg, second_byte >> 3 ); /* presumably emits the (ireg) amode -- confirm */
+   if (dis)                     /* trace output; second_byte is printed after masking */
+      VG_(printf)("\n\t\tmmx2-0x%x:0x%x-(%s)\n", 
+                  (UInt)first_byte, (UInt)second_byte,
+                  nameIReg(4,ireg) );
+}
+
+static void emit_MMX2_no_mem ( FlagSet uses_sflags, /* forwarded to VG_(new_emit) */
+                               FlagSet sets_sflags, /* forwarded to VG_(new_emit) */
+			       UChar first_byte,    /* byte emitted right after 0x0F */
+                               UChar second_byte )  /* emitted verbatim as the third byte */
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( 0x0F );         /* fixed leading byte */
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );  /* no masking, unlike emit_MMX2_regmem */
+   if (dis)                     /* trace the two variable bytes when dis is set */
+      VG_(printf)("\n\t\tmmx2-0x%x:0x%x\n", 
+                  (UInt)first_byte, (UInt)second_byte );
+}
+
+static void emit_MMX1_no_mem ( FlagSet uses_sflags, /* forwarded to VG_(new_emit) */
+                               FlagSet sets_sflags, /* forwarded to VG_(new_emit) */
+			       UChar first_byte )   /* the only byte emitted after 0x0F */
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( 0x0F );         /* fixed leading byte */
+   VG_(emitB) ( first_byte );
+   if (dis)                     /* trace the emitted byte when dis is set */
+      VG_(printf)("\n\t\tmmx1-0x%x\n", 
+                  (UInt)first_byte );
+}
+
 
 /*----------------------------------------------------*/
 /*--- misc instruction emitters                    ---*/
@@ -2618,12 +2661,38 @@
 }
 
 
+static void synth_MMX2_regmem ( Bool uses_flags, Bool sets_flags, /* NOTE(review): Bool here, FlagSet in the callee -- confirm */
+ 			        UChar first_byte,
+                                UChar second_byte, 
+                                Int ireg )
+{  /* thin wrapper: forwards every argument unchanged to emit_MMX2_regmem */
+   emit_MMX2_regmem ( uses_flags, sets_flags, 
+                      first_byte, second_byte, ireg );
+}
+
+
+static void synth_MMX2_no_mem ( Bool uses_flags, Bool sets_flags, /* NOTE(review): Bool here, FlagSet in the callee -- confirm */
+			        UChar first_byte,
+                                UChar second_byte )
+{  /* thin wrapper: forwards every argument unchanged to emit_MMX2_no_mem */
+   emit_MMX2_no_mem ( uses_flags, sets_flags, first_byte, second_byte );
+}
+
+
+static void synth_MMX1_no_mem ( Bool uses_flags, Bool sets_flags, /* NOTE(review): Bool here, FlagSet in the callee -- confirm */
+			        UChar first_byte )
+{  /* thin wrapper: forwards every argument unchanged to emit_MMX1_no_mem */
+   emit_MMX1_no_mem ( uses_flags, sets_flags, first_byte );
+}
+
+
 static void synth_fpu_regmem ( Bool uses_flags, Bool sets_flags,
 			       UChar first_byte,
                                UChar second_byte_masked, 
                                Int reg )
 {
-   emit_fpu_regmem ( uses_flags, sets_flags, first_byte, second_byte_masked, reg );
+   emit_fpu_regmem ( uses_flags, sets_flags, 
+                     first_byte, second_byte_masked, reg );
 }
 
 
@@ -3354,6 +3423,44 @@
                             u->val1 & 0xFF );
          break;
 
+      case MMX2_MemWr:
+      case MMX2_MemRd:
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*fplive)) {
+            emit_get_fpu_state();
+            *fplive = True;
+         }
+         synth_MMX2_regmem ( u->flags_r, u->flags_w,
+                             (u->val1 >> 8) & 0xFF,
+                             u->val1 & 0xFF,
+                             u->val2 );
+         break;
+
+      case MMX1:
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == NoValue);
+	 if (!(*fplive)) {
+	    emit_get_fpu_state();
+	    *fplive = True;
+	 }
+         synth_MMX1_no_mem ( u->flags_r, u->flags_w,
+                             u->val1 & 0xFF );
+         break;
+
+      case MMX2:
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == NoValue);
+	 if (!(*fplive)) {
+	    emit_get_fpu_state();
+	    *fplive = True;
+	 }
+         synth_MMX2_no_mem ( u->flags_r, u->flags_w,
+			     (u->val1 >> 8) & 0xFF,
+                             u->val1 & 0xFF );
+         break;
+
       default: 
          if (VG_(needs).extended_UCode) {
 	    if (*fplive) {
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index 60a1ec8..2f470c9 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -112,7 +112,8 @@
    static Char* ireg16_names[8] 
      = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
    static Char* ireg8_names[8] 
-     = { "%al", "%cl", "%dl", "%bl", "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
+     = { "%al", "%cl", "%dl", "%bl", 
+         "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
    if (reg < 0 || reg > 7) goto bad;
    switch (size) {
       case 4: return ireg32_names[reg];
@@ -137,6 +138,25 @@
    }
 }
 
+Char* VG_(name_of_mmx_reg) ( Int mmxreg )  /* returns the printable name of MMX register 0..7 */
+{
+   static Char* mmx_names[8]               /* indexed directly by mmxreg */
+     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
+   if (mmxreg < 0 || mmxreg > 7) VG_(core_panic)("name_of_mmx_reg"); /* out of range: panic */
+   return mmx_names[mmxreg];
+}
+
+Char* VG_(name_of_mmx_gran) ( UChar gran )  /* dis_MMXop_regmem_to_reg passes opc & 3 here */
+{
+   switch (gran) {                          /* map 0..3 to a one-letter suffix */
+      case 0: return "b";
+      case 1: return "w";
+      case 2: return "d";
+      case 3: return "q";
+      default: VG_(core_panic)("name_of_mmx_gran"); /* no return follows: presumably core_panic never returns -- confirm */
+   }
+}
+
 Char VG_(name_of_int_size) ( Int size )
 {
    switch (size) {
@@ -3180,6 +3200,54 @@
 }
 
 
+
+/* Translate a simple MMX operation, either 
+       op   (src)mmxreg, (dst)mmxreg     -- emitted as an MMX2 uinstr
+   or
+       op   (src)address, (dst)mmxreg    -- emitted as MMX2_MemRd, size 8
+   opc is the byte following the 0x0F prefix; eip addresses the modrm
+   byte on entry.  Returns the updated eip. */
+static 
+Addr dis_MMXop_regmem_to_reg ( UCodeBlock* cb, 
+                               UChar sorb,
+                               Addr eip,
+                               UChar opc,
+                               Char* name,             /* mnemonic, printed only when dis is set */
+                               Bool show_granularity ) /* if True, append nameMMXGran(opc & 3) to name */
+{
+   UChar dis_buf[50];       /* handed to disAMode when dis is set, then printed */
+   UChar modrm;
+   modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      eip++;                /* register form: step over the modrm byte */
+      uInstr1(cb, MMX2, 0, 
+                  Lit16, 
+                  (((UShort)(opc)) << 8) | ((UShort)modrm) ); /* opc in bits 15:8, modrm in 7:0 */
+      if (dis)
+         VG_(printf)("%s%s %s, %s\n", 
+                     name,
+                     show_granularity ? nameMMXGran(opc & 3) : (Char*)"",
+                     nameMMXReg(eregOfRM(modrm)),
+                     nameMMXReg(gregOfRM(modrm)));
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);   /* used below as the TempReg address operand */
+      eip += HI8(pair);          /* presumably the amode length in bytes -- confirm */
+      uInstr2(cb, MMX2_MemRd, 8, 
+                  Lit16, 
+                  (((UShort)(opc)) << 8) | ((UShort)modrm), /* same opc:modrm packing as above */
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s%s %s, %s\n", 
+                     name,
+                     show_granularity ? nameMMXGran(opc & 3) : (Char*)"",
+                     dis_buf,
+                     nameMMXReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
 /*------------------------------------------------------------*/
 /*--- Disassembling entire basic blocks                    ---*/
 /*------------------------------------------------------------*/
@@ -4678,17 +4746,206 @@
          eip = dis_xadd_G_E ( cb, sorb, sz, eip );
          break;
 
+      /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
+
+      case 0x77: /* EMMS */
+         vg_assert(sz == 4);
+         uInstr1(cb, MMX1, 0, Lit16, ((UShort)(opc)) );
+         if (dis)
+            VG_(printf)("emms\n");
+         break;
+
+      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         modrm = getUChar(eip);  /* eip addresses the modrm byte here */
+         if (epartIsReg(modrm)) {
+            eip++; /* consume the modrm byte, as dis_MMXop_regmem_to_reg does */
+            uInstr1(cb, MMX2, 0, Lit16, 
+                        (((UShort)(opc)) << 8) | ((UShort)modrm) );
+            if (dis)
+               VG_(printf)("movq %s, %s\n", 
+                           nameMMXReg(eregOfRM(modrm)),
+                           nameMMXReg(gregOfRM(modrm)));
+         } else {
+            pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+            Int tmpa = LOW24(pair);  /* used below as the TempReg address operand */
+            eip += HI8(pair);        /* advance by what disAMode reported */
+            uInstr2(cb, MMX2_MemRd, 8, 
+                        Lit16, 
+                        (((UShort)(opc)) << 8) | ((UShort)modrm),
+                        TempReg, tmpa);
+            if (dis)
+               VG_(printf)("movq %s, %s\n", 
+                           dis_buf,
+                           nameMMXReg(gregOfRM(modrm)));
+         }
+         break;
+
+      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
+         vg_assert(sz == 4);
+         modrm = getUChar(eip);
+         if (epartIsReg(modrm)) {
+            goto unimp2;
+         } else {
+            pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+            Int tmpa = LOW24(pair);
+            eip += HI8(pair);
+            uInstr2(cb, MMX2_MemWr, 8, 
+                        Lit16, 
+                        (((UShort)(opc)) << 8) | ((UShort)modrm),
+                        TempReg, tmpa);
+            if (dis)
+               VG_(printf)("movq %s, %s\n", 
+                           nameMMXReg(gregOfRM(modrm)),
+                           dis_buf);
+         }
+         break;
+
+      case 0xFC: case 0xFD: case 0xFE: 
+         /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "padd", True );
+         break;
+
+      case 0xEC: case 0xED:
+         /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "padds", True );
+         break;
+
+      case 0xDC: case 0xDD:
+         /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "paddus", True );
+         break;
+
+      case 0xF8: case 0xF9: case 0xFA:
+         /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psub", True );
+         break;
+
+      case 0xE8: case 0xE9:
+         /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psubs", True );
+         break;
+
+      case 0xD8: case 0xD9:
+         /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psubus", True );
+         break;
+
+      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmulhw", False );
+         break;
+
+      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmullw", False );
+         break;
+
+      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmaddwd", False );
+         break;
+
+      case 0x74: case 0x75: case 0x76: 
+         /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pcmpeq", True );
+         break;
+
+      case 0x64: case 0x65: case 0x66: 
+         /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pcmpgt", True );
+         break;
+
+      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "packssdw", False );
+         break;
+
+      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "packsswb", False );
+         break;
+
+      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "packuswb", False );
+         break;
+
+      case 0x68: case 0x69: case 0x6A: 
+         /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "punpckh", True );
+         break;
+
+      case 0x60: case 0x61: case 0x62:
+         /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "punpckl", True );
+         break;
+
+      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pand", False );
+         break;
+
+      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pandn", False );
+         break;
+
+      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "por", False );
+         break;
+
+      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pxor", False );
+         break;
+
+      case 0xF1: case 0xF2: case 0xF3:
+         /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psll", True );
+         break;
+
+      case 0xD1: case 0xD2: case 0xD3:
+         /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psrl", True );
+         break;
+
+      case 0xE1: case 0xE2:
+         /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psra", True );
+         break;
+
       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
 
       default:
-         VG_(printf)("disInstr: unhandled 2-byte opcode 0x%x\n", 
-                     (UInt)opc);
-	 VG_(printf)("This _might_ be the result of executing an "
-                     "MMX, SSE, SSE2 or 3DNow!\n" );
+      unimp2:
+         VG_(printf)("disInstr: unhandled 2-byte opcode: "
+                     "0x%x 0x%x 0x%x\n",
+                     (Int)getUChar(eip-1), 
+                     (Int)getUChar(eip+0), 
+                     (Int)getUChar(eip+1) );
+
+	 VG_(printf)("This _might_ be the result of executing a "
+                     "SSE, SSE2 or 3DNow!\n" );
 	 VG_(printf)("instruction.  Valgrind does not currently "
                      "support such instructions.  Sorry.\n" );
 	 uInstr0(cb, CALLM_S, 0);
-	 uInstr1(cb, CALLM,   0, Lit16, VGOFF_(helper_undefined_instruction));
+	 uInstr1(cb, CALLM,   0, Lit16, 
+                     VGOFF_(helper_undefined_instruction));
 	 uInstr0(cb, CALLM_E, 0);
 
 	 /* just because everything else insists the last instruction
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index e1c4984..545bd32 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -404,6 +404,7 @@
 #  define LIT0 (u->lit32 == 0)
 #  define LIT1 (!(LIT0))
 #  define LITm (u->tag1 == Literal ? True : LIT0 )
+#  define SZ8  (u->size == 8)
 #  define SZ4  (u->size == 4)
 #  define SZ2  (u->size == 2)
 #  define SZ1  (u->size == 1)
@@ -543,6 +544,11 @@
                        (u->argc > 1                   ? TR2 : N2) && 
                        (u->argc > 2 || u->has_ret_val ? TR3 : N3) &&
                        u->regparms_n <= u->argc && XCCALL;
+   /* Fields checked:     lit32   size  flags_r/w tag1   tag2   tag3    (rest) */
+   case MMX1:
+   case MMX2:       return LIT0 && SZ0  && CC0 &&  Ls1 &&  N2 &&  N3 && XOTHER;
+   case MMX2_MemRd: return LIT0 && SZ8  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
+   case MMX2_MemWr: return LIT0 && SZ8  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
    default: 
       if (VG_(needs).extended_UCode)
          return SK_(sane_XUInstr)(beforeRA, beforeLiveness, u);
@@ -556,6 +562,7 @@
 #  undef LIT0
 #  undef LIT1
 #  undef LITm
+#  undef SZ8
 #  undef SZ4
 #  undef SZ2
 #  undef SZ1
@@ -842,6 +849,10 @@
       case FPU_R:   return "FPU_R";
       case FPU_W:   return "FPU_W";
       case FPU:     return "FPU"  ;
+      case MMX1:       return "MMX1" ;
+      case MMX2:       return "MMX2" ;
+      case MMX2_MemRd: return "MMX2_MRd" ;
+      case MMX2_MemWr: return "MMX2_MWr" ;
       default:
          if (VG_(needs).extended_UCode)
             return SK_(name_XUOpcode)(opc);
@@ -960,6 +971,23 @@
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
          break;
 
+      case MMX1:
+         VG_(printf)("\t0x%x",
+                     u->val1 & 0xFF );
+         break;
+
+      case MMX2:
+         VG_(printf)("\t0x%x:0x%x",
+                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
+         break;
+
+      case MMX2_MemWr:
+      case MMX2_MemRd:
+         VG_(printf)("\t0x%x:0x%x",
+                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
+         VG_(pp_UOperand)(u, 2, 4, True);
+         break;
+
       case GET: case PUT: case MOV: case LOAD: case STORE: case CMOV:
       case GETSEG: case PUTSEG:
          VG_(printf)("\t");
@@ -1111,6 +1139,7 @@
       case LEA1: RD(1); WR(2); break;
       case LEA2: RD(1); RD(2); WR(3); break;
 
+      case MMX1: case MMX2:
       case NOP:   case FPU:   case INCEIP: case CALLM_S: case CALLM_E:
       case CLEAR: case CALLM: case LOCK: break;
 
@@ -1121,6 +1150,7 @@
          if (u->has_ret_val) WR(3);
          break;
 
+      case MMX2_MemRd: case MMX2_MemWr:
       case FPU_R: case FPU_W: RD(2); break;
 
       case GETSEG: WR(2); break;
@@ -1257,6 +1287,8 @@
       case CC2VAL:
       case JIFZ:
       case FPU: case FPU_R: case FPU_W:
+      case MMX1: case MMX2:
+      case MMX2_MemRd: case MMX2_MemWr:
       case WIDEN:
       /* GETSEG and USESEG are to do with ArchRegS, not ArchReg */
       case GETSEG: case PUTSEG: 
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c
index 17d0b69..8859aff 100644
--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c
@@ -2055,6 +2055,13 @@
 	    break;
 	 }
 
+         case MMX1: case MMX2: case MMX3:
+         case MMX2_MemRd: case MMX2_MemWr:
+         case MMX2_RegRd: case MMX2_RegWr:
+            VG_(skin_panic)(
+               "I don't know how to instrument MMXish stuff (yet)");
+            break;
+
          default:
 	    /* conservative tromping */
 	    if (0 && u_in->tag1 == TempReg) /* can val1 ever be dest? */
diff --git a/include/vg_skin.h b/include/vg_skin.h
index 1ff3f80..71940d7 100644
--- a/include/vg_skin.h
+++ b/include/vg_skin.h
@@ -516,6 +516,41 @@
       FPU,           /* Doesn't touch memory */
       FPU_R, FPU_W,  /* Reads/writes memory  */
 
+      /* ------------ MMX ops ------------ */
+
+      /* 1 byte, no memrefs, no iregdefs, copy exactly to the
+	 output.  Held in val1[7:0]. */
+      MMX1,
+
+      /* 2 bytes, no memrefs, no iregdefs, copy exactly to the
+	 output.  Held in val1[15:0]. */
+      MMX2,
+
+      /* 3 bytes, no memrefs, no iregdefs, copy exactly to the
+         output.  Held in val1[15:0] and val2[7:0]. */
+      MMX3,
+
+      /* 2 bytes, reads/writes mem.  Insns of the form
+         bbbbbbbb:mod mmxreg r/m.
+         Held in val1[15:0], and mod and rm are to be replaced
+         at codegen time by a reference to the Temp/RealReg holding 
+         the address.  Arg2 holds this Temp/Real Reg.
+         Transfer is always at size 8.
+      */
+      MMX2_MemRd,
+      MMX2_MemWr,
+
+      /* 2 bytes, reads/writes an integer register.  Insns of the form
+         bbbbbbbb:11 mmxreg ireg.
+         Held in val1[15:0], and ireg is to be replaced
+         at codegen time by a reference to the relevant RealReg.
+         Transfer is always at size 4.  Arg2 holds this Temp/Real Reg.
+      */
+      MMX2_RegRd,
+      MMX2_RegWr,
+
+      /* ------------------------ */
+
       /* Not strictly needed, but improve address calculation translations. */
       LEA1,  /* reg2 := const + reg1 */
       LEA2,  /* reg3 := const + reg1 + reg2 * 1,2,4 or 8 */
@@ -931,14 +966,18 @@
 #define R_GS 5
 
 /* For pretty printing x86 code */
+extern Char* VG_(name_of_mmx_gran) ( UChar gran );
+extern Char* VG_(name_of_mmx_reg)  ( Int mmxreg );
 extern Char* VG_(name_of_seg_reg)  ( Int sreg );
 extern Char* VG_(name_of_int_reg)  ( Int size, Int reg );
 extern Char  VG_(name_of_int_size) ( Int size );
 
 /* Shorter macros for convenience */
-#define nameIReg  VG_(name_of_int_reg)
-#define nameISize VG_(name_of_int_size)
-#define nameSReg  VG_(name_of_seg_reg)
+#define nameIReg    VG_(name_of_int_reg)
+#define nameISize   VG_(name_of_int_size)
+#define nameSReg    VG_(name_of_seg_reg)
+#define nameMMXReg  VG_(name_of_mmx_reg)
+#define nameMMXGran VG_(name_of_mmx_gran)
 
 /* Randomly useful things */
 extern UInt  VG_(extend_s_8to32) ( UInt x );
diff --git a/lackey/lk_main.c b/lackey/lk_main.c
index 98c662d..5576a6d 100644
--- a/lackey/lk_main.c
+++ b/lackey/lk_main.c
@@ -181,7 +181,14 @@
                VG_(copy_UInstr)(cb, u);
             }
             break;
-            
+
+         case MMX1: case MMX2: case MMX3:
+         case MMX2_MemRd: case MMX2_MemWr:
+         case MMX2_RegRd: case MMX2_RegWr:
+            VG_(skin_panic)(
+               "I don't know how to instrument MMXish stuff (yet)");
+            break;
+         
          default:
             /* Count UInstr */
             VG_(call_helper_0_0)(cb, (Addr) & add_one_UInstr);
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 5247580..e3fb459 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1081,6 +1081,13 @@
             VG_(copy_UInstr)(cb, u_in);
             break;
 
+         case MMX1: case MMX2: case MMX3:
+         case MMX2_MemRd: case MMX2_MemWr:
+         case MMX2_RegRd: case MMX2_RegWr:
+            VG_(skin_panic)(
+               "I don't know how to instrument MMXish stuff (yet)");
+            break;
+
          default:
             VG_(pp_UInstr)(0, u_in);
             VG_(skin_panic)( "memcheck_instrument: unhandled case");