Patch from Tom Hughes, for bug 72643:

  Patch to improve SSE/SSE2 support

  This patch should implement most of the missing SSE/SSE2 opcodes. About
  the only ones it doesn't do are the MASKMOVxxx ones, as they are quite
  horrible and involve an implicit reference to EDI, so I need to think
  about them a bit more.

  The patch also includes a set of tests for the MMX/SSE/SSE2 opcodes to
  validate that they have the same effect under valgrind as they do when
  run normally. In one or two cases this wasn't actually the case even
  for some of the implemented opcodes, so I fixed those as well ;-)


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@2202 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/addrcheck/ac_main.c b/addrcheck/ac_main.c
index 044f917..bcd255f 100644
--- a/addrcheck/ac_main.c
+++ b/addrcheck/ac_main.c
@@ -1048,6 +1048,8 @@
 
          case SSE3a_MemRd:
          case SSE2a_MemRd:
+         case SSE3a1_MemRd:
+         case SSE2a1_MemRd:
             helper = (Addr)ac_fpu_READ_check;
 	    goto do_Access_ARG3;
          case SSE2a_MemWr:
@@ -1067,12 +1069,6 @@
             VG_(copy_UInstr)(cb, u_in);
             break;
 
-         case SSE2a1_MemRd:
-         case SSE3a1_MemRd:
-	    VG_(pp_UInstr)(0,u_in);
-	    VG_(skin_panic)("AddrCheck: unhandled SSE uinstr");
-	    break;
-
          case SSE3e1_RegRd:
          case SSE3e_RegWr:
          case SSE3g1_RegWr:
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index 6264abf..4726c90 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -544,7 +544,7 @@
 
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
-            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
             t_read = u_in->val3;
             is_FPU_R = True;
             break;
@@ -556,7 +556,7 @@
             break;
 
          case SSE3a1_MemRd:
-            sk_assert(u_in->size == 16);
+            sk_assert(u_in->size == 8 || u_in->size == 16);
             t_read = u_in->val3;
             is_FPU_R = True;
             break;
@@ -577,7 +577,7 @@
             break;
 
          case SSE2a_MemWr:
-            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
             t_write = u_in->val3;
             is_FPU_W = True;
             break;
@@ -798,7 +798,7 @@
 
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
-            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
             t_read = u_in->val3;
             t_read_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
@@ -821,7 +821,7 @@
             break;
 
          case SSE3a1_MemRd:
-            sk_assert(u_in->size == 16);
+            sk_assert(u_in->size == 8 || u_in->size == 16);
             t_read = u_in->val3;
             t_read_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
@@ -861,7 +861,7 @@
             break;
 
          case SSE2a_MemWr:
-            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
            /* fall through */
          case SSE3a_MemWr:
             sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 7be34b1..bb24e69 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1518,6 +1518,78 @@
                   nameIReg(4,ireg) );
 }
 
+static void emit_SSE2e1 ( FlagSet uses_sflags, 
+                          FlagSet sets_sflags,
+                          UChar first_byte, 
+                          UChar second_byte, 
+                         UChar third_byte,
+                          UChar fourth_byte,
+                          Int ireg )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   third_byte &= 0x38; /* mask out mod and rm fields */
+   third_byte |= 0xC0; /* set top two bits: mod = 11b */
+   third_byte |= (ireg & 7); /* patch in our ireg */
+   VG_(emitB) ( third_byte );
+   VG_(emitB) ( fourth_byte );
+   if (dis)
+      VG_(printf)(
+         "\n\t\tsse2e1--0x%x:0x%x:0x%x:0x%x-(%s)\n", 
+         (UInt)first_byte, (UInt)second_byte, 
+         (UInt)third_byte, (UInt)fourth_byte,
+         nameIReg(4,ireg) 
+      );
+}
+
+static void emit_SSE2g1 ( FlagSet uses_sflags, 
+                          FlagSet sets_sflags,
+                          UChar first_byte, 
+                          UChar second_byte, 
+                          UChar third_byte,
+                          UChar fourth_byte,
+                          Int ireg )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   third_byte &= 0xC7; /* mask out reg field */
+   third_byte |= 0xC0; /* set top two bits: mod = 11b */
+   third_byte |= ((ireg & 7) << 3); /* patch in our ireg */
+   VG_(emitB) ( third_byte );
+   VG_(emitB) ( fourth_byte );
+   if (dis)
+      VG_(printf)(
+         "\n\t\tsse2g1_reg_wr--0x%x:0x%x:0x%x:0x%x-(%s)\n", 
+         (UInt)first_byte, (UInt)second_byte, 
+         (UInt)third_byte, (UInt)fourth_byte,
+         nameIReg(4,ireg) 
+      );
+}
+
+static void emit_SSE2g ( FlagSet uses_sflags, 
+                         FlagSet sets_sflags,
+                         UChar first_byte, 
+                         UChar second_byte, 
+                        UChar third_byte,
+                         Int ireg )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   third_byte &= 0xC7; /* mask out reg field */
+   third_byte |= 0xC0; /* set top two bits: mod = 11b */
+   third_byte |= ((ireg & 7) << 3); /* patch in our ireg */
+   VG_(emitB) ( third_byte );
+   if (dis)
+      VG_(printf)(
+         "\n\t\tsse2g--0x%x:0x%x:0x%x-(%s)\n", 
+         (UInt)first_byte, (UInt)second_byte, (UInt)third_byte,
+         nameIReg(4,ireg) 
+      );
+}
+
 static void emit_SSE2a1 ( FlagSet uses_sflags, 
                           FlagSet sets_sflags,
                           UChar first_byte, 
@@ -4075,7 +4147,8 @@
 
       case SSE2a_MemWr:
       case SSE2a_MemRd:
-         vg_assert(u->size == 4 || u->size == 16 || u->size == 512);
+         vg_assert(u->size == 4 || u->size == 8
+                   || u->size == 16 || u->size == 512);
          vg_assert(u->tag1 == Lit16);
          vg_assert(u->tag2 == Lit16);
          vg_assert(u->tag3 == RealReg);
@@ -4090,6 +4163,59 @@
                       u->val3 );
          break;
 
+      case SSE2g_RegWr:
+         vg_assert(u->size == 4);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE2g ( u->flags_r, u->flags_w,
+                      (u->val1 >> 8) & 0xFF,
+                      u->val1 & 0xFF,
+                      u->val2 & 0xFF,
+                      u->val3 );
+         break;
+
+      case SSE2g1_RegWr:
+         vg_assert(u->size == 4);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE2g1 ( u->flags_r, u->flags_w,
+                       (u->val1 >> 8) & 0xFF,
+                       u->val1 & 0xFF,
+                       u->val2 & 0xFF,
+                       u->lit32 & 0xFF,
+                       u->val3 );
+         break;
+
+      case SSE2e1_RegRd:
+         vg_assert(u->size == 2);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE2e1 ( u->flags_r, u->flags_w,
+                       (u->val1 >> 8) & 0xFF,
+                       u->val1 & 0xFF,
+                       u->val2 & 0xFF,
+                       u->lit32 & 0xFF,
+                       u->val3 );
+         break;
+
       case SSE2a1_MemRd:
          vg_assert(u->size == 4 || u->size == 16);
          vg_assert(u->tag1 == Lit16);
@@ -4194,7 +4320,7 @@
          break;
 
       case SSE3a1_MemRd:
-         vg_assert(u->size == 16);
+         vg_assert(u->size == 8 || u->size == 16);
          vg_assert(u->tag1 == Lit16);
          vg_assert(u->tag2 == Lit16);
          vg_assert(u->tag3 == RealReg);
@@ -4208,7 +4334,7 @@
                       u->val1 & 0xFF,
                       (u->val2 >> 8) & 0xFF,
                       u->val2 & 0xFF,
-                      (u->lit32 >> 8) & 0xFF,
+                      u->lit32 & 0xFF,
                       u->val3 );
          break;
 
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index 0d82f8f..bb474b9 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -3397,6 +3397,193 @@
    return eip;
 }
 
+
+/* Simple SSE operations, either 
+       op   (src)xmmreg, (dst)mmxreg
+   or
+       op   (src)address, (dst)mmxreg
+   2 opcode bytes.
+   Supplied eip points to the first address mode byte.
+*/
+static
+Addr dis_SSE2_to_MMX ( UCodeBlock *cb,
+                       UChar sorb,
+                       Addr eip,
+                       Int sz, 
+                       Char* name, 
+                       UChar opc1, 
+                       UChar opc2 )
+{
+   UChar dis_buf[50];
+   UChar modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      /* Completely internal SSE insn. */
+      uInstr2(cb, SSE3, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)opc1) << 8) | (UShort)opc2,
+                  Lit16, (UShort)modrm );
+      if (dis)
+         VG_(printf)("%s %s, %s\n", name, 
+                     nameXMMReg(eregOfRM(modrm)), 
+                     nameMMXReg(gregOfRM(modrm)) );
+      eip++;
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);
+      eip += HI8(pair);
+      uInstr3(cb, SSE2a_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, ((UShort)modrm),
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s %s, %s\n", 
+                     name,
+                     dis_buf,
+                     nameMMXReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
+/* Simple SSE operations, either 
+       op   (src)mmxreg, (dst)xmmreg
+   or
+       op   (src)address, (dst)xmmreg
+   2 opcode bytes.
+   Supplied eip points to the first address mode byte.
+*/
+static
+Addr dis_SSE2_from_MMX ( UCodeBlock *cb,
+                         UChar sorb,
+                         Addr eip,
+                         Int sz, 
+                         Char* name, 
+                         UChar opc1, 
+                         UChar opc2 )
+{
+   UChar dis_buf[50];
+   UChar modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      /* Completely internal SSE insn. */
+      uInstr2(cb, SSE3, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)opc1) << 8) | (UShort)opc2,
+                  Lit16, (UShort)modrm );
+      if (dis)
+         VG_(printf)("%s %s, %s\n", name, 
+                     nameMMXReg(eregOfRM(modrm)), 
+                     nameXMMReg(gregOfRM(modrm)) );
+      eip++;
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);
+      eip += HI8(pair);
+      uInstr3(cb, SSE2a_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, ((UShort)modrm),
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s %s, %s\n", 
+                     name,
+                     dis_buf,
+                     nameXMMReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
+/* Simple SSE operations, either 
+       op   (src)xmmreg, (dst)mmxreg
+   or
+       op   (src)address, (dst)mmxreg
+   3 opcode bytes.
+   Supplied eip points to the first address mode byte.
+*/
+static
+Addr dis_SSE3_to_MMX ( UCodeBlock *cb,
+                       UChar sorb,
+                       Addr eip,
+                       Int sz,
+                       Char* name, 
+                       UChar opc1, 
+                       UChar opc2, 
+                       UChar opc3 )
+{
+   UChar dis_buf[50];
+   UChar modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      /* Completely internal SSE insn. */
+      uInstr2(cb, SSE4, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)opc1) << 8) | (UShort)opc2,
+                  Lit16, (((UShort)opc3) << 8) | (UShort)modrm );
+      if (dis)
+         VG_(printf)("%s %s, %s\n", name, 
+                     nameXMMReg(eregOfRM(modrm)), 
+                     nameMMXReg(gregOfRM(modrm)) );
+      eip++;
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);
+      eip += HI8(pair);
+      uInstr3(cb, SSE3a_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, (((UShort)(opc3)) << 8) | ((UShort)modrm),
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s %s, %s\n", 
+                     name,
+                     dis_buf,
+                     nameMMXReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
+/* Simple SSE operations, either 
+       op   (src)mmxreg, (dst)xmmreg
+   or
+       op   (src)address, (dst)xmmreg
+   3 opcode bytes.
+   Supplied eip points to the first address mode byte.
+*/
+static
+Addr dis_SSE3_from_MMX ( UCodeBlock *cb,
+                         UChar sorb,
+                         Addr eip,
+                         Int sz,
+                         Char* name, 
+                         UChar opc1, 
+                         UChar opc2, 
+                         UChar opc3 )
+{
+   UChar dis_buf[50];
+   UChar modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      /* Completely internal SSE insn. */
+      uInstr2(cb, SSE4, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)opc1) << 8) | (UShort)opc2,
+                  Lit16, (((UShort)opc3) << 8) | (UShort)modrm );
+      if (dis)
+         VG_(printf)("%s %s, %s\n", name, 
+                     nameMMXReg(eregOfRM(modrm)), 
+                     nameXMMReg(gregOfRM(modrm)) );
+      eip++;
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);
+      eip += HI8(pair);
+      uInstr3(cb, SSE3a_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, (((UShort)(opc3)) << 8) | ((UShort)modrm),
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s %s, %s\n", 
+                     name,
+                     dis_buf,
+                     nameXMMReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
 static 
 void dis_push_segreg ( UCodeBlock* cb, UInt sreg, Int sz )
 {
@@ -3596,6 +3783,57 @@
       goto decode_success;
    }
 
+   /* CLFLUSH -- flush cache line */
+   if (insn[0] == 0x0F && insn[1] == 0xAE
+       && (!epartIsReg(insn[2]))
+       && (gregOfRM(insn[2]) == 7))
+   {
+      vg_assert(sz == 4);
+      pair = disAMode ( cb, sorb, eip+2, dis?dis_buf:NULL );
+      t1   = LOW24(pair);
+      eip += 2+HI8(pair);
+      uInstr3(cb, SSE2a_MemRd, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)0x0F) << 8) | (UShort)0xAE,
+                  Lit16, (UShort)insn[2],
+                  TempReg, t1 );
+      if (dis)
+         VG_(printf)("clflush %s\n", dis_buf);
+      goto decode_success;
+   }
+
+   /* CVTPI2PS (0x0F,0x2A) -- mm/m64, xmm */
+   /* CVTPI2PD (0x66,0x0F,0x2A) -- mm/m64, xmm */
+   if (insn[0] == 0x0F && insn[1] == 0x2A) {
+      if (sz == 4) {
+         eip = dis_SSE2_from_MMX
+                  ( cb, sorb, eip+2, 8, "cvtpi2ps",
+                        insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_from_MMX
+                  ( cb, sorb, eip+2, 8, "cvtpi2pd",
+                        0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
+   /* CVTTPS2PI (0x0F,0x2C) -- xmm/m64, mm */
+   /* CVTPS2PI (0x0F,0x2D) -- xmm/m64, mm */
+   /* CVTTPD2PI (0x66,0x0F,0x2C) -- xmm/m128, mm */
+   /* CVTPD2PI (0x66,0x0F,0x2D) -- xmm/m128, mm */
+   if (insn[0] == 0x0F
+       && (insn[1] == 0x2C || insn[1] == 0x2D)) {
+      if (sz == 4) {
+         eip = dis_SSE2_to_MMX
+                  ( cb, sorb, eip+2, 8, "cvt{t}ps2pi",
+                        insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_to_MMX
+                  ( cb, sorb, eip+2, 16, "cvt{t}pd2pi",
+                        0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
    /* CVTTSD2SI (0xF2,0x0F,0x2C) -- convert a double-precision float
       value in memory or xmm reg to int and put it in an ireg.
       Truncate. */
@@ -3695,6 +3933,20 @@
       goto decode_success;
    }
 
+   /* CVTPS2PD -- convert two packed floats to two packed doubles. */
+   /* 0x66: CVTPD2PS -- convert two packed doubles to two packed floats. */
+   if (insn[0] == 0x0F && insn[1] == 0x5A) {
+      vg_assert(sz == 2 || sz == 4);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 8, "cvtps2pd",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "cvtpd2ps",
+                                     0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
    /* CVTSS2SD -- convert one single float to double. */
    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
       vg_assert(sz == 4);
@@ -3711,6 +3963,60 @@
       goto decode_success;
    }
 
+   /* CVTDQ2PS -- convert four ints to four packed floats. */
+   /* 0x66: CVTPS2DQ -- convert four packed floats to four ints. */
+   if (insn[0] == 0x0F && insn[1] == 0x5B) {
+      vg_assert(sz == 2 || sz == 4);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "cvtdq2ps",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "cvtps2dq",
+                                         0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
+   /* CVTPD2DQ -- convert two packed doubles to two ints. */
+   if (sz == 2
+       && insn[0] == 0x0F && insn[1] == 0xE6) {
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 8, "cvtpd2dq",
+                                      0x66, insn[0], insn[1] );
+      goto decode_success;
+   }
+
+   /* CVTTPD2DQ -- convert two packed doubles to two ints with truncation. */
+   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "cvttpd2dq",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* CVTDQ2PD -- convert two ints to two packed doubles. */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "cvtdq2pd",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* CVTTPS2DQ -- convert four packed floats to four ints with truncation. */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 16, "cvttps2dq",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* CMPSS -- compare scalar floats. */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+3, 8, "cmpss",
+                                       insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
    /* CMPSD -- compare scalar doubles. */
    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
       vg_assert(sz == 4);
@@ -3742,6 +4048,22 @@
       goto decode_success;
    }
 
+   /* PSHUFLW */
+   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
+      eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+3, 16, 
+                                           "pshuflw",
+                                           insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* PSHUFHW */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
+      eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+3, 16, 
+                                           "pshufhw",
+                                           insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
    /* PSHUFW */
    if (sz == 4
        && insn[0] == 0x0F && insn[1] == 0x70) {
@@ -3882,6 +4204,20 @@
       goto decode_success;
    }
 
+   /* MINPS */
+   /* 0x66: MINPD */
+   if (insn[0] == 0x0F && insn[1] == 0x5D) {
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "minps",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "minpd",
+                                         0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
    /* 0xF3: MAXSD */
    /* 0xF3: MAXSS */
    if ((insn[0] == 0xF2 || insn[0] == 0xF3) 
@@ -3944,11 +4280,17 @@
       goto decode_success;
    }
 
-   /* ORPD (src)xmmreg-or-mem, (dst)xmmreg */
-   if (sz == 2
-       && insn[0] == 0x0F && insn[1] == 0x56) {
+   /* ORPS */
+   /* 0x66: ORPD (src)xmmreg-or-mem, (dst)xmmreg */
+   if (insn[0] == 0x0F && insn[1] == 0x56) {
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "orps",
+                                         insn[0], insn[1] );
+      } else {
       eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "orpd",
                                       0x66, insn[0], insn[1] );
+      }
       goto decode_success;
    }
 
@@ -4013,23 +4355,25 @@
                                       0x66, insn[0], insn[1] );
       goto decode_success;
    }
-   /* 0xE0: PAVGB(src)xmmreg-or-mem, (dst)xmmreg, size 4 */
-   if (sz == 4
-       && insn[0] == 0x0F 
-       && insn[1] == 0xE0 ) {
-      eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "pavg{b,w}",
-                                      insn[0], insn[1] );
+ 
+   /* 0xF6: PSADBW(src)xmmreg-or-mem, (dst)xmmreg */
+   if (sz == 2
+       && insn[0] == 0x0F && insn[1] == 0xF6) {
+     eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "psadbw",
+                                      0x66, insn[0], insn[1] );
       goto decode_success;
    }
  
    /* 0x60: PUNPCKLBW (src)xmmreg-or-mem, (dst)xmmreg */
    /* 0x61: PUNPCKLWD (src)xmmreg-or-mem, (dst)xmmreg */
    /* 0x62: PUNPCKLDQ (src)xmmreg-or-mem, (dst)xmmreg */
+   /* 0x6C: PUNPCKLQDQ (src)xmmreg-or-mem, (dst)xmmreg */
    if (sz == 2
        && insn[0] == 0x0F 
-       && (insn[1] == 0x60 || insn[1] == 0x61 || insn[1] == 0x62)) {
+       && (insn[1] == 0x60 || insn[1] == 0x61
+           || insn[1] == 0x62 || insn[1] == 0x6C)) {
       eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
-                                      "punpckl{bw,wd,dq}",
+                                      "punpckl{bw,wd,dq,qdq}",
                                       0x66, insn[0], insn[1] );
       goto decode_success;
    }
@@ -4037,11 +4381,13 @@
    /* 0x68: PUNPCKHBW (src)xmmreg-or-mem, (dst)xmmreg */
    /* 0x69: PUNPCKHWD (src)xmmreg-or-mem, (dst)xmmreg */
    /* 0x6A: PUNPCKHDQ (src)xmmreg-or-mem, (dst)xmmreg */
+   /* 0x6D: PUNPCKHQDQ (src)xmmreg-or-mem, (dst)xmmreg */
    if (sz == 2
        && insn[0] == 0x0F 
-       && (insn[1] == 0x68 || insn[1] == 0x69 || insn[1] == 0x6A)) {
+       && (insn[1] == 0x68 || insn[1] == 0x69
+           || insn[1] == 0x6A || insn[1] == 0x6D)) {
       eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
-                                      "punpckh{bw,wd,dq}",
+                                      "punpckh{bw,wd,dq,qdq}",
                                       0x66, insn[0], insn[1] );
       goto decode_success;
    }
@@ -4142,12 +4488,21 @@
       goto decode_success;
    }
 
+   /* 0xE4: PMULHUW(src)xmmreg-or-mem, (dst)xmmreg */
    /* 0xE5: PMULHW(src)xmmreg-or-mem, (dst)xmmreg */
    /* 0xD5: PMULLW(src)xmmreg-or-mem, (dst)xmmreg */
    if (sz == 2
        && insn[0] == 0x0F 
-       && (insn[1] == 0xE5 || insn[1] == 0xD5)) {
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "pmul{h,l}w",
+       && (insn[1] == 0xE4 || insn[1] == 0xE5 || insn[1] == 0xD5)) {
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "pmul{hu,h,l}w",
+                                      0x66, insn[0], insn[1] );
+      goto decode_success;
+   }
+
+   /* 0xF4: PMULUDQ(src)xmmreg-or-mem, (dst)xmmreg */
+   if (sz == 2
+       && insn[0] == 0x0F && insn[1] == 0xF4) {
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "pmuludq",
                                       0x66, insn[0], insn[1] );
       goto decode_success;
    }
@@ -4291,6 +4646,26 @@
       goto decode_success;
    }
 
+   /* MOVDQ2Q -- move low 8 bytes of XMM reg to MMX reg. */
+   if (insn[0] == 0xF2
+       && insn[1] == 0x0F
+       && insn[2] == 0xD6) {
+      eip = dis_SSE3_to_MMX
+               ( cb, sorb, eip+3, 8, "movdq2q",
+                     insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* MOVQ2DQ -- move MMX reg to low 8 bytes of XMM reg. */
+   if (insn[0] == 0xF3
+       && insn[1] == 0x0F
+       && insn[2] == 0xD6) {
+      eip = dis_SSE3_from_MMX
+               ( cb, sorb, eip+3, 8, "movq2dq",
+                     insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
    /* MOVSS -- move 4 bytes of XMM reg to/from XMM reg or mem. */
    if (insn[0] == 0xF3
        && insn[1] == 0x0F 
@@ -4356,19 +4731,6 @@
       goto decode_success;
    }
 
-   /* MOVLPD -- 8-byte load/store. */
-   if (sz == 2 
-       && insn[0] == 0x0F 
-       && (insn[1] == 0x12 || insn[1] == 0x13)) {
-      Bool is_store = insn[1]==0x13;
-      /* Cannot be used for reg-reg moves, according to Intel docs. */
-      vg_assert(!epartIsReg(insn[2]));
-      eip = dis_SSE3_load_store_or_mov
-               (cb, sorb, eip+2, 8, is_store, "movlpd", 
-                    0x66, insn[0], insn[1] );
-      goto decode_success;
-   }
-
    /* MOVDQU -- unaligned 16-byte load/store. */
    if (insn[0] == 0xF3
        && insn[1] == 0x0F 
@@ -4522,20 +4884,6 @@
       goto decode_success;
    }
 
-   /* MOVLPS -- 8-byte load/store.  How is this different from MOVLPS
-      ? */
-   if (insn[0] == 0x0F 
-       && (insn[1] == 0x12 || insn[1] == 0x13)) {
-      Bool is_store = insn[1]==0x13;
-      vg_assert(sz == 4);
-      /* Cannot be used for reg-reg moves, according to Intel docs. */
-      //      vg_assert(!epartIsReg(insn[2]));
-      eip = dis_SSE2_load_store_or_mov
-               (cb, sorb, eip+2, 8, is_store, "movlps", 
-                    insn[0], insn[1] );
-      goto decode_success;
-   }
-
    /* 0xF3: RCPSS -- reciprocal of scalar float */
    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
       vg_assert(sz == 4);
@@ -4547,19 +4895,31 @@
 
    /* MOVMSKPD -- extract 2 sign bits from a xmm reg and copy them to 
       an ireg.  Top 30 bits of ireg are set to zero. */
-   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x50) {
+   /* MOVMSKPS -- extract 4 sign bits from a xmm reg and copy them to 
+      an ireg.  Top 28 bits of ireg are set to zero. */
+   if (insn[0] == 0x0F && insn[1] == 0x50) {
+      vg_assert(sz == 4 || sz == 2);
       modrm = insn[2];
       /* Intel docs don't say anything about a memory source being
 	 allowed here. */
       vg_assert(epartIsReg(modrm));
       t1 = newTemp(cb);
-      uInstr3(cb, SSE3g_RegWr, 4,
-                  Lit16, (((UShort)0x66) << 8) | (UShort)insn[0],
-                  Lit16, (((UShort)insn[1]) << 8) | (UShort)modrm,
-                  TempReg, t1 );
-      uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+      if (sz == 4) {
+         uInstr3(cb, SSE2g_RegWr, 4,
+                     Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
+                     Lit16, (UShort)modrm,
+                     TempReg, t1 );
+         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+      } else {
+         uInstr3(cb, SSE3g_RegWr, 4,
+                     Lit16, (((UShort)0x66) << 8) | (UShort)insn[0],
+                     Lit16, (((UShort)insn[1]) << 8) | (UShort)modrm,
+                     TempReg, t1 );
+         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+      }
       if (dis)
-         VG_(printf)("movmskpd %s, %s\n", 
+         VG_(printf)("movmskp%c %s, %s\n",
+                      sz == 4 ? 's' : 'd',
                       nameXMMReg(eregOfRM(modrm)),
                       nameIReg(4,gregOfRM(modrm)));
       eip += 3;
@@ -4580,16 +4940,55 @@
       goto decode_success;
    }
 
-   /* MOVHPD -- 8-byte load/store. */
-   if (sz == 2 
-       && insn[0] == 0x0F 
+   /* MOVHLPS -- move two packed floats from high quadword to low quadword */
+   /* MOVLPS -- load/store two packed floats to/from low quadword. */
+   /* MOVLPD -- load/store packed double to/from low quadword. */
+   if (insn[0] == 0x0F 
+       && (insn[1] == 0x12 || insn[1] == 0x13)) {
+      Bool is_store = insn[1]==0x13;
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         if (epartIsReg(insn[2])) {
+            vg_assert(insn[1]==0x12);
+            eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "movhlps",
+                                            insn[0], insn[1] );
+         } else {
+            eip = dis_SSE2_load_store_or_mov
+                     (cb, sorb, eip+2, 8, is_store, "movlps", 
+                          insn[0], insn[1] );
+         }
+      } else {
+         vg_assert(!epartIsReg(insn[2]));
+         eip = dis_SSE3_load_store_or_mov
+                  (cb, sorb, eip+2, 8, is_store, "movlpd", 
+                       0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
+   /* MOVLHPS -- move two packed floats from low quadword to high quadword */
+   /* MOVHPS -- load/store two packed floats to/from high quadword. */
+   /* MOVHPD -- load/store packed double to/from high quadword. */
+   if (insn[0] == 0x0F 
        && (insn[1] == 0x16 || insn[1] == 0x17)) {
       Bool is_store = insn[1]==0x17;
-      /* Cannot be used for reg-reg moves, according to Intel docs. */
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         if (epartIsReg(insn[2])) {
+            vg_assert(insn[1]==0x16);
+            eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "movlhps",
+                                            insn[0], insn[1] );
+         } else {
+            eip = dis_SSE2_load_store_or_mov
+                     (cb, sorb, eip+2, 8, is_store, "movhps", 
+                          insn[0], insn[1] );
+         }
+      } else {
       vg_assert(!epartIsReg(insn[2]));
       eip = dis_SSE3_load_store_or_mov
                (cb, sorb, eip+2, 8, is_store, "movhpd", 
                     0x66, insn[0], insn[1] );
+      }
       goto decode_success;
    }
 
@@ -4614,28 +5013,28 @@
       goto decode_success;
    }
 
-   /* CVTDQ2PD -- convert one single double. to float. */
-   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
-      vg_assert(sz == 4);
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "cvtdq2pd",
-                                      insn[0], insn[1], insn[2] );
-      goto decode_success;
-   }
-
-   /* CVTPD2PS -- convert two doubles to two floats. */
-   if (sz == 2 &&
-       insn[0] == 0x0F && insn[1] == 0x5A) {
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "cvtpd2ps",
+   /* sz==4: SQRTPS: square root of packed float. */
+   /* sz==2: SQRTPD: square root of packed double. */
+   if (insn[0] == 0x0F && insn[1] == 0x51) {
+      vg_assert(sz == 2 || sz == 4);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, 
+                                         "sqrtps",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
+                                         "sqrtpd",
                                  0x66, insn[0], insn[1] );
+      }
       goto decode_success;
    }
 
-   /* SQRTPD: square root of packed double. */
-   if (sz == 2
-       && insn[0] == 0x0F && insn[1] == 0x51) {
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
-                                      "sqrtpd",
-                                      0x66, insn[0], insn[1] );
+   /* RSQRTPS: reciprocal of square root of packed float. */
+   if (insn[0] == 0x0F && insn[1] == 0x52) {
+      vg_assert(sz == 4);
+      eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, 
+                                      "rsqrtps",
+                                      insn[0], insn[1] );
       goto decode_success;
    }
 
@@ -6072,6 +6471,24 @@
          eip = dis_movx_E_G ( cb, sorb, eip, 2, 4, True );
          break;
 
+      /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
+
+      case 0xC3: /* MOVNTI Gv,Ev */
+         vg_assert(sz == 4);
+         modrm = getUChar(eip);
+         vg_assert(!epartIsReg(modrm));
+         t1 = newTemp(cb);
+         uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
+         pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+         t2 = LOW24(pair);
+         eip += HI8(pair);
+         uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
+         if (dis)
+           VG_(printf)("movnti %s,%s\n",
+                       nameIReg(4,gregOfRM(modrm)),
+                       dis_buf);
+         break;
+
       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
 
       case 0xAF: /* IMUL Ev, Gv */
@@ -6428,6 +6845,12 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "padd", True );
          break;
 
+      case 0xD4: 
+         /* PADDQ (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "paddq", False );
+         break;
+
       case 0xEC: case 0xED:
          /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
          vg_assert(sz == 4);
@@ -6440,7 +6863,7 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "paddus", True );
          break;
 
-      case 0xF8: case 0xF9: case 0xFA:
+      case 0xF8: case 0xF9: case 0xFA: case 0xFB:
          /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
          vg_assert(sz == 4);
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psub", True );
@@ -6458,6 +6881,11 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psubus", True );
          break;
 
+      case 0xE4: /* PMULHUW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmulhuw", False );
+         break;
+
       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
          vg_assert(sz == 4);
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmulhw", False );
@@ -6468,6 +6896,11 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmullw", False );
          break;
 
+      case 0xF4: /* PMULUDQ (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmuludq", False );
+         break;
+
       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
          vg_assert(sz == 4);
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmaddwd", False );
@@ -6550,6 +6983,105 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psra", True );
          break;
 
+      case 0xDA:
+         /* PMINUB (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pminub", False );
+         break;
+
+      case 0xDE:
+         /* PMAXUB (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmaxub", False );
+         break;
+
+      case 0xEA:
+         /* PMINSW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pminsw", False );
+         break;
+
+      case 0xEE:
+         /* PMAXSW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmaxsw", False );
+         break;
+
+      case 0xE0:
+         /* PAVGB (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pavgb", False );
+         break;
+
+      case 0xE3:
+         /* PAVGW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pavgw", False );
+         break;
+
+      case 0xF6:
+         /* PSADBW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psadbw", False );
+         break;
+
+      case 0xD7:
+         /* PMOVMSKB (src)mmxreg, (dst)ireg */
+         vg_assert(sz == 4);
+         modrm = getUChar(eip);
+         vg_assert(epartIsReg(modrm));
+         t1 = newTemp(cb);
+         uInstr3(cb, SSE2g_RegWr, 4,
+                     Lit16, (((UShort)(0x0F)) << 8) | (UShort)(opc),
+                     Lit16, (UShort)modrm,
+                     TempReg, t1 );
+         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+         if (dis)
+            VG_(printf)("pmovmskb %s, %s\n", 
+                        nameMMXReg(eregOfRM(modrm)),
+                        nameIReg(4,gregOfRM(modrm)));
+         eip++;         
+         break;
+
+      case 0xC5:
+         /* PEXTRW (src)mmxreg, (dst)ireg */
+         vg_assert(sz == 4);
+         t1 = newTemp(cb);
+         modrm = getUChar(eip); eip++;
+         abyte = getUChar(eip); eip++;
+         vg_assert(epartIsReg(modrm));
+         uInstr3(cb, SSE2g1_RegWr, 4,
+                     Lit16, (((UShort)(0x0F)) << 8) | (UShort)(opc),
+                     Lit16, (UShort)modrm,
+                     TempReg, t1 );
+         uLiteral(cb, abyte);
+         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+         if (dis)
+            VG_(printf)("pextrw %s, %d, %s\n",
+                        nameMMXReg(eregOfRM(modrm)), (Int)abyte, 
+                        nameIReg(4, gregOfRM(modrm)));
+         break;
+
+      case 0xC4:
+         /* PINSRW (src)ireg, (dst)mmxreg */
+         vg_assert(sz == 4);
+         t1 = newTemp(cb);
+         modrm = getUChar(eip); eip++;
+         abyte = getUChar(eip); eip++;
+         vg_assert(epartIsReg(modrm));
+         uInstr2(cb, GET, 2, ArchReg, eregOfRM(modrm), TempReg, t1);
+         uInstr3(cb, SSE2e1_RegRd, 2,
+                     Lit16, (((UShort)(0x0F)) << 8) | (UShort)(opc),
+                     Lit16, (UShort)modrm,
+                     TempReg, t1 );
+         uLiteral(cb, abyte);
+         if (dis)
+            VG_(printf)("pinsrw %s, %d, %s\n",
+                        nameIReg(2, eregOfRM(modrm)),
+                        (Int)abyte, 
+                        nameMMXReg(gregOfRM(modrm)));
+         break;
+
       case 0xA1: /* POP %FS */
          dis_pop_segreg( cb, R_FS, sz ); break;
       case 0xA9: /* POP %GS */
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 0a231c1..b0f9d70 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -414,7 +414,8 @@
 #  define SZ42 (u->size == 4 || u->size == 2)
 #  define SZ48 (u->size == 4 || u->size == 8)
 #  define SZ416 (u->size == 4 || u->size == 16)
-#  define SZsse2 (u->size == 4 || u->size == 16 || u->size == 512)
+#  define SZ816 (u->size == 8 || u->size == 16)
+#  define SZsse2 (u->size == 4 || u->size == 8 || u->size == 16 || u->size == 512)
 #  define SZsse3 (u->size == 4 || u->size == 8 || u->size == 16)
 #  define SZi  (u->size == 4 || u->size == 2 || u->size == 1)
 #  define SZf  (  u->size ==  4 || u->size ==  8 || u->size ==   2     \
@@ -567,11 +568,14 @@
    case SSE2a_MemWr:  return LIT0 && SZsse2 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2a_MemRd:  return LIT0 && SZsse2 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2a1_MemRd: return LIT0 && SZ416  && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE2g_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE2g1_RegWr: return LIT8 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE2e1_RegRd: return LIT8 && SZ2    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3a_MemWr:  return LIT0 && SZsse3 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3a_MemRd:  return LIT0 && SZsse3 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e_RegRd:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3a1_MemRd: return LIT8 && SZ16   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3a1_MemRd: return LIT8 && SZ816  && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3g_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3g1_RegWr: return LIT8 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e1_RegRd: return LIT8 && SZ2    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
@@ -897,7 +901,10 @@
       case MMX2_ERegWr: return "MMX2_eRWr" ;
       case SSE2a_MemWr: return "SSE2a_MWr";
       case SSE2a_MemRd: return "SSE2a_MRd";
+      case SSE2g_RegWr: return "SSE2g_RWr";
       case SSE2a1_MemRd: return "SSE2a1_MRd";
+      case SSE2g1_RegWr: return "SSE2g1_RWr";
+      case SSE2e1_RegRd: return "SSE2e1_RRd";
       case SSE3e_RegRd: return "SSE3e_RRd";
       case SSE3e_RegWr: return "SSE3e_RWr";
       case SSE3g_RegWr: return "SSE3g_RWr";
@@ -1062,6 +1069,9 @@
 
       case SSE2a_MemWr:
       case SSE2a_MemRd:
+      case SSE2g_RegWr:
+      case SSE2g1_RegWr:
+      case SSE2e1_RegRd:
          VG_(printf)("0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
          VG_(pp_UOperand)(u, 3, 4, True);
@@ -1270,6 +1280,7 @@
 
       case SSE3a1_MemRd:
       case SSE2a1_MemRd:
+      case SSE2e1_RegRd:
       case SSE3e_RegRd:
       case SSE3a_MemWr:
       case SSE3a_MemRd:
@@ -1277,6 +1288,8 @@
       case SSE3e1_RegRd:
       case SSE2a_MemRd: RD(3); break;
 
+      case SSE2g_RegWr:
+      case SSE2g1_RegWr:
       case SSE3e_RegWr:
       case SSE3g1_RegWr:
       case SSE3g_RegWr: WR(3); break;
@@ -1441,6 +1454,7 @@
       case MMX2_MemRd: case MMX2_MemWr:
       case MMX2_ERegRd: case MMX2_ERegWr:
       case SSE2a_MemWr: case SSE2a_MemRd: case SSE2a1_MemRd:
+      case SSE2g_RegWr: case SSE2g1_RegWr: case SSE2e1_RegRd:
       case SSE3a_MemWr: case SSE3a_MemRd: case SSE3a1_MemRd:
       case SSE3e_RegRd: case SSE3g_RegWr: case SSE3e_RegWr:
       case SSE3g1_RegWr: case SSE3e1_RegRd:
diff --git a/include/vg_skin.h.base b/include/vg_skin.h.base
index 52fc2c8..324163b 100644
--- a/include/vg_skin.h.base
+++ b/include/vg_skin.h.base
@@ -710,6 +710,32 @@
       SSE2a_MemRd,
       SSE2a_MemWr,
 
+      /* 3 bytes, writes an integer register.  Insns of the form
+         bbbbbbbb:bbbbbbbb:11 ireg bbb.
+         Held in val1[15:0] and val2[7:0], and ireg is to be replaced
+         at codegen time by a reference to the relevant RealReg.
+         Transfer is always at size 4.  Arg3 holds this Temp/Real Reg.
+      */
+      SSE2g_RegWr,
+
+      /* 4 bytes, writes an integer register.  Insns of the form
+         bbbbbbbb:bbbbbbbb:11 ireg bbb :bbbbbbbb. Held in
+         val1[15:0] and val2[7:0] and lit32[7:0], and ireg is to be
+         replaced at codegen time by a reference to the relevant
+         RealReg.  Transfer is always at size 4.  Arg3 holds this
+         Temp/Real Reg.
+      */
+      SSE2g1_RegWr,
+
+      /* 4 bytes, reads an integer register.  Insns of the form
+         bbbbbbbb:bbbbbbbb:11 bbb ireg :bbbbbbbb. Held in
+         val1[15:0] and val2[7:0] and lit32[7:0], and ireg is to be
+         replaced at codegen time by a reference to the relevant
+         RealReg.  Transfer is always at size 2.  Arg3 holds this
+         Temp/Real Reg.
+      */
+      SSE2e1_RegRd,
+
       /* 4 bytes, no memrefs, no iregdefs, copy exactly to the
          output.  Held in val1[15:0] and val2[15:0]. */
       SSE4,
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index d21bb86..8f24e2a 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1076,7 +1076,10 @@
          }
 
 	 /* SSE ins referencing scalar integer registers */
-	 case SSE3g_RegWr:
+         case SSE2g_RegWr:
+         case SSE2g1_RegWr:
+         case SSE2e1_RegRd:
+         case SSE3g_RegWr:
          case SSE3e_RegRd:
          case SSE3e_RegWr: 
          case SSE3g1_RegWr:
diff --git a/none/tests/Makefile.am b/none/tests/Makefile.am
index 0dd6e89..c6e23d0 100644
--- a/none/tests/Makefile.am
+++ b/none/tests/Makefile.am
@@ -24,6 +24,9 @@
 	fpu_lazy_eflags.vgtest \
 	fucomip.stderr.exp fucomip.vgtest \
 	gxx304.stderr.exp gxx304.vgtest \
+	insn_mmx.stderr.exp insn_mmx.stdout.exp insn_mmx.vgtest \
+	insn_sse.stderr.exp insn_sse.stdout.exp insn_sse.vgtest \
+	insn_sse2.stderr.exp insn_sse2.stdout.exp insn_sse2.vgtest \
 	map_unmap.stdout.exp map_unmap.vgtest \
 	mremap.stdout.exp mremap.vgtest \
 	munmap_exe.stderr.exp munmap_exe.vgtest \
@@ -47,7 +50,8 @@
 check_PROGRAMS = \
 	args bitfield1 bt_everything bt_literal coolo_strlen \
 	cpuid dastest discard exec-sigmask floored fork fpu_lazy_eflags \
-	fucomip munmap_exe map_unmap mremap rcl_assert \
+	fucomip insn_mmx insn_sse insn_sse2 \
+	munmap_exe map_unmap mremap rcl_assert \
 	rcrl readline1 resolv seg_override sha1_test shortpush shorts smc1 \
 	pth_blockedsig \
 	syscall-restart1 syscall-restart2 \
@@ -71,6 +75,12 @@
 floored_LDADD 		= -lm
 fpu_lazy_eflags_SOURCES	= fpu_lazy_eflags.c
 fucomip_SOURCES 	= fucomip.c
+insn_mmx_SOURCES	= insn_mmx.def
+insn_mmx_LDADD		= -lm
+insn_sse_SOURCES	= insn_sse.def
+insn_sse_LDADD		= -lm
+insn_sse2_SOURCES	= insn_sse2.def
+insn_sse2_LDADD		= -lm
 map_unmap_SOURCES	= map_unmap.c
 mremap_SOURCES		= mremap.c
 munmap_exe_SOURCES 	= munmap_exe.c
@@ -99,3 +109,5 @@
 # must be built with these flags -- bug only occurred with them
 fpu_lazy_eflags.o: CFLAGS += -O2 -mcpu=pentiumpro -march=pentiumpro
 
+.def.c:
+	$(PERL) gen_insn_test.pl < $< > $@
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 8659f05..13660d7 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -12,12 +12,13 @@
 EXTRA_DIST = $(noinst_SCRIPTS)
 
 check_PROGRAMS = \
-	true
+	true \
+	cputest
 
 AM_CFLAGS   = $(WERROR) -Winline -Wall -Wshadow -g
 AM_CXXFLAGS = $(AM_CFLAGS)
 
 # generic C ones
 true_SOURCES 	= true.c
-
+cputest_SOURCES = cputest.c
 
diff --git a/tests/vg_regtest.in b/tests/vg_regtest.in
index 84182d8..965df82 100755
--- a/tests/vg_regtest.in
+++ b/tests/vg_regtest.in
@@ -49,6 +49,7 @@
 #   - vgopts: <Valgrind options>                    (default: none)
 #   - stdout_filter: <filter to run stdout through> (default: none)
 #   - stderr_filter: <filter to run stderr through> (default: ./filter_stderr)
+#   - cpu_test: <cpu feature required for test>     (default: none)
 #
 # Note that filters are necessary for stderr results to filter out things that
 # always change, eg. process id numbers.
@@ -80,6 +81,7 @@
 my $args;               # test prog args
 my $stdout_filter;      # filter program to run stdout results file through
 my $stderr_filter;      # filter program to run stderr results file through
+my $cpu_test;           # cpu feature to check for before running test
 
 my @failures;           # List of failed tests
 
@@ -165,8 +167,8 @@
     my ($f) = @_;
 
     # Defaults.
-    ($vgopts, $prog, $args, $stdout_filter, $stderr_filter) = 
-        ("", undef, "", undef, undef);
+    ($vgopts, $prog, $args, $stdout_filter, $stderr_filter, $cpu_test) = 
+        ("", undef, "", undef, undef, undef);
 
     # Every test directory must have a "filter_stderr"
     $stderr_filter = validate_program(".", $default_stderr_filter, 1);
@@ -184,6 +186,8 @@
             $stdout_filter = validate_program(".", $1, 1);
         } elsif ($line =~ /^\s*stderr_filter:\s*(.*)$/) {
             $stderr_filter = validate_program(".", $1, 1);
+        } elsif ($line =~ /^\s*cpu_test:\s*(.*)$/) {
+            $cpu_test = $1;
         } else {
             die "Bad line in $f: $line\n";
         }
@@ -222,6 +226,10 @@
 
     read_vgtest_file($vgtest);
 
+    if (defined $cpu_test) {
+        return unless system("../../tests/cputest $cpu_test") == 0;
+    }
+
     printf("%-16s valgrind $vgopts $prog $args\n", "$name:");
 
     # Pass the appropriate --tool option for the directory (can be overridden