diff --git a/addrcheck/ac_main.c b/addrcheck/ac_main.c
index 044f917..bcd255f 100644
--- a/addrcheck/ac_main.c
+++ b/addrcheck/ac_main.c
@@ -1048,6 +1048,8 @@
 
          case SSE3a_MemRd:
          case SSE2a_MemRd:
+         case SSE3a1_MemRd:
+         case SSE2a1_MemRd:
             helper = (Addr)ac_fpu_READ_check;
 	    goto do_Access_ARG3;
          case SSE2a_MemWr:
@@ -1067,12 +1069,6 @@
             VG_(copy_UInstr)(cb, u_in);
             break;
 
-         case SSE2a1_MemRd:
-         case SSE3a1_MemRd:
-	    VG_(pp_UInstr)(0,u_in);
-	    VG_(skin_panic)("AddrCheck: unhandled SSE uinstr");
-	    break;
-
          case SSE3e1_RegRd:
          case SSE3e_RegWr:
          case SSE3g1_RegWr:
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index 6264abf..4726c90 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -544,7 +544,7 @@
 
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
-            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
             t_read = u_in->val3;
             is_FPU_R = True;
             break;
@@ -556,7 +556,7 @@
             break;
 
          case SSE3a1_MemRd:
-            sk_assert(u_in->size == 16);
+            sk_assert(u_in->size == 8 || u_in->size == 16);
             t_read = u_in->val3;
             is_FPU_R = True;
             break;
@@ -577,7 +577,7 @@
             break;
 
          case SSE2a_MemWr:
-            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
             t_write = u_in->val3;
             is_FPU_W = True;
             break;
@@ -798,7 +798,7 @@
 
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
-            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
             t_read = u_in->val3;
             t_read_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
@@ -821,7 +821,7 @@
             break;
 
          case SSE3a1_MemRd:
-            sk_assert(u_in->size == 16);
+            sk_assert(u_in->size == 8 || u_in->size == 16);
             t_read = u_in->val3;
             t_read_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
@@ -861,7 +861,7 @@
             break;
 
          case SSE2a_MemWr:
-            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
            /* fall through */
          case SSE3a_MemWr:
             sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 7be34b1..bb24e69 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1518,6 +1518,78 @@
                   nameIReg(4,ireg) );
 }
 
+static void emit_SSE2e1 ( FlagSet uses_sflags, 
+                          FlagSet sets_sflags,
+                          UChar first_byte, 
+                          UChar second_byte, 
+                         UChar third_byte,   /* modrm template; only bits 3-5 (reg field) are kept */
+                          UChar fourth_byte,  /* emitted unchanged after the patched modrm byte */
+                          Int ireg )          /* integer register; its low 3 bits go into the rm field */
+{  /* Emit four bytes: first_byte, second_byte, the patched third_byte, then fourth_byte. */
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   third_byte &= 0x38; /* mask out mod and rm fields, leaving the reg field intact */
+   third_byte |= 0xC0; /* set top two bits: mod = 11b */
+   third_byte |= (ireg & 7); /* patch in our ireg as the rm field (bits 0-2) */
+   VG_(emitB) ( third_byte );
+   VG_(emitB) ( fourth_byte );
+   if (dis)  /* trace the bytes just emitted, plus the register name */
+      VG_(printf)(
+         "\n\t\tsse2e1--0x%x:0x%x:0x%x:0x%x-(%s)\n", 
+         (UInt)first_byte, (UInt)second_byte, 
+         (UInt)third_byte, (UInt)fourth_byte,
+         nameIReg(4,ireg) 
+      );
+}
+
+static void emit_SSE2g1 ( FlagSet uses_sflags, 
+                          FlagSet sets_sflags,
+                          UChar first_byte, 
+                          UChar second_byte, 
+                          UChar third_byte,   /* modrm template; its rm field (bits 0-2) is kept */
+                          UChar fourth_byte,  /* emitted unchanged after the patched modrm byte */
+                          Int ireg )          /* integer register; its low 3 bits go into the reg field */
+{  /* Emit four bytes: first_byte, second_byte, the patched third_byte, then fourth_byte. */
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   third_byte &= 0xC7; /* mask out reg field (bits 3-5) */
+   third_byte |= 0xC0; /* set top two bits: mod = 11b */
+   third_byte |= ((ireg & 7) << 3); /* patch in our ireg as the reg field */
+   VG_(emitB) ( third_byte );
+   VG_(emitB) ( fourth_byte );
+   if (dis)  /* trace the bytes just emitted, plus the register name */
+      VG_(printf)(
+         "\n\t\tsse2g1_reg_wr--0x%x:0x%x:0x%x:0x%x-(%s)\n", 
+         (UInt)first_byte, (UInt)second_byte, 
+         (UInt)third_byte, (UInt)fourth_byte,
+         nameIReg(4,ireg) 
+      );
+}
+
+static void emit_SSE2g ( FlagSet uses_sflags, 
+                         FlagSet sets_sflags,
+                         UChar first_byte, 
+                         UChar second_byte, 
+                        UChar third_byte,   /* modrm template; its rm field (bits 0-2) is kept */
+                         Int ireg )          /* integer register; its low 3 bits go into the reg field */
+{  /* Emit three bytes: first_byte, second_byte, then the patched third_byte. */
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   third_byte &= 0xC7; /* mask out reg field (bits 3-5) */
+   third_byte |= 0xC0; /* set top two bits: mod = 11b */
+   third_byte |= ((ireg & 7) << 3); /* patch in our ireg as the reg field */
+   VG_(emitB) ( third_byte );
+   if (dis)  /* trace the bytes just emitted, plus the register name */
+      VG_(printf)(
+         "\n\t\tsse2g--0x%x:0x%x:0x%x-(%s)\n", 
+         (UInt)first_byte, (UInt)second_byte, (UInt)third_byte,
+         nameIReg(4,ireg) 
+      );
+}
+
 static void emit_SSE2a1 ( FlagSet uses_sflags, 
                           FlagSet sets_sflags,
                           UChar first_byte, 
@@ -4075,7 +4147,8 @@
 
       case SSE2a_MemWr:
       case SSE2a_MemRd:
-         vg_assert(u->size == 4 || u->size == 16 || u->size == 512);
+         vg_assert(u->size == 4 || u->size == 8
+                   || u->size == 16 || u->size == 512);
          vg_assert(u->tag1 == Lit16);
          vg_assert(u->tag2 == Lit16);
          vg_assert(u->tag3 == RealReg);
@@ -4090,6 +4163,59 @@
                       u->val3 );
          break;
 
+      case SSE2g_RegWr:
+         vg_assert(u->size == 4);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE2g ( u->flags_r, u->flags_w,
+                      (u->val1 >> 8) & 0xFF,
+                      u->val1 & 0xFF,
+                      u->val2 & 0xFF,
+                      u->val3 );
+         break;
+
+      case SSE2g1_RegWr:
+         vg_assert(u->size == 4);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE2g1 ( u->flags_r, u->flags_w,
+                       (u->val1 >> 8) & 0xFF,
+                       u->val1 & 0xFF,
+                       u->val2 & 0xFF,
+                       u->lit32 & 0xFF,
+                       u->val3 );
+         break;
+
+      case SSE2e1_RegRd:
+         vg_assert(u->size == 2);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE2e1 ( u->flags_r, u->flags_w,
+                       (u->val1 >> 8) & 0xFF,
+                       u->val1 & 0xFF,
+                       u->val2 & 0xFF,
+                       u->lit32 & 0xFF,
+                       u->val3 );
+         break;
+
       case SSE2a1_MemRd:
          vg_assert(u->size == 4 || u->size == 16);
          vg_assert(u->tag1 == Lit16);
@@ -4194,7 +4320,7 @@
          break;
 
       case SSE3a1_MemRd:
-         vg_assert(u->size == 16);
+         vg_assert(u->size == 8 || u->size == 16);
          vg_assert(u->tag1 == Lit16);
          vg_assert(u->tag2 == Lit16);
          vg_assert(u->tag3 == RealReg);
@@ -4208,7 +4334,7 @@
                       u->val1 & 0xFF,
                       (u->val2 >> 8) & 0xFF,
                       u->val2 & 0xFF,
-                      (u->lit32 >> 8) & 0xFF,
+                      u->lit32 & 0xFF,
                       u->val3 );
          break;
 
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index 0d82f8f..bb474b9 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -3397,6 +3397,193 @@
    return eip;
 }
 
+
+/* Decode a simple SSE operation with 2 opcode bytes (opc1, opc2), either
+       op   (src)xmmreg, (dst)mmxreg      -- emitted as an SSE3 uinstr
+   or
+       op   (src)address, (dst)mmxreg     -- emitted as SSE2a_MemRd, size sz
+   name is used only in the trace printed when dis is set.
+   Supplied eip points to the first address mode byte; the updated eip
+   is returned. */
+static
+Addr dis_SSE2_to_MMX ( UCodeBlock *cb,
+                       UChar sorb,
+                       Addr eip,
+                       Int sz, 
+                       Char* name, 
+                       UChar opc1, 
+                       UChar opc2 )
+{
+   UChar dis_buf[50];
+   UChar modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      /* Completely internal SSE insn: no memory operand, so sz is unused. */
+      uInstr2(cb, SSE3, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)opc1) << 8) | (UShort)opc2,
+                  Lit16, (UShort)modrm );
+      if (dis)
+         VG_(printf)("%s %s, %s\n", name, 
+                     nameXMMReg(eregOfRM(modrm)), 
+                     nameMMXReg(gregOfRM(modrm)) );
+      eip++;  /* step past the modrm byte */
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);  /* passed below as the TempReg operand of the read */
+      eip += HI8(pair);  /* presumably the amode length in bytes -- confirm against disAMode */
+      uInstr3(cb, SSE2a_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, ((UShort)modrm),
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s %s, %s\n", 
+                     name,
+                     dis_buf,
+                     nameMMXReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
+/* Decode a simple SSE operation with 2 opcode bytes (opc1, opc2), either
+       op   (src)mmxreg, (dst)xmmreg      -- emitted as an SSE3 uinstr
+   or
+       op   (src)address, (dst)xmmreg     -- emitted as SSE2a_MemRd, size sz
+   name is used only in the trace printed when dis is set.
+   Supplied eip points to the first address mode byte; the updated eip
+   is returned. */
+static
+Addr dis_SSE2_from_MMX ( UCodeBlock *cb,
+                         UChar sorb,
+                         Addr eip,
+                         Int sz, 
+                         Char* name, 
+                         UChar opc1, 
+                         UChar opc2 )
+{
+   UChar dis_buf[50];
+   UChar modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      /* Completely internal SSE insn: no memory operand, so sz is unused. */
+      uInstr2(cb, SSE3, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)opc1) << 8) | (UShort)opc2,
+                  Lit16, (UShort)modrm );
+      if (dis)
+         VG_(printf)("%s %s, %s\n", name, 
+                     nameMMXReg(eregOfRM(modrm)), 
+                     nameXMMReg(gregOfRM(modrm)) );
+      eip++;  /* step past the modrm byte */
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);  /* passed below as the TempReg operand of the read */
+      eip += HI8(pair);  /* presumably the amode length in bytes -- confirm against disAMode */
+      uInstr3(cb, SSE2a_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, ((UShort)modrm),
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s %s, %s\n", 
+                     name,
+                     dis_buf,
+                     nameXMMReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
+/* Decode a simple SSE operation with 3 opcode bytes (opc1..opc3), either
+       op   (src)xmmreg, (dst)mmxreg      -- emitted as an SSE4 uinstr
+   or
+       op   (src)address, (dst)mmxreg     -- emitted as SSE3a_MemRd, size sz
+   name is used only in the trace printed when dis is set.
+   Supplied eip points to the first address mode byte; the updated eip
+   is returned. */
+static
+Addr dis_SSE3_to_MMX ( UCodeBlock *cb,
+                       UChar sorb,
+                       Addr eip,
+                       Int sz,
+                       Char* name, 
+                       UChar opc1, 
+                       UChar opc2, 
+                       UChar opc3 )
+{
+   UChar dis_buf[50];
+   UChar modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      /* Completely internal SSE insn: no memory operand, so sz is unused. */
+      uInstr2(cb, SSE4, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)opc1) << 8) | (UShort)opc2,
+                  Lit16, (((UShort)opc3) << 8) | (UShort)modrm );
+      if (dis)
+         VG_(printf)("%s %s, %s\n", name, 
+                     nameXMMReg(eregOfRM(modrm)), 
+                     nameMMXReg(gregOfRM(modrm)) );
+      eip++;  /* step past the modrm byte */
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);  /* passed below as the TempReg operand of the read */
+      eip += HI8(pair);  /* presumably the amode length in bytes -- confirm against disAMode */
+      uInstr3(cb, SSE3a_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, (((UShort)(opc3)) << 8) | ((UShort)modrm),
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s %s, %s\n", 
+                     name,
+                     dis_buf,
+                     nameMMXReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
+/* Decode a simple SSE operation with 3 opcode bytes (opc1..opc3), either
+       op   (src)mmxreg, (dst)xmmreg      -- emitted as an SSE4 uinstr
+   or
+       op   (src)address, (dst)xmmreg     -- emitted as SSE3a_MemRd, size sz
+   name is used only in the trace printed when dis is set.
+   Supplied eip points to the first address mode byte; the updated eip
+   is returned. */
+static
+Addr dis_SSE3_from_MMX ( UCodeBlock *cb,
+                         UChar sorb,
+                         Addr eip,
+                         Int sz,
+                         Char* name, 
+                         UChar opc1, 
+                         UChar opc2, 
+                         UChar opc3 )
+{
+   UChar dis_buf[50];
+   UChar modrm = getUChar(eip);
+   if (epartIsReg(modrm)) {
+      /* Completely internal SSE insn: no memory operand, so sz is unused. */
+      uInstr2(cb, SSE4, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)opc1) << 8) | (UShort)opc2,
+                  Lit16, (((UShort)opc3) << 8) | (UShort)modrm );
+      if (dis)
+         VG_(printf)("%s %s, %s\n", name, 
+                     nameMMXReg(eregOfRM(modrm)), 
+                     nameXMMReg(gregOfRM(modrm)) );
+      eip++;  /* step past the modrm byte */
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);  /* passed below as the TempReg operand of the read */
+      eip += HI8(pair);  /* presumably the amode length in bytes -- confirm against disAMode */
+      uInstr3(cb, SSE3a_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, (((UShort)(opc3)) << 8) | ((UShort)modrm),
+                  TempReg, tmpa);
+      if (dis)
+         VG_(printf)("%s %s, %s\n", 
+                     name,
+                     dis_buf,
+                     nameXMMReg(gregOfRM(modrm)));
+   }
+   return eip;
+}
+
+
 static 
 void dis_push_segreg ( UCodeBlock* cb, UInt sreg, Int sz )
 {
@@ -3596,6 +3783,57 @@
       goto decode_success;
    }
 
+   /* CLFLUSH -- flush cache line */
+   if (insn[0] == 0x0F && insn[1] == 0xAE
+       && (!epartIsReg(insn[2]))
+       && (gregOfRM(insn[2]) == 7))
+   {
+      vg_assert(sz == 4);
+      pair = disAMode ( cb, sorb, eip+2, dis?dis_buf:NULL );
+      t1   = LOW24(pair);
+      eip += 2+HI8(pair);
+      uInstr3(cb, SSE2a_MemRd, 0,  /* ignore sz for internal ops */
+                  Lit16, (((UShort)0x0F) << 8) | (UShort)0xAE,
+                  Lit16, (UShort)insn[2],
+                  TempReg, t1 );
+      if (dis)
+         VG_(printf)("clflush %s\n", dis_buf);
+      goto decode_success;
+   }
+
+   /* CVTPI2PS (0x0F,0x2A) -- mm/m64, xmm */
+   /* CVTPI2PD (0x66,0x0F,0x2A) -- mm/m64, xmm */
+   if (insn[0] == 0x0F && insn[1] == 0x2A) {
+      if (sz == 4) {
+         eip = dis_SSE2_from_MMX
+                  ( cb, sorb, eip+2, 8, "cvtpi2ps",
+                        insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_from_MMX
+                  ( cb, sorb, eip+2, 8, "cvtpi2pd",
+                        0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
+   /* CVTTPS2PI (0x0F,0x2C) -- xmm/m64, mm */
+   /* CVTPS2PI (0x0F,0x2D) -- xmm/m64, mm */
+   /* CVTTPD2PI (0x66,0x0F,0x2C) -- xmm/m128, mm */
+   /* CVTPD2PI (0x66,0x0F,0x2D) -- xmm/m128, mm */
+   if (insn[0] == 0x0F
+       && (insn[1] == 0x2C || insn[1] == 0x2D)) {
+      if (sz == 4) {
+         eip = dis_SSE2_to_MMX
+                  ( cb, sorb, eip+2, 8, "cvt{t}ps2pi",
+                        insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_to_MMX
+                  ( cb, sorb, eip+2, 16, "cvt{t}pd2pi",
+                        0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
    /* CVTTSD2SI (0xF2,0x0F,0x2C) -- convert a double-precision float
       value in memory or xmm reg to int and put it in an ireg.
       Truncate. */
@@ -3695,6 +3933,20 @@
       goto decode_success;
    }
 
+   /* CVTPS2PD -- convert two packed floats to two packed doubles. */
+   /* 0x66: CVTPD2PS -- convert two packed doubles to two packed floats. */
+   if (insn[0] == 0x0F && insn[1] == 0x5A) {
+      vg_assert(sz == 2 || sz == 4);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 8, "cvtps2pd",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "cvtpd2ps",
+                                     0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
    /* CVTSS2SD -- convert one single float to double. */
    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
       vg_assert(sz == 4);
@@ -3711,6 +3963,60 @@
       goto decode_success;
    }
 
+   /* CVTDQ2PS -- convert four ints to four packed floats. */
+   /* 0x66: CVTPS2DQ -- convert four packed floats to four ints. */
+   if (insn[0] == 0x0F && insn[1] == 0x5B) {
+      vg_assert(sz == 2 || sz == 4);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "cvtdq2ps",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "cvtps2dq",
+                                         0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
+   /* CVTTPD2DQ (0x66,0x0F,0xE6) -- convert two packed doubles to two ints with truncation; trace name below is mislabelled. */
+   if (sz == 2
+       && insn[0] == 0x0F && insn[1] == 0xE6) {
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 8, "cvtpd2dq",
+                                      0x66, insn[0], insn[1] );
+      goto decode_success;
+   }
+
+   /* CVTPD2DQ (0xF2,0x0F,0xE6) -- convert two packed doubles to two ints; trace name below is mislabelled. */
+   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "cvttpd2dq",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* CVTDQ2PD -- convert two ints to two packed doubles. */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "cvtdq2pd",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* CVTTPS2DQ -- convert four packed floats to four ints with truncation. */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 16, "cvttps2dq",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* CMPSS -- compare scalar floats. */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+3, 8, "cmpss",
+                                       insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
    /* CMPSD -- compare scalar doubles. */
    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
       vg_assert(sz == 4);
@@ -3742,6 +4048,22 @@
       goto decode_success;
    }
 
+   /* PSHUFLW */
+   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
+      eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+3, 16, 
+                                           "pshuflw",
+                                           insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* PSHUFHW */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
+      eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+3, 16, 
+                                           "pshufhw",
+                                           insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
    /* PSHUFW */
    if (sz == 4
        && insn[0] == 0x0F && insn[1] == 0x70) {
@@ -3882,6 +4204,20 @@
       goto decode_success;
    }
 
+   /* MINPS */
+   /* 0x66: MINPD */
+   if (insn[0] == 0x0F && insn[1] == 0x5D) {
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "minps",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "minpd",
+                                         0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
    /* 0xF3: MAXSD */
    /* 0xF3: MAXSS */
    if ((insn[0] == 0xF2 || insn[0] == 0xF3) 
@@ -3944,11 +4280,17 @@
       goto decode_success;
    }
 
-   /* ORPD (src)xmmreg-or-mem, (dst)xmmreg */
-   if (sz == 2
-       && insn[0] == 0x0F && insn[1] == 0x56) {
+   /* ORPS */
+   /* 0x66: ORPD (src)xmmreg-or-mem, (dst)xmmreg */
+   if (insn[0] == 0x0F && insn[1] == 0x56) {
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "orps",
+                                         insn[0], insn[1] );
+      } else {
       eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "orpd",
                                       0x66, insn[0], insn[1] );
+      }
       goto decode_success;
    }
 
@@ -4013,23 +4355,25 @@
                                       0x66, insn[0], insn[1] );
       goto decode_success;
    }
-   /* 0xE0: PAVGB(src)xmmreg-or-mem, (dst)xmmreg, size 4 */
-   if (sz == 4
-       && insn[0] == 0x0F 
-       && insn[1] == 0xE0 ) {
-      eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "pavg{b,w}",
-                                      insn[0], insn[1] );
+ 
+   /* 0xF6: PSADBW(src)xmmreg-or-mem, (dst)xmmreg */
+   if (sz == 2
+       && insn[0] == 0x0F && insn[1] == 0xF6) {
+     eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "psadbw",
+                                      0x66, insn[0], insn[1] );
       goto decode_success;
    }
  
    /* 0x60: PUNPCKLBW (src)xmmreg-or-mem, (dst)xmmreg */
    /* 0x61: PUNPCKLWD (src)xmmreg-or-mem, (dst)xmmreg */
    /* 0x62: PUNPCKLDQ (src)xmmreg-or-mem, (dst)xmmreg */
+   /* 0x6C: PUNPCKLQDQ (src)xmmreg-or-mem, (dst)xmmreg */
    if (sz == 2
        && insn[0] == 0x0F 
-       && (insn[1] == 0x60 || insn[1] == 0x61 || insn[1] == 0x62)) {
+       && (insn[1] == 0x60 || insn[1] == 0x61
+           || insn[1] == 0x62 || insn[1] == 0x6C)) {
       eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
-                                      "punpckl{bw,wd,dq}",
+                                      "punpckl{bw,wd,dq,qdq}",
                                       0x66, insn[0], insn[1] );
       goto decode_success;
    }
@@ -4037,11 +4381,13 @@
    /* 0x68: PUNPCKHBW (src)xmmreg-or-mem, (dst)xmmreg */
    /* 0x69: PUNPCKHWD (src)xmmreg-or-mem, (dst)xmmreg */
    /* 0x6A: PUNPCKHDQ (src)xmmreg-or-mem, (dst)xmmreg */
+   /* 0x6D: PUNPCKHQDQ (src)xmmreg-or-mem, (dst)xmmreg */
    if (sz == 2
        && insn[0] == 0x0F 
-       && (insn[1] == 0x68 || insn[1] == 0x69 || insn[1] == 0x6A)) {
+       && (insn[1] == 0x68 || insn[1] == 0x69
+           || insn[1] == 0x6A || insn[1] == 0x6D)) {
       eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
-                                      "punpckh{bw,wd,dq}",
+                                      "punpckh{bw,wd,dq,qdq}",
                                       0x66, insn[0], insn[1] );
       goto decode_success;
    }
@@ -4142,12 +4488,21 @@
       goto decode_success;
    }
 
+   /* 0xE4: PMULHUW(src)xmmreg-or-mem, (dst)xmmreg */
    /* 0xE5: PMULHW(src)xmmreg-or-mem, (dst)xmmreg */
    /* 0xD5: PMULLW(src)xmmreg-or-mem, (dst)xmmreg */
    if (sz == 2
        && insn[0] == 0x0F 
-       && (insn[1] == 0xE5 || insn[1] == 0xD5)) {
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "pmul{h,l}w",
+       && (insn[1] == 0xE4 || insn[1] == 0xE5 || insn[1] == 0xD5)) {
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "pmul{hu,h,l}w",
+                                      0x66, insn[0], insn[1] );
+      goto decode_success;
+   }
+
+   /* 0xF4: PMULUDQ(src)xmmreg-or-mem, (dst)xmmreg */
+   if (sz == 2
+       && insn[0] == 0x0F && insn[1] == 0xF4) {
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "pmuludq",
                                       0x66, insn[0], insn[1] );
       goto decode_success;
    }
@@ -4291,6 +4646,26 @@
       goto decode_success;
    }
 
+   /* MOVDQ2Q -- move low 8 bytes of XMM reg to MMX reg. */
+   if (insn[0] == 0xF2
+       && insn[1] == 0x0F
+       && insn[2] == 0xD6) {
+      eip = dis_SSE3_to_MMX
+               ( cb, sorb, eip+3, 8, "movdq2q",
+                     insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* MOVQ2DQ -- move MMX reg to low 8 bytes of XMM reg. */
+   if (insn[0] == 0xF3
+       && insn[1] == 0x0F
+       && insn[2] == 0xD6) {
+      eip = dis_SSE3_from_MMX
+               ( cb, sorb, eip+3, 8, "movq2dq",
+                     insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
    /* MOVSS -- move 4 bytes of XMM reg to/from XMM reg or mem. */
    if (insn[0] == 0xF3
        && insn[1] == 0x0F 
@@ -4356,19 +4731,6 @@
       goto decode_success;
    }
 
-   /* MOVLPD -- 8-byte load/store. */
-   if (sz == 2 
-       && insn[0] == 0x0F 
-       && (insn[1] == 0x12 || insn[1] == 0x13)) {
-      Bool is_store = insn[1]==0x13;
-      /* Cannot be used for reg-reg moves, according to Intel docs. */
-      vg_assert(!epartIsReg(insn[2]));
-      eip = dis_SSE3_load_store_or_mov
-               (cb, sorb, eip+2, 8, is_store, "movlpd", 
-                    0x66, insn[0], insn[1] );
-      goto decode_success;
-   }
-
    /* MOVDQU -- unaligned 16-byte load/store. */
    if (insn[0] == 0xF3
        && insn[1] == 0x0F 
@@ -4522,20 +4884,6 @@
       goto decode_success;
    }
 
-   /* MOVLPS -- 8-byte load/store.  How is this different from MOVLPS
-      ? */
-   if (insn[0] == 0x0F 
-       && (insn[1] == 0x12 || insn[1] == 0x13)) {
-      Bool is_store = insn[1]==0x13;
-      vg_assert(sz == 4);
-      /* Cannot be used for reg-reg moves, according to Intel docs. */
-      //      vg_assert(!epartIsReg(insn[2]));
-      eip = dis_SSE2_load_store_or_mov
-               (cb, sorb, eip+2, 8, is_store, "movlps", 
-                    insn[0], insn[1] );
-      goto decode_success;
-   }
-
    /* 0xF3: RCPSS -- reciprocal of scalar float */
    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
       vg_assert(sz == 4);
@@ -4547,19 +4895,31 @@
 
    /* MOVMSKPD -- extract 2 sign bits from a xmm reg and copy them to 
       an ireg.  Top 30 bits of ireg are set to zero. */
-   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x50) {
+   /* MOVMSKPS -- extract 4 sign bits from a xmm reg and copy them to 
+      an ireg.  Top 28 bits of ireg are set to zero. */
+   if (insn[0] == 0x0F && insn[1] == 0x50) {
+      vg_assert(sz == 4 || sz == 2);
       modrm = insn[2];
       /* Intel docs don't say anything about a memory source being
 	 allowed here. */
       vg_assert(epartIsReg(modrm));
       t1 = newTemp(cb);
-      uInstr3(cb, SSE3g_RegWr, 4,
-                  Lit16, (((UShort)0x66) << 8) | (UShort)insn[0],
-                  Lit16, (((UShort)insn[1]) << 8) | (UShort)modrm,
-                  TempReg, t1 );
-      uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+      if (sz == 4) {
+         uInstr3(cb, SSE2g_RegWr, 4,
+                     Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
+                     Lit16, (UShort)modrm,
+                     TempReg, t1 );
+         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+      } else {
+         uInstr3(cb, SSE3g_RegWr, 4,
+                     Lit16, (((UShort)0x66) << 8) | (UShort)insn[0],
+                     Lit16, (((UShort)insn[1]) << 8) | (UShort)modrm,
+                     TempReg, t1 );
+         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+      }
       if (dis)
-         VG_(printf)("movmskpd %s, %s\n", 
+         VG_(printf)("movmskp%c %s, %s\n",
+                      sz == 4 ? 's' : 'd',
                       nameXMMReg(eregOfRM(modrm)),
                       nameIReg(4,gregOfRM(modrm)));
       eip += 3;
@@ -4580,16 +4940,55 @@
       goto decode_success;
    }
 
-   /* MOVHPD -- 8-byte load/store. */
-   if (sz == 2 
-       && insn[0] == 0x0F 
+   /* MOVHLPS -- move two packed floats from high quadword to low quadword */
+   /* MOVLPS -- load/store two packed floats to/from low quadword. */
+   /* MOVLPD -- load/store packed double to/from low quadword. */
+   if (insn[0] == 0x0F 
+       && (insn[1] == 0x12 || insn[1] == 0x13)) {
+      Bool is_store = insn[1]==0x13;
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         if (epartIsReg(insn[2])) {
+            vg_assert(insn[1]==0x12);
+            eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "movhlps",
+                                            insn[0], insn[1] );
+         } else {
+            eip = dis_SSE2_load_store_or_mov
+                     (cb, sorb, eip+2, 8, is_store, "movlps", 
+                          insn[0], insn[1] );
+         }
+      } else {
+         vg_assert(!epartIsReg(insn[2]));
+         eip = dis_SSE3_load_store_or_mov
+                  (cb, sorb, eip+2, 8, is_store, "movlpd", 
+                       0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
+   /* MOVLHPS -- move two packed floats from low quadword to high quadword */
+   /* MOVHPS -- load/store two packed floats to/from high quadword. */
+   /* MOVHPD -- load/store packed double to/from high quadword. */
+   if (insn[0] == 0x0F 
        && (insn[1] == 0x16 || insn[1] == 0x17)) {
       Bool is_store = insn[1]==0x17;
-      /* Cannot be used for reg-reg moves, according to Intel docs. */
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         if (epartIsReg(insn[2])) {
+            vg_assert(insn[1]==0x16);
+            eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "movlhps",
+                                            insn[0], insn[1] );
+         } else {
+            eip = dis_SSE2_load_store_or_mov
+                     (cb, sorb, eip+2, 8, is_store, "movhps", 
+                          insn[0], insn[1] );
+         }
+      } else {
       vg_assert(!epartIsReg(insn[2]));
       eip = dis_SSE3_load_store_or_mov
                (cb, sorb, eip+2, 8, is_store, "movhpd", 
                     0x66, insn[0], insn[1] );
+      }
       goto decode_success;
    }
 
@@ -4614,28 +5013,28 @@
       goto decode_success;
    }
 
-   /* CVTDQ2PD -- convert one single double. to float. */
-   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
-      vg_assert(sz == 4);
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "cvtdq2pd",
-                                      insn[0], insn[1], insn[2] );
-      goto decode_success;
-   }
-
-   /* CVTPD2PS -- convert two doubles to two floats. */
-   if (sz == 2 &&
-       insn[0] == 0x0F && insn[1] == 0x5A) {
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "cvtpd2ps",
+   /* sz==4: SQRTPS: square root of packed float. */
+   /* sz==2: SQRTPD: square root of packed double. */
+   if (insn[0] == 0x0F && insn[1] == 0x51) {
+      vg_assert(sz == 2 || sz == 4);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, 
+                                         "sqrtps",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
+                                         "sqrtpd",
                                  0x66, insn[0], insn[1] );
+      }
       goto decode_success;
    }
 
-   /* SQRTPD: square root of packed double. */
-   if (sz == 2
-       && insn[0] == 0x0F && insn[1] == 0x51) {
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
-                                      "sqrtpd",
-                                      0x66, insn[0], insn[1] );
+   /* RSQRTPS: reciprocal square root of packed floats. */
+   if (insn[0] == 0x0F && insn[1] == 0x52) {
+      vg_assert(sz == 4);
+      eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, 
+                                      "rsqrtps",
+                                      insn[0], insn[1] );
       goto decode_success;
    }
 
@@ -6072,6 +6471,24 @@
          eip = dis_movx_E_G ( cb, sorb, eip, 2, 4, True );
          break;
 
+      /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
+
+      case 0xC3: /* MOVNTI Gv,Ev */
+         vg_assert(sz == 4);
+         modrm = getUChar(eip);
+         vg_assert(!epartIsReg(modrm));
+         t1 = newTemp(cb);
+         uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
+         pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+         t2 = LOW24(pair);
+         eip += HI8(pair);
+         uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
+         if (dis)
+           VG_(printf)("movnti %s,%s\n",
+                       nameIReg(4,gregOfRM(modrm)),
+                       dis_buf);
+         break;
+
       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
 
       case 0xAF: /* IMUL Ev, Gv */
@@ -6428,6 +6845,12 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "padd", True );
          break;
 
+      case 0xD4: 
+         /* PADDQ (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "paddq", False );
+         break;
+
       case 0xEC: case 0xED:
          /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
          vg_assert(sz == 4);
@@ -6440,7 +6863,7 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "paddus", True );
          break;
 
-      case 0xF8: case 0xF9: case 0xFA:
+      case 0xF8: case 0xF9: case 0xFA: case 0xFB:
          /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
          vg_assert(sz == 4);
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psub", True );
@@ -6458,6 +6881,11 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psubus", True );
          break;
 
+      case 0xE4: /* PMULHUW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmulhuw", False );
+         break;
+
       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
          vg_assert(sz == 4);
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmulhw", False );
@@ -6468,6 +6896,11 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmullw", False );
          break;
 
+      case 0xF4: /* PMULUDQ (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmuludq", False );
+         break;
+
       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
          vg_assert(sz == 4);
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmaddwd", False );
@@ -6550,6 +6983,105 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psra", True );
          break;
 
+      case 0xDA:
+         /* PMINUB (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pminub", False );
+         break;
+
+      case 0xDE:
+         /* PMAXUB (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmaxub", False );
+         break;
+
+      case 0xEA:
+         /* PMINSW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pminsw", False );
+         break;
+
+      case 0xEE:
+         /* PMAXSW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pmaxsw", False );
+         break;
+
+      case 0xE0:
+         /* PAVGB (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pavgb", False );
+         break;
+
+      case 0xE3:
+         /* PAVGW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "pavgw", False );
+         break;
+
+      case 0xF6:
+         /* PSADBW (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psadbw", False );
+         break;
+
+      case 0xD7:
+         /* PMOVMSKB (src)mmxreg, (dst)ireg */
+         vg_assert(sz == 4);
+         modrm = getUChar(eip);
+         vg_assert(epartIsReg(modrm));
+         t1 = newTemp(cb);
+         uInstr3(cb, SSE2g_RegWr, 4,
+                     Lit16, (((UShort)(0x0F)) << 8) | (UShort)(opc),
+                     Lit16, (UShort)modrm,
+                     TempReg, t1 );
+         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+         if (dis)
+            VG_(printf)("pmovmskb %s, %s\n", 
+                        nameMMXReg(eregOfRM(modrm)),
+                        nameIReg(4,gregOfRM(modrm)));
+         eip++;         
+         break;
+
+      case 0xC5:
+         /* PEXTRW (src)mmxreg, (dst)ireg */
+         vg_assert(sz == 4);
+         t1 = newTemp(cb);
+         modrm = getUChar(eip); eip++;
+         abyte = getUChar(eip); eip++;
+         vg_assert(epartIsReg(modrm));
+         uInstr3(cb, SSE2g1_RegWr, 4,
+                     Lit16, (((UShort)(0x0F)) << 8) | (UShort)(opc),
+                     Lit16, (UShort)modrm,
+                     TempReg, t1 );
+         uLiteral(cb, abyte);
+         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+         if (dis)
+            VG_(printf)("pextrw %s, %d, %s\n",
+                        nameMMXReg(eregOfRM(modrm)), (Int)abyte, 
+                        nameIReg(4, gregOfRM(modrm)));
+         break;
+
+      case 0xC4:
+         /* PINSRW (src)ireg, (dst)mmxreg */
+         vg_assert(sz == 4);
+         t1 = newTemp(cb);
+         modrm = getUChar(eip); eip++;
+         abyte = getUChar(eip); eip++;
+         vg_assert(epartIsReg(modrm));
+         uInstr2(cb, GET, 2, ArchReg, eregOfRM(modrm), TempReg, t1);
+         uInstr3(cb, SSE2e1_RegRd, 2,
+                     Lit16, (((UShort)(0x0F)) << 8) | (UShort)(opc),
+                     Lit16, (UShort)modrm,
+                     TempReg, t1 );
+         uLiteral(cb, abyte);
+         if (dis)
+            VG_(printf)("pinsrw %s, %d, %s\n",
+                        nameIReg(2, eregOfRM(modrm)),
+                        (Int)abyte, 
+                        nameMMXReg(gregOfRM(modrm)));
+         break;
+
       case 0xA1: /* POP %FS */
          dis_pop_segreg( cb, R_FS, sz ); break;
       case 0xA9: /* POP %GS */
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 0a231c1..b0f9d70 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -414,7 +414,8 @@
 #  define SZ42 (u->size == 4 || u->size == 2)
 #  define SZ48 (u->size == 4 || u->size == 8)
 #  define SZ416 (u->size == 4 || u->size == 16)
-#  define SZsse2 (u->size == 4 || u->size == 16 || u->size == 512)
+#  define SZ816 (u->size == 8 || u->size == 16)
+#  define SZsse2 (u->size == 4 || u->size == 8 || u->size == 16 || u->size == 512)
 #  define SZsse3 (u->size == 4 || u->size == 8 || u->size == 16)
 #  define SZi  (u->size == 4 || u->size == 2 || u->size == 1)
 #  define SZf  (  u->size ==  4 || u->size ==  8 || u->size ==   2     \
@@ -567,11 +568,14 @@
    case SSE2a_MemWr:  return LIT0 && SZsse2 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2a_MemRd:  return LIT0 && SZsse2 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2a1_MemRd: return LIT0 && SZ416  && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE2g_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE2g1_RegWr: return LIT8 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE2e1_RegRd: return LIT8 && SZ2    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3a_MemWr:  return LIT0 && SZsse3 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3a_MemRd:  return LIT0 && SZsse3 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e_RegRd:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3a1_MemRd: return LIT8 && SZ16   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3a1_MemRd: return LIT8 && SZ816  && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3g_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3g1_RegWr: return LIT8 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e1_RegRd: return LIT8 && SZ2    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
@@ -897,7 +901,10 @@
       case MMX2_ERegWr: return "MMX2_eRWr" ;
       case SSE2a_MemWr: return "SSE2a_MWr";
       case SSE2a_MemRd: return "SSE2a_MRd";
+      case SSE2g_RegWr: return "SSE2g_RWr";
       case SSE2a1_MemRd: return "SSE2a1_MRd";
+      case SSE2g1_RegWr: return "SSE2g1_RWr";
+      case SSE2e1_RegRd: return "SSE2e1_RRd";
       case SSE3e_RegRd: return "SSE3e_RRd";
       case SSE3e_RegWr: return "SSE3e_RWr";
       case SSE3g_RegWr: return "SSE3g_RWr";
@@ -1062,6 +1069,9 @@
 
       case SSE2a_MemWr:
       case SSE2a_MemRd:
+      case SSE2g_RegWr:
+      case SSE2g1_RegWr:
+      case SSE2e1_RegRd:
          VG_(printf)("0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
          VG_(pp_UOperand)(u, 3, 4, True);
@@ -1270,6 +1280,7 @@
 
       case SSE3a1_MemRd:
       case SSE2a1_MemRd:
+      case SSE2e1_RegRd:
       case SSE3e_RegRd:
       case SSE3a_MemWr:
       case SSE3a_MemRd:
@@ -1277,6 +1288,8 @@
       case SSE3e1_RegRd:
       case SSE2a_MemRd: RD(3); break;
 
+      case SSE2g_RegWr:
+      case SSE2g1_RegWr:
       case SSE3e_RegWr:
       case SSE3g1_RegWr:
       case SSE3g_RegWr: WR(3); break;
@@ -1441,6 +1454,7 @@
       case MMX2_MemRd: case MMX2_MemWr:
       case MMX2_ERegRd: case MMX2_ERegWr:
       case SSE2a_MemWr: case SSE2a_MemRd: case SSE2a1_MemRd:
+      case SSE2g_RegWr: case SSE2g1_RegWr: case SSE2e1_RegRd:
       case SSE3a_MemWr: case SSE3a_MemRd: case SSE3a1_MemRd:
       case SSE3e_RegRd: case SSE3g_RegWr: case SSE3e_RegWr:
       case SSE3g1_RegWr: case SSE3e1_RegRd:
diff --git a/include/vg_skin.h.base b/include/vg_skin.h.base
index 52fc2c8..324163b 100644
--- a/include/vg_skin.h.base
+++ b/include/vg_skin.h.base
@@ -710,6 +710,32 @@
       SSE2a_MemRd,
       SSE2a_MemWr,
 
+      /* 4 bytes, writes an integer register.  Insns of the form
+         bbbbbbbb:bbbbbbbb:11 ireg bbb.
+         Held in val1[15:0] and val2[7:0], and ireg is to be replaced
+         at codegen time by a reference to the relevant RealReg.
+         Transfer is always at size 4.  Arg3 holds this Temp/Real Reg.
+      */
+      SSE2g_RegWr,
+
+      /* 5 bytes, writes an integer register.  Insns of the form
+         bbbbbbbb:bbbbbbbb:11 ireg bbb :bbbbbbbb. Held in
+         val1[15:0] and val2[7:0] and lit32[7:0], and ireg is to be
+         replaced at codegen time by a reference to the relevant
+         RealReg.  Transfer is always at size 4.  Arg3 holds this
+         Temp/Real Reg.
+      */
+      SSE2g1_RegWr,
+
+      /* 5 bytes, reads an integer register.  Insns of the form
+         bbbbbbbb:bbbbbbbb:11 bbb ireg :bbbbbbbb. Held in
+         val1[15:0] and val2[7:0] and lit32[7:0], and ireg is to be
+         replaced at codegen time by a reference to the relevant
+         RealReg.  Transfer is always at size 2.  Arg3 holds this
+         Temp/Real Reg.
+      */
+      SSE2e1_RegRd,
+
       /* 4 bytes, no memrefs, no iregdefs, copy exactly to the
          output.  Held in val1[15:0] and val2[15:0]. */
       SSE4,
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index d21bb86..8f24e2a 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1076,7 +1076,10 @@
          }
 
 	 /* SSE ins referencing scalar integer registers */
-	 case SSE3g_RegWr:
+         case SSE2g_RegWr:
+         case SSE2g1_RegWr:
+         case SSE2e1_RegRd:
+         case SSE3g_RegWr:
          case SSE3e_RegRd:
          case SSE3e_RegWr: 
          case SSE3g1_RegWr:
diff --git a/none/tests/Makefile.am b/none/tests/Makefile.am
index 0dd6e89..c6e23d0 100644
--- a/none/tests/Makefile.am
+++ b/none/tests/Makefile.am
@@ -24,6 +24,9 @@
 	fpu_lazy_eflags.vgtest \
 	fucomip.stderr.exp fucomip.vgtest \
 	gxx304.stderr.exp gxx304.vgtest \
+	insn_mmx.stderr.exp insn_mmx.stdout.exp insn_mmx.vgtest \
+	insn_sse.stderr.exp insn_sse.stdout.exp insn_sse.vgtest \
+	insn_sse2.stderr.exp insn_sse2.stdout.exp insn_sse2.vgtest \
 	map_unmap.stdout.exp map_unmap.vgtest \
 	mremap.stdout.exp mremap.vgtest \
 	munmap_exe.stderr.exp munmap_exe.vgtest \
@@ -47,7 +50,8 @@
 check_PROGRAMS = \
 	args bitfield1 bt_everything bt_literal coolo_strlen \
 	cpuid dastest discard exec-sigmask floored fork fpu_lazy_eflags \
-	fucomip munmap_exe map_unmap mremap rcl_assert \
+	fucomip insn_mmx insn_sse insn_sse2 \
+	munmap_exe map_unmap mremap rcl_assert \
 	rcrl readline1 resolv seg_override sha1_test shortpush shorts smc1 \
 	pth_blockedsig \
 	syscall-restart1 syscall-restart2 \
@@ -71,6 +75,12 @@
 floored_LDADD 		= -lm
 fpu_lazy_eflags_SOURCES	= fpu_lazy_eflags.c
 fucomip_SOURCES 	= fucomip.c
+insn_mmx_SOURCES	= insn_mmx.def
+insn_mmx_LDADD		= -lm
+insn_sse_SOURCES	= insn_sse.def
+insn_sse_LDADD		= -lm
+insn_sse2_SOURCES	= insn_sse2.def
+insn_sse2_LDADD		= -lm
 map_unmap_SOURCES	= map_unmap.c
 mremap_SOURCES		= mremap.c
 munmap_exe_SOURCES 	= munmap_exe.c
@@ -99,3 +109,5 @@
 # must be built with these flags -- bug only occurred with them
 fpu_lazy_eflags.o: CFLAGS += -O2 -mcpu=pentiumpro -march=pentiumpro
 
+.def.c:
+	$(PERL) gen_insn_test.pl < $< > $@
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 8659f05..13660d7 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -12,12 +12,13 @@
 EXTRA_DIST = $(noinst_SCRIPTS)
 
 check_PROGRAMS = \
-	true
+	true \
+	cputest
 
 AM_CFLAGS   = $(WERROR) -Winline -Wall -Wshadow -g
 AM_CXXFLAGS = $(AM_CFLAGS)
 
 # generic C ones
 true_SOURCES 	= true.c
-
+cputest_SOURCES = cputest.c
 
diff --git a/tests/vg_regtest.in b/tests/vg_regtest.in
index 84182d8..965df82 100755
--- a/tests/vg_regtest.in
+++ b/tests/vg_regtest.in
@@ -49,6 +49,7 @@
 #   - vgopts: <Valgrind options>                    (default: none)
 #   - stdout_filter: <filter to run stdout through> (default: none)
 #   - stderr_filter: <filter to run stderr through> (default: ./filter_stderr)
+#   - cpu_test: <cpu feature required for test>     (default: none)
 #
 # Note that filters are necessary for stderr results to filter out things that
 # always change, eg. process id numbers.
@@ -80,6 +81,7 @@
 my $args;               # test prog args
 my $stdout_filter;      # filter program to run stdout results file through
 my $stderr_filter;      # filter program to run stderr results file through
+my $cpu_test;           # cpu feature to check for before running test
 
 my @failures;           # List of failed tests
 
@@ -165,8 +167,8 @@
     my ($f) = @_;
 
     # Defaults.
-    ($vgopts, $prog, $args, $stdout_filter, $stderr_filter) = 
-        ("", undef, "", undef, undef);
+    ($vgopts, $prog, $args, $stdout_filter, $stderr_filter, $cpu_test) = 
+        ("", undef, "", undef, undef, undef);
 
     # Every test directory must have a "filter_stderr"
     $stderr_filter = validate_program(".", $default_stderr_filter, 1);
@@ -184,6 +186,8 @@
             $stdout_filter = validate_program(".", $1, 1);
         } elsif ($line =~ /^\s*stderr_filter:\s*(.*)$/) {
             $stderr_filter = validate_program(".", $1, 1);
+        } elsif ($line =~ /^\s*cpu_test:\s*(.*)$/) {
+            $cpu_test = $1;
         } else {
             die "Bad line in $f: $line\n";
         }
@@ -222,6 +226,10 @@
 
     read_vgtest_file($vgtest);
 
+    if (defined $cpu_test) {
+        return unless system("../../tests/cputest $cpu_test") == 0;
+    }
+
     printf("%-16s valgrind $vgopts $prog $args\n", "$name:");
 
     # Pass the appropriate --tool option for the directory (can be overridden
