Move the handling of PSHUFW from the SSE code to the MMX code so that
it will work on older Athlons which only have MMXEXT support.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@2319 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index 2f83e76..a0641dd 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -604,6 +604,13 @@
             is_FPU_R = True;
             break;
 
+         case MMX2a1_MemRd:
+            sk_assert(u_in->size == 8);
+            sk_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            t_read = u_in->val3;
+            is_FPU_R = True;
+            break;
+
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
             sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
@@ -857,6 +864,18 @@
                         : MIN_LINE_SIZE);
             VG_(copy_UInstr)(cb, u_in);
             break;
+
+         case MMX2a1_MemRd:
+            /* 8-byte MMX memory read; the address temp is in val3. */
+            sk_assert(u_in->size == 8);
+            t_read      = u_in->val3;
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
+            data_size = ( u_in->size <= MIN_LINE_SIZE
+                        ? u_in->size
+                        : MIN_LINE_SIZE);
+            VG_(copy_UInstr)(cb, u_in);
+            break;
 
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 859d098..211f58e 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1497,6 +1497,28 @@
                   nameIReg(4,ireg) );
 }
 
+static void emit_MMX2a1 ( FlagSet uses_sflags, 
+                          FlagSet sets_sflags,
+			  UChar first_byte, 
+                          UChar second_byte, 
+                          UChar third_byte, 
+                          Int ireg )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+
+   boundscheck();
+
+   VG_(emitB) ( 0x0F );
+   VG_(emitB) ( first_byte );
+   second_byte &= 0x38; /* mask out mod and rm fields */
+   emit_amode_regmem_reg ( ireg, second_byte >> 3 );
+   VG_(emitB) ( third_byte );
+   if (dis)
+      VG_(printf)("\n\t\tmmx2a1-0x%x:0x%x:0x%x-(%s)\n", 
+                  (UInt)first_byte, (UInt)second_byte, (UInt)third_byte,
+                  nameIReg(4,ireg) );
+}
+
 static void emit_SSE2a ( FlagSet uses_sflags, 
                          FlagSet sets_sflags,
                          UChar first_byte, 
@@ -3273,6 +3295,17 @@
 }
 
 
+static void synth_MMX2a1 ( Bool uses_flags, Bool sets_flags,
+ 			   UChar first_byte,
+                           UChar second_byte, 
+                           UChar third_byte, 
+                           Int ireg )
+{
+   emit_MMX2a1 ( uses_flags, sets_flags, 
+                 first_byte, second_byte, third_byte, ireg );
+}
+
+
 static void synth_MMX2_reg_to_mmxreg ( Bool uses_flags, Bool sets_flags,
                                        UChar first_byte,
                                        UChar second_byte, 
@@ -4076,6 +4109,23 @@
                              u->val2 );
          break;
 
+      case MMX2a1_MemRd:
+         vg_assert(u->size == 8);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         synth_MMX2a1 ( u->flags_r, u->flags_w,
+                        (u->val1 >> 8) & 0xFF,
+                        u->val1 & 0xFF,
+                        u->val2 & 0xFF,
+                        u->val3 );
+         break;
+
       case MMX2_ERegRd:
          vg_assert(u->tag1 == Lit16);
          vg_assert(u->tag2 == RealReg);
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index a63a4a5..41ea9c2 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -3289,6 +3289,57 @@
 }
 
 
+/* Simple MMX operations taking a trailing imm8 byte, either 
+       op   imm8, (src)mmxreg, (dst)mmxreg
+   or
+       op   imm8, (src)address, (dst)mmxreg
+   opc is the byte following the 0x0F prefix.
+*/
+static 
+Addr dis_MMXop_regmem_to_reg_Imm8 ( UCodeBlock* cb, 
+                                    UChar sorb,
+                                    Addr eip,
+                                    UChar opc,
+                                    Char* name,
+                                    Bool show_granularity )
+{
+    Char dis_buf[50];
+   UChar modrm = getUChar(eip);
+   UChar imm8;
+   Bool  isReg = epartIsReg(modrm);
+
+   if (isReg) {
+      eip++;
+      imm8 = getUChar(eip);
+      eip++;
+      uInstr2(cb, MMX3, 0, 
+                  Lit16, 
+                  (((UShort)(opc)) << 8) | ((UShort)modrm),
+                  Lit16,
+                  ((UShort)imm8));
+   } else {
+      UInt pair = disAMode ( cb, sorb, eip, dis_buf );
+      Int  tmpa = LOW24(pair);
+      eip += HI8(pair);
+      imm8 = getUChar(eip);
+      eip++;
+      uInstr3(cb, MMX2a1_MemRd, 8, 
+                  Lit16, 
+                  (((UShort)(opc)) << 8) | ((UShort)modrm),
+                  Lit16,
+                  ((UShort)imm8),
+                  TempReg, tmpa);
+   }
+
+   DIP("%s%s %s, %s, $%d\n", 
+       name, show_granularity ? nameMMXGran(opc & 3) : (Char*)"",
+       ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
+       nameMMXReg(gregOfRM(modrm)), (Int)imm8 );
+
+   return eip;
+}
+
+
 
 /* Simple SSE operations, either 
        op   (src)xmmreg, (dst)xmmreg
@@ -4217,15 +4268,6 @@
       goto decode_success;
    }
 
-   /* PSHUFW */
-   if (sz == 4
-       && insn[0] == 0x0F && insn[1] == 0x70) {
-      eip = dis_SSE2_reg_or_mem_Imm8 ( cb, sorb, eip+2, 8, 
-                                           "pshufw",
-                                           insn[0], insn[1] );
-      goto decode_success;
-   }
-
    /* SHUFPD */
    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
       eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+2, 16, "shufpd",
@@ -7143,6 +7185,12 @@
          eip = dis_MMXop_regmem_to_reg ( cb, sorb, eip, opc, "psadbw", False );
          break;
 
+      case 0x70:
+         /* PSHUFW imm8, (src)mmxreg-or-mem, (dst)mmxreg */
+         vg_assert(sz == 4);
+         eip = dis_MMXop_regmem_to_reg_Imm8 ( cb, sorb, eip, opc, "pshufw", False );
+         break;
+
       case 0xD7:
          /* PMOVMSKB (src)mmxreg, (dst)ireg */
          vg_assert(sz == 4);
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index faaf877..738c5f7 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -555,14 +555,15 @@
                        (u->argc > 1                   ? TR2 : N2) && 
                        (u->argc > 2 || u->has_ret_val ? TR3 : N3) &&
                        u->regparms_n <= u->argc && XCCALL;
-   /* Fields checked:     lit32   size  flags_r/w tag1   tag2   tag3    (rest) */
+   /* Fields checked:       lit32   size  flags_r/w tag1   tag2   tag3    (rest) */
    case MMX1:
-   case MMX2:        return LIT0 && SZ0  && CC0 &&  Ls1 &&  N2 &&  N3 && XOTHER;
-   case MMX3:        return LIT0 && SZ0  && CC0 &&  Ls1 && Ls2 &&  N3 && XOTHER;
-   case MMX2_MemRd:  return LIT0 && SZ48 && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
-   case MMX2_MemWr:  return LIT0 && SZ48 && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
-   case MMX2_ERegRd: return LIT0 && SZ4  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
-   case MMX2_ERegWr: return LIT0 && SZ4  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
+   case MMX2:         return LIT0 && SZ0  && CC0 &&  Ls1 &&  N2 &&  N3 && XOTHER;
+   case MMX3:         return LIT0 && SZ0  && CC0 &&  Ls1 && Ls2 &&  N3 && XOTHER;
+   case MMX2_MemRd:   return LIT0 && SZ48 && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
+   case MMX2_MemWr:   return LIT0 && SZ48 && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
+   case MMX2a1_MemRd: return LIT0 && SZ8  && CC0 &&  Ls1 && Ls2 && TR3 && XOTHER;
+   case MMX2_ERegRd:  return LIT0 && SZ4  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
+   case MMX2_ERegWr:  return LIT0 && SZ4  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
 
    /* Fields checked:        lit32   size  flags_r/w tag1   tag2   tag3    (rest) */
    case SSE2a_MemWr:  return LIT0 && SZsse2 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
@@ -897,6 +898,7 @@
       case MMX3:       return "MMX3" ;
       case MMX2_MemRd: return "MMX2_MRd" ;
       case MMX2_MemWr: return "MMX2_MWr" ;
+      case MMX2a1_MemRd: return "MMX2a1_MRd" ;
       case MMX2_ERegRd: return "MMX2_eRRd" ;
       case MMX2_ERegWr: return "MMX2_eRWr" ;
       case SSE2a_MemWr: return "SSE2a_MWr";
@@ -1067,6 +1069,12 @@
          VG_(pp_UOperand)(u, 2, 4, True);
          break;
 
+      case MMX2a1_MemRd:
+          VG_(printf)("0x%x:0x%x:0x%x",
+                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
+         VG_(pp_UOperand)(u, 3, 4, True);
+         break;
+
       case SSE2a_MemWr:
       case SSE2a_MemRd:
       case SSE2g_RegWr:
@@ -1296,6 +1304,7 @@
 
       case SSE3ag_MemRd_RegWr: RD(1); WR(2); break;
 
+      case MMX2a1_MemRd: RD(3); break;
       case MMX2_ERegRd: RD(2); break;
       case MMX2_ERegWr: WR(2); break;
 
@@ -1451,7 +1460,7 @@
       case JIFZ:
       case FPU: case FPU_R: case FPU_W:
       case MMX1: case MMX2: case MMX3:
-      case MMX2_MemRd: case MMX2_MemWr:
+      case MMX2_MemRd: case MMX2_MemWr: case MMX2a1_MemRd:
       case MMX2_ERegRd: case MMX2_ERegWr:
       case SSE2a_MemWr: case SSE2a_MemRd: case SSE2a1_MemRd:
       case SSE2g_RegWr: case SSE2g1_RegWr: case SSE2e1_RegRd:
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c
index 9b54604..a3516cf 100644
--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c
@@ -2145,6 +2145,23 @@
 	    break;
 	 } 
 
+         case MMX2a1_MemRd: {
+            sk_assert(8 == u_in->size);
+	    
+	    t_size = newTemp(cb);
+	    uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_size);
+	    uLiteral(cb, (UInt)u_in->size);
+
+	    /* XXX all registers should be flushed to baseblock
+	       here */
+	    uInstr2(cb, CCALL, 0, TempReg, u_in->val3, TempReg, t_size);
+	    uCCall(cb, (Addr) & eraser_mem_help_read_N, 2, 2, False);
+	    
+	    VG_(copy_UInstr)(cb, u_in);
+	    t_size = INVALID_TEMPREG;
+	    break;
+	 } 
+
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
          case SSE3a_MemRd:
diff --git a/include/vg_skin.h.base b/include/vg_skin.h.base
index 7070479..d316229 100644
--- a/include/vg_skin.h.base
+++ b/include/vg_skin.h.base
@@ -665,6 +665,15 @@
       MMX2_MemRd,
       MMX2_MemWr,
 
+      /* 3 bytes, reads mem.  Insns of the form
+         bbbbbbbb:mod mmxreg r/m:bbbbbbbb
+         Held in val1[15:0] and val2[7:0], and mod and rm are to be
+         replaced at codegen time by a reference to the Temp/RealReg
+         holding the address.  Arg3 holds this Temp/Real Reg.
+         Transfer is always at size 8.
+      */
+      MMX2a1_MemRd,
+
       /* 2 bytes, reads/writes an integer ("E") register.  Insns of the form
          bbbbbbbb:11 mmxreg ireg.
          Held in val1[15:0], and ireg is to be replaced
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index fc62ba9..7a2e05b 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1075,6 +1075,25 @@
             break;
          }
 
+         case MMX2a1_MemRd: {
+            Int t_size = INVALID_TEMPREG;
+
+            sk_assert(u_in->size == 8);
+
+            sk_assert(u_in->tag3 == TempReg);
+            uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val3));
+            uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val3));
+
+            t_size = newTemp(cb);
+            uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_size);
+            uLiteral(cb, u_in->size);
+            uInstr2(cb, CCALL, 0, TempReg, u_in->val3, TempReg, t_size);
+            uCCall(cb, (Addr) & MC_(fpu_read_check), 2, 2, False);
+            
+            VG_(copy_UInstr)(cb, u_in);
+            break;
+         }
+
 	 /* SSE ins referencing scalar integer registers */
          case SSE2g_RegWr:
          case SSE2g1_RegWr: