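Yet more SSE insns: CVTSD2SI and CVTSS2SI, memory-operand forms of CVT{T}S{S,D}2SI (via the SSE3ag_MemRd_RegWr uinstr), and XORPD.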
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1696 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/addrcheck/ac_main.c b/addrcheck/ac_main.c
index 92c80d2..407d001 100644
--- a/addrcheck/ac_main.c
+++ b/addrcheck/ac_main.c
@@ -961,6 +961,20 @@
             VG_(copy_UInstr)(cb, u_in);
             break;
 
+	 case SSE3ag_MemRd_RegWr:
+            sk_assert(u_in->size == 4 || u_in->size == 8);
+	    goto do_Access_ARG1;
+         do_Access_ARG1:
+	    sk_assert(u_in->tag1 == TempReg);
+            t_addr = u_in->val1;
+            t_size = newTemp(cb);
+	    uInstr2(cb, MOV, 4, Literal, 0, TempReg, t_size);
+	    uLiteral(cb, u_in->size);
+            uInstr2(cb, CCALL, 0, TempReg, t_addr, TempReg, t_size);
+            uCCall(cb, (Addr) & ac_fpu_ACCESS_check, 2, 2, False );
+            VG_(copy_UInstr)(cb, u_in);
+            break;
+
          case MMX2_MemRd:
          case MMX2_MemWr:
             sk_assert(u_in->size == 4 || u_in->size == 8);
@@ -999,15 +1013,14 @@
 
 	    //         case SSE2a1_MemRd:
 	    //         case SSE2a1_MemWr:
-         case SSE3g1_RegWr:
          case SSE3g1_RegRd:
-	   //         case SSE3ag_MemRd_RegWr:
 	   //         case SSE3a1_MemRd:
 	   //         case SSE3a1_MemWr:
 	    VG_(pp_UInstr)(0,u_in);
 	    VG_(skin_panic)("AddrCheck: unhandled SSE uinstr");
 	    break;
 
+         case SSE3g1_RegWr:
          case SSE5:
          case SSE3g_RegWr:
          case SSE3g_RegRd:
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index af63fa2..8b699e6 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1563,11 +1563,11 @@
 }
 
 static void emit_SSE4 ( FlagSet uses_sflags, 
-                         FlagSet sets_sflags,
-                         UChar first_byte, 
-                         UChar second_byte, 
-			 UChar third_byte,
-                         UChar fourth_byte )
+                        FlagSet sets_sflags,
+                        UChar first_byte, 
+                        UChar second_byte, 
+		        UChar third_byte,
+                        UChar fourth_byte )
 {
    VG_(new_emit)(True, uses_sflags, sets_sflags);
    VG_(emitB) ( first_byte );
@@ -1581,12 +1581,12 @@
 }
 
 static void emit_SSE5 ( FlagSet uses_sflags, 
-                         FlagSet sets_sflags,
-                         UChar first_byte, 
-                         UChar second_byte, 
-			 UChar third_byte,
-			 UChar fourth_byte,
-			 UChar fifth_byte )
+                        FlagSet sets_sflags,
+                        UChar first_byte, 
+                        UChar second_byte, 
+			UChar third_byte,
+			UChar fourth_byte,
+			UChar fifth_byte )
 {
    VG_(new_emit)(True, uses_sflags, sets_sflags);
    VG_(emitB) ( first_byte );
@@ -1602,10 +1602,10 @@
 }
 
 static void emit_SSE3 ( FlagSet uses_sflags, 
-                         FlagSet sets_sflags,
-                         UChar first_byte, 
-                         UChar second_byte, 
-                         UChar third_byte )
+                        FlagSet sets_sflags,
+                        UChar first_byte, 
+                        UChar second_byte, 
+                        UChar third_byte )
 {
    VG_(new_emit)(True, uses_sflags, sets_sflags);
    VG_(emitB) ( first_byte );
@@ -1617,6 +1617,28 @@
                   (UInt)third_byte );
 }
 
+static void emit_SSE3ag_MemRd_RegWr ( FlagSet uses_sflags, 
+                                      FlagSet sets_sflags,
+                                      UChar first_byte, 
+                                      UChar second_byte, 
+                                      UChar third_byte,
+				      Int addr_reg,
+				      Int dest_reg )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   VG_(emitB) ( third_byte );
+   /* The original modrm (4th) byte is not emitted; a fresh one is
+      synthesised from addr_reg (the address) and dest_reg. */
+   emit_amode_regmem_reg ( addr_reg, dest_reg );
+   if (dis)
+      VG_(printf)("\n\t\tsse-0x%x:0x%x:0x%x(addr=%s, dest=%s)\n", 
+                  (UInt)first_byte, (UInt)second_byte, 
+            	  (UInt)third_byte, nameIReg(4, addr_reg), 
+                                    nameIReg(4, dest_reg));
+}
+
 static void emit_MMX2_reg_to_mmxreg ( FlagSet uses_sflags, 
                                       FlagSet sets_sflags,
 			              UChar first_byte, 
@@ -3944,6 +3966,23 @@
                      u->val2 & 0xFF );
          break;
 
+      case SSE3ag_MemRd_RegWr:
+         vg_assert(u->size == 4 || u->size == 8);
+         vg_assert(u->tag1 == RealReg);
+         vg_assert(u->tag2 == RealReg);
+         vg_assert(u->tag3 == NoValue);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE3ag_MemRd_RegWr ( u->flags_r, u->flags_w,
+                                   (u->lit32 >> 24) & 0xFF,
+                                   (u->lit32 >> 16) & 0xFF,
+                                   (u->lit32 >> 8) & 0xFF,
+				   u->val1, u->val2 );
+	 break;
+
       default: 
          if (VG_(needs).extended_UCode) {
 	    if (*sselive) {
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index d7cc5e2..fa8a7da 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -3659,19 +3659,24 @@
       goto decode_success;
    }
 
-   /* CVTTSD2SI (0xF2) -- convert a double-precision float value in
-      memory or xmm reg to int and put it in an ireg. */
-   /* CVTTSS2SI (0xF3) -- convert a single-precision float value in
-      memory or xmm reg to int and put it in an ireg. */
+   /* CVTTSD2SI (0xF2,0x0F,0x2C) -- convert a double-precision float value in
+      memory or xmm reg to int and put it in an ireg.  Truncate. */
+   /* CVTTSS2SI (0xF3,0x0F,0x2C) -- convert a single-precision float value in
+      memory or xmm reg to int and put it in an ireg.  Truncate. */
+   /* CVTSD2SI (0xF2,0x0F,0x2D) -- convert a double-precision float value in
+      memory or xmm reg to int and put it in an ireg.  Round as per MXCSR. */
+   /* CVTSS2SI (0xF3,0x0F,0x2D) -- convert a single-precision float value in
+      memory or xmm reg to int and put it in an ireg.  Round as per MXCSR. */
    if ((insn[0] == 0xF2 || insn[0] == 0xF3)
-       && insn[1] == 0x0F && insn[2] == 0x2C) {
+       && insn[1] == 0x0F 
+       && (insn[2] == 0x2C || insn[2] == 0x2D)) {
       vg_assert(sz == 4);
       modrm = insn[3];
       if (epartIsReg(modrm)) {
          /* We're moving a value in an xmm reg to an ireg. */
          eip += 4;
 	 t1 = newTemp(cb);
-         /* sz is 4 for both CVTTSD2SI and CVTTSS2SI. */
+         /* sz is 4 for all 4 insns. */
          vg_assert(epartIsReg(modrm));
          uInstr3(cb, SSE3g_RegWr, 4,
                      Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
@@ -3679,27 +3684,34 @@
                      TempReg, t1 );
          uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
 	 if (dis)
-            VG_(printf)("cvtts{s,d}2si %s, %s\n", 
+            VG_(printf)("cvt{t}s{s,d}2si %s, %s\n", 
                         nameXMMReg(eregOfRM(modrm)),
                         nameIReg(4,gregOfRM(modrm)) );
       } else {
-#if 0
          /* So, we're reading memory and writing an ireg.  This calls
-            for the ultra-horrible SSE3ag_MemRd_RegWr uinstr. */
+            for the ultra-horrible SSE3ag_MemRd_RegWr uinstr.  We
+            can't do it by a roundabout route because the insn does
+            a float-to-int conversion on the way, which we need to
+            have happen too.  So our only choice is to re-emit a
+            suitably rehashed version of the instruction. */
+ 	 /* Destination ireg is GREG.  Address goes as EREG as
+	    usual. */
          t1 = newTemp(cb); /* t1 holds value on its way to ireg */
          pair = disAMode ( cb, sorb, eip+3, dis?dis_buf:NULL );
          t2   = LOW24(pair); /* t2 holds addr */
          eip += 3+HI8(pair);
-         uInstr2(cb, SSE3ag_MemRd_RegWr, 8,
+         uInstr2(cb, SSE3ag_MemRd_RegWr, insn[0]==0xF2 ? 8 : 4,
                      TempReg, t2, /* address */
                      TempReg, t1 /* dest */);
-         uLiteral(cb, (((UInt)insn[0]) << 16)
-                      | (((UInt)insn[1]) << 8)
-                      | ((UInt)insn[2]) );
+         uLiteral(cb  , (((UInt)insn[0]) << 24)
+                      | (((UInt)insn[1]) << 16)
+                      | (((UInt)insn[2]) << 8) 
+                      | ((UInt)modrm) );
          uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
-	 /* PRINTING CODE */
-#endif
-         VG_(core_panic)("CVTTSD2SI mem");
+	 if (dis)
+            VG_(printf)("cvt{t}s{s,d}2si %s, %s\n", 
+                        dis_buf,
+                        nameIReg(4,gregOfRM(modrm)) );
       }
       goto decode_success;
    }
@@ -3839,6 +3851,14 @@
       goto decode_success;
    }
 
+   /* XORPD (src)xmmreg-or-mem, (dst)xmmreg */
+   if (sz == 2
+       && insn[0] == 0x0F && insn[1] == 0x57) {
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "xorpd",
+                                      0x66, insn[0], insn[1] );
+      goto decode_success;
+   }
+
    /* PXOR (src)xmmreg-or-mem, (dst)xmmreg */
    if (sz == 2
        && insn[0] == 0x0F && insn[1] == 0xEF) {
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 48b4f2d..7f9570b 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -570,6 +570,8 @@
    case SSE3:         return LIT0 && SZ0   && CC0  && Ls1 && Ls2 && N3  && XOTHER;
    case SSE4:         return LIT0 && SZ0   && CCf  && Ls1 && Ls2 && N3  && XOTHER;
    case SSE5:         return LIT0 && SZ0   && CC0  && Ls1 && Ls2 && Ls3 && XOTHER;
+   case SSE3ag_MemRd_RegWr:
+                      return         SZ48  && CC0  && TR1 && TR2 && N3  && XOTHER;
    default: 
       if (VG_(needs).extended_UCode)
          return SK_(sane_XUInstr)(beforeRA, beforeLiveness, u);
@@ -893,6 +895,7 @@
       case SSE5:        return "SSE5";
       case SSE3a_MemWr: return "SSE3a_MWr";
       case SSE3a_MemRd: return "SSE3a_MRd";
+      case SSE3ag_MemRd_RegWr: return "SSE3ag_MemRd_RegWr";
       default:
          if (VG_(needs).extended_UCode)
             return SK_(name_XUOpcode)(opc);
@@ -1094,6 +1097,14 @@
                      u->val3 & 0xFF );
          break;
 
+      case SSE3ag_MemRd_RegWr:
+	 VG_(printf)("0x%x(addr=", u->lit32 );
+	 VG_(pp_UOperand)(u, 1, 4, False);
+	 VG_(printf)(", dst=");
+	 VG_(pp_UOperand)(u, 2, 4, False);
+         VG_(printf)(")");
+         break;
+
       case GET: case PUT: case MOV: case LOAD: case STORE: case CMOV:
       case GETSEG: case PUTSEG:
          VG_(pp_UOperand)(u, 1, u->size, u->opcode==LOAD); 
@@ -1249,6 +1260,8 @@
       case SSE3g1_RegWr:
       case SSE3g_RegWr: WR(3); break;
 
+      case SSE3ag_MemRd_RegWr: RD(1); WR(2); break;
+
       case MMX2_RegRd: RD(2); break;
       case MMX2_RegWr: WR(2); break;
 
@@ -1408,7 +1421,7 @@
       case SSE3a_MemWr: case SSE3a_MemRd:
       case SSE3g_RegRd: case SSE3g_RegWr: 
       case SSE3g1_RegWr: case SSE3g1_RegRd:
-      case SSE4: case SSE3: case SSE5:
+      case SSE4: case SSE3: case SSE5: case SSE3ag_MemRd_RegWr:
       case WIDEN:
       /* GETSEG and USESEG are to do with ArchRegS, not ArchReg */
       case GETSEG: case PUTSEG: 
@@ -2298,7 +2311,7 @@
    UChar*      final_code;
    UCodeBlock* cb;
    Bool        notrace_until_done;
-   UInt        notrace_until_limit = 23000;
+   UInt        notrace_until_limit = 53000;
 
    VGP_PUSHCC(VgpTranslate);
    debugging_translation
diff --git a/include/vg_skin.h b/include/vg_skin.h
index e0570d6..6e37a98 100644
--- a/include/vg_skin.h
+++ b/include/vg_skin.h
@@ -662,12 +662,11 @@
       */
       SSE3g_RegRd,
 
-#if 0
       /* 4 bytes, reads memory, writes an integer register, but is
          nevertheless an SSE insn.  The insn is of the form
          bbbbbbbb:bbbbbbbb:bbbbbbbb:mod ireg rm where mod indicates
          memory (ie is not 11b) and ireg is the int reg written.  The
-         first 3 bytes are held in lit32[23:0] since there is
+         first 4 bytes are held in lit32[31:0] since there is
          insufficient space elsewhere.  mod and rm are to be replaced
          at codegen time by a reference to the Temp/RealReg holding
          the address.  Arg1 holds this Temp/RealReg.  ireg is to be
@@ -675,10 +674,13 @@
          RealReg in which the answer is to be written.  Arg2 holds
          this Temp/RealReg.  Transfer to the destination reg is always
          at size 4.  However the memory read can be at sizes 4 or 8
-         and so this is what the sz field holds.
+         and so this is what the sz field holds.  Note that the 4th
+         byte of the instruction (the modrm byte) is redundant, but we
+         store it anyway so as to be consistent with all other SSE
+         uinstrs.
       */
       SSE3ag_MemRd_RegWr,
-#endif
+
       /* 5 bytes, no memrefs, no iregdefs, copy exactly to the
          output.  Held in val1[15:0], val2[15:0] and val3[7:0]. */
       SSE5,