Add support for FXSAVE/FXRSTOR (Tom Hughes).  Fixes #71180.

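FXSAVE and FXRSTOR transfer the complete x87/MMX/SSE state through a
single 512-byte memory operand.  The decoder now emits the existing
SSE2a_MemWr/SSE2a_MemRd uinstrs with size 512, the UInstr size field is
widened from UChar to UShort to hold that value, and the size-512 case
is accepted by the UInstr sanity checks and by the addrcheck, cachegrind
and memcheck instrumentation (cachegrind clamps the simulated access to
MIN_LINE_SIZE).

As an illustration only (not part of the patch), a hypothetical test
program, assuming GCC-style inline asm on x86, that executes FXSAVE and
FXRSTOR -- i.e. the kind of code this change lets Valgrind decode:

    /* fx.c -- hypothetical example; file name and code are illustrative */
    #include <stdio.h>

    int main(void)
    {
       /* FXSAVE/FXRSTOR operate on a 512-byte, 16-byte-aligned area. */
       static unsigned char state[512] __attribute__((aligned(16)));

       /* The store (fxsave) decodes to SSE2a_MemWr with size 512. */
       __asm__ __volatile__("fxsave (%0)"  : : "r" (state) : "memory");
       /* The load (fxrstor) decodes to SSE2a_MemRd with size 512. */
       __asm__ __volatile__("fxrstor (%0)" : : "r" (state) : "memory");

       printf("saved and restored %u bytes of FPU/MMX/SSE state\n",
              (unsigned)sizeof state);
       return 0;
    }

Running such a program under memcheck or addrcheck exercises the new
512-byte fpu_read/fpu_write/fpu_access paths.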

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@2183 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/addrcheck/ac_main.c b/addrcheck/ac_main.c
index 74acac0..044f917 100644
--- a/addrcheck/ac_main.c
+++ b/addrcheck/ac_main.c
@@ -907,7 +907,7 @@
       return;
    }
 
-   if (size == 16 || size == 10 || size == 28 || size == 108) {
+   if (size == 16 || size == 10 || size == 28 || size == 108 || size == 512) {
       PROF_EVENT(94);
       ac_fpu_ACCESS_check_SLOWLY ( addr, size, isWrite );
       return;
@@ -1055,8 +1055,8 @@
             helper = (Addr)ac_fpu_WRITE_check;
 	    goto do_Access_ARG3;
          do_Access_ARG3:
-	    sk_assert(u_in->size == 4 
-                      || u_in->size == 8 || u_in->size == 16);
+	    sk_assert(u_in->size == 4 || u_in->size == 8
+                      || u_in->size == 16 || u_in->size == 512);
             sk_assert(u_in->tag3 == TempReg);
             t_addr = u_in->val3;
             t_size = newTemp(cb);
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index 0cbeaab..6264abf 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -544,7 +544,7 @@
 
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
-            sk_assert(u_in->size == 4 || u_in->size == 16);
+            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
             t_read = u_in->val3;
             is_FPU_R = True;
             break;
@@ -577,7 +577,7 @@
             break;
 
          case SSE2a_MemWr:
-            sk_assert(u_in->size == 4 || u_in->size == 16);
+            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
             t_write = u_in->val3;
             is_FPU_W = True;
             break;
@@ -798,11 +798,16 @@
 
          case SSE2a_MemRd:
          case SSE2a1_MemRd:
-            sk_assert(u_in->size == 4 || u_in->size == 16);
+            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
             t_read = u_in->val3;
             t_read_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
-            data_size = u_in->size;
+            /* 512 B data-sized instructions are handled inaccurately,
+             * but they're very rare, and clamping the size avoids errors
+             * from hitting more than two cache lines in the simulation. */
+            data_size = ( u_in->size <= MIN_LINE_SIZE
+                        ? u_in->size
+                        : MIN_LINE_SIZE);
             VG_(copy_UInstr)(cb, u_in);
             break;
 
@@ -856,14 +861,19 @@
             break;
 
          case SSE2a_MemWr:
-            sk_assert(u_in->size == 4 || u_in->size == 16);
+            sk_assert(u_in->size == 4 || u_in->size == 16 || u_in->size == 512);
            /* fall through */
          case SSE3a_MemWr:
-            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16);
+            sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
             t_write = u_in->val3;
             t_write_addr = newTemp(cb);
             uInstr2(cb, MOV, 4, TempReg, u_in->val3, TempReg, t_write_addr);
-            data_size = u_in->size;
+            /* 512 B data-sized instructions are handled inaccurately,
+             * but they're very rare, and clamping the size avoids errors
+             * from hitting more than two cache lines in the simulation. */
+            data_size = ( u_in->size <= MIN_LINE_SIZE
+                        ? u_in->size
+                        : MIN_LINE_SIZE);
             VG_(copy_UInstr)(cb, u_in);
             break;
 
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index d83862d..7be34b1 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -4075,7 +4075,7 @@
 
       case SSE2a_MemWr:
       case SSE2a_MemRd:
-         vg_assert(u->size == 4 || u->size == 16);
+         vg_assert(u->size == 4 || u->size == 16 || u->size == 512);
          vg_assert(u->tag1 == Lit16);
          vg_assert(u->tag2 == Lit16);
          vg_assert(u->tag3 == RealReg);
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index 35d1911..e7c48de 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -3545,8 +3545,27 @@
    if (VG_(have_ssestate)) {
    UChar* insn = (UChar*)eip;
 
+   /* FXSAVE/FXRSTOR m512byte -- save/restore the FPU/MMX/SSE state. */
+   if (insn[0] == 0x0F && insn[1] == 0xAE 
+       && (!epartIsReg(insn[2]))
+       && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
+      Bool store = gregOfRM(insn[2]) == 0;
+      vg_assert(sz == 4);
+      pair = disAMode ( cb, sorb, eip+2, dis?dis_buf:NULL );
+      t1   = LOW24(pair);
+      eip += 2+HI8(pair);
+      uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
+                  Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
+                  Lit16, (UShort)insn[2],
+                  TempReg, t1 );
+      if (dis)
+         VG_(printf)("fx%s %s\n", store ? "save" : "rstor", dis_buf );
+      goto decode_success;
+   }
+
    /* STMXCSR/LDMXCSR m32 -- load/store the MXCSR register. */
    if (insn[0] == 0x0F && insn[1] == 0xAE 
+       && (!epartIsReg(insn[2]))
        && (gregOfRM(insn[2]) == 3 || gregOfRM(insn[2]) == 2) ) {
       Bool store = gregOfRM(insn[2]) == 3;
       vg_assert(sz == 4);
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 408a707..0a231c1 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -414,7 +414,8 @@
 #  define SZ42 (u->size == 4 || u->size == 2)
 #  define SZ48 (u->size == 4 || u->size == 8)
 #  define SZ416 (u->size == 4 || u->size == 16)
-#  define SZsse (u->size == 4 || u->size == 8 || u->size == 16)
+#  define SZsse2 (u->size == 4 || u->size == 16 || u->size == 512)
+#  define SZsse3 (u->size == 4 || u->size == 8 || u->size == 16)
 #  define SZi  (u->size == 4 || u->size == 2 || u->size == 1)
 #  define SZf  (  u->size ==  4 || u->size ==  8 || u->size ==   2     \
                || u->size == 10 || u->size == 28 || u->size == 108)
@@ -563,22 +564,22 @@
    case MMX2_ERegWr: return LIT0 && SZ4  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
 
    /* Fields checked:        lit32   size  flags_r/w tag1   tag2   tag3    (rest) */
-   case SSE2a_MemWr:  return LIT0 && SZ416 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE2a_MemRd:  return LIT0 && SZ416 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE2a1_MemRd: return LIT0 && SZ416 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3a_MemWr:  return LIT0 && SZsse && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3a_MemRd:  return LIT0 && SZsse && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3e_RegRd:  return LIT0 && SZ4   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3e_RegWr:  return LIT0 && SZ4   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3a1_MemRd: return LIT8 && SZ16  && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3g_RegWr:  return LIT0 && SZ4   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3g1_RegWr: return LIT8 && SZ4   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3e1_RegRd: return LIT8 && SZ2   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3:         return LIT0 && SZ0   && CCa  && Ls1 && Ls2 && N3  && XOTHER;
-   case SSE4:         return LIT0 && SZ0   && CCa  && Ls1 && Ls2 && N3  && XOTHER;
-   case SSE5:         return LIT0 && SZ0   && CC0  && Ls1 && Ls2 && Ls3 && XOTHER;
+   case SSE2a_MemWr:  return LIT0 && SZsse2 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE2a_MemRd:  return LIT0 && SZsse2 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE2a1_MemRd: return LIT0 && SZ416  && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3a_MemWr:  return LIT0 && SZsse3 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3a_MemRd:  return LIT0 && SZsse3 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3e_RegRd:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3e_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3a1_MemRd: return LIT8 && SZ16   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3g_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3g1_RegWr: return LIT8 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3e1_RegRd: return LIT8 && SZ2    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3:         return LIT0 && SZ0    && CCa  && Ls1 && Ls2 && N3  && XOTHER;
+   case SSE4:         return LIT0 && SZ0    && CCa  && Ls1 && Ls2 && N3  && XOTHER;
+   case SSE5:         return LIT0 && SZ0    && CC0  && Ls1 && Ls2 && Ls3 && XOTHER;
    case SSE3ag_MemRd_RegWr:
-                      return         SZ48  && CC0  && TR1 && TR2 && N3  && XOTHER;
+                      return         SZ48   && CC0  && TR1 && TR2 && N3  && XOTHER;
    default: 
       if (VG_(needs).extended_UCode)
          return SK_(sane_XUInstr)(beforeRA, beforeLiveness, u);
@@ -602,7 +603,8 @@
 #  undef SZ42
 #  undef SZ48
 #  undef SZ416
-#  undef SZsse
+#  undef SZsse2
+#  undef SZsse3
 #  undef SZi
 #  undef SZf
 #  undef SZ4m
diff --git a/include/vg_skin.h.base b/include/vg_skin.h.base
index 4ef7a63..52fc2c8 100644
--- a/include/vg_skin.h.base
+++ b/include/vg_skin.h.base
@@ -960,7 +960,7 @@
       /* word 3 */
       UShort  val3;       /* third operand */
       UChar   opcode;     /* opcode */
-      UChar   size;       /* data transfer size */
+      UShort  size;       /* data transfer size */
 
       /* word 4 */
       FlagSet flags_r;    /* :: FlagSet */
diff --git a/memcheck/mac_needs.c b/memcheck/mac_needs.c
index 315847f..a3ab823 100644
--- a/memcheck/mac_needs.c
+++ b/memcheck/mac_needs.c
@@ -732,19 +732,19 @@
    81   fpu_read aligned 4
    82   fpu_read aligned 8
    83   fpu_read 2
-   84   fpu_read 10/28/108
+   84   fpu_read 10/28/108/512
 
 M  85   fpu_write
 M  86   fpu_write aligned 4
 M  87   fpu_write aligned 8
 M  88   fpu_write 2
-M  89   fpu_write 10/28/108
+M  89   fpu_write 10/28/108/512
 
    90   fpu_access
    91   fpu_access aligned 4
    92   fpu_access aligned 8
    93   fpu_access 2
-   94   fpu_access 10/28/108
+   94   fpu_access 10/28/108/512
 
    100  fpu_access_check_SLOWLY
    101  fpu_access_check_SLOWLY(byte loop)
diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c
index 7ee467c..3cc44de 100644
--- a/memcheck/mc_main.c
+++ b/memcheck/mc_main.c
@@ -1190,7 +1190,7 @@
    }
 
    if (size == 16 /*SSE*/ 
-       || size == 10 || size == 28 || size == 108) {
+       || size == 10 || size == 28 || size == 108 || size == 512) {
       PROF_EVENT(84);
       mc_fpu_read_check_SLOWLY ( addr, size );
       return;
@@ -1273,7 +1273,7 @@
    }
 
    if (size == 16 /*SSE*/ 
-       || size == 10 || size == 28 || size == 108) {
+       || size == 10 || size == 28 || size == 108 || size == 512) {
       PROF_EVENT(89);
       mc_fpu_write_check_SLOWLY ( addr, size );
       return;
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 1b3599b..d21bb86 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1112,8 +1112,8 @@
             Bool is_load;
             Int t_size;
 
-            sk_assert(u_in->size == 4 
-                      || u_in->size == 8 || u_in->size == 16);
+            sk_assert(u_in->size == 4 || u_in->size == 8
+                      || u_in->size == 16 || u_in->size == 512);
 
             t_size = INVALID_TEMPREG;
             is_load = u_in->opcode==SSE2a_MemRd