Implement XSAVE/XRSTOR for AVX (state components 0, 1 and 2)

Refactor existing FXSAVE / FXRSTOR implementation so as to use
the new code, since these are sub-cases of the general XSAVE/XRSTOR
functionality.
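
Illustrative sketch (not part of the patch): FXSAVE behaves like
XSAVE with RFBM (the requested-feature bitmap) fixed at 0b011
(x87 + SSE), so the front end reuses a single sequence generator:

   IRTemp rfbm = newTemp(Ity_I64);
   assign(rfbm, mkU64(3));          /* components 0 and 1 only */
   gen_XSAVE_SEQUENCE(addr, rfbm);  /* iropt folds out the AVX part */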

Add a new CPUID level to indicate CPUs which are AVX2 compatible,
and enable it by default on AVX2 compatible hosts.
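
Client-side sketch (assumed plumbing, per the usual VexArchInfo
setup in libvex.h; VEX_HWCAPS_AMD64_AVX2 is the new bit):

   VexArchInfo vai;
   LibVEX_default_VexArchInfo(&vai);
   vai.hwcaps = VEX_HWCAPS_AMD64_SSE3 | VEX_HWCAPS_AMD64_CX16
                | VEX_HWCAPS_AMD64_AVX | VEX_HWCAPS_AMD64_AVX2;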

For both the AVX and AVX2 simulated CPUIDs, claim that XSAVEOPT is not
supported, in an attempt to avoid having to implement it.
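
For reference, XSAVEOPT availability is advertised via
CPUID.(EAX=0Dh,ECX=1):EAX bit 0; a guest probing for it would do
roughly the following, and now sees 0:

   UInt a, b, c, d;
   __asm__ ("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                    : "a"(0xD), "c"(1));
   Bool has_xsaveopt = (a & 1) != 0;   /* 0 under these helpers */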

Remove the CPUID kludgery for OSX 10.10 (Yosemite) whose purpose was
to persuade it not to use XSAVE/XRSTOR; it is no longer needed now
that these insns are implemented.

libvex_ir.h: add new guarded load conversion "ILGop_IdentV128"
as required by XSAVE/XRSTOR support.
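
Usage sketch, as generated by the XRSTOR sequence below: load a V128
conditionally, falling back to the register's old value when the
guard is false:

   IRTemp t = newTemp(Ity_V128);
   stmt( IRStmt_LoadG(Iend_LE, ILGop_IdentV128,
                      t, ea, getXMMReg(reg)/*alt*/, guard) );
   putXMMReg(reg, mkexpr(t));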



git-svn-id: svn://svn.valgrind.org/vex/trunk@3169 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_amd64_defs.h b/priv/guest_amd64_defs.h
index 3b9e7ff..72cd7f7 100644
--- a/priv/guest_amd64_defs.h
+++ b/priv/guest_amd64_defs.h
@@ -168,13 +168,19 @@
 extern void  amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st );
 extern void  amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st );
 extern void  amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st );
+extern void  amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st );
 
 extern void  amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* );
 
-extern void      amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM
-                    ( VexGuestAMD64State*, HWord );
-extern VexEmNote amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM
-                    ( VexGuestAMD64State*, HWord );
+extern void amd64g_dirtyhelper_XSAVE_COMPONENT_0
+               ( VexGuestAMD64State* gst, HWord addr );
+extern void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS 
+               ( VexGuestAMD64State* gst, HWord addr );
+
+extern VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
+                    ( VexGuestAMD64State* gst, HWord addr );
+extern VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS 
+                    ( VexGuestAMD64State* gst, HWord addr );
 
 extern ULong amd64g_dirtyhelper_RDTSC ( void );
 extern void  amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st );
diff --git a/priv/guest_amd64_helpers.c b/priv/guest_amd64_helpers.c
index e3c3b73..3eeb8ae 100644
--- a/priv/guest_amd64_helpers.c
+++ b/priv/guest_amd64_helpers.c
@@ -1943,8 +1943,15 @@
 }
 
 
-static
-void do_fxsave ( VexGuestAMD64State* gst, HWord addr, Bool save_xmm_regs )
+/*---------------------------------------------------------------*/
+/*--- Supporting functions for XSAVE/FXSAVE.                  ---*/
+/*---------------------------------------------------------------*/
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+/* XSAVE component 0 is the x87 FPU state. */
+void amd64g_dirtyhelper_XSAVE_COMPONENT_0
+        ( VexGuestAMD64State* gst, HWord addr )
 {
    /* Derived from values obtained from
       vendor_id       : AuthenticAMD
@@ -1959,17 +1966,15 @@
    Fpu_State tmp;
    UShort*   addrS = (UShort*)addr;
    UChar*    addrC = (UChar*)addr;
-   UInt      mxcsr;
    UShort    fp_tags;
    UInt      summary_tags;
    Int       r, stno;
    UShort    *srcS, *dstS;
 
    do_get_x87( gst, (UChar*)&tmp );
-   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
 
-   /* Now build the proper fxsave image from the x87 image we just
-      made. */
+   /* Now build the proper fxsave x87 image from the fsave x87 image
+      we just made. */
 
    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
 addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
@@ -2002,11 +2007,8 @@
    addrS[10] = 0; /* BOGUS */
    addrS[11] = 0; /* BOGUS */
 
-   addrS[12] = toUShort(mxcsr);  /* MXCSR */
-   addrS[13] = toUShort(mxcsr >> 16);
-
-   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
-   addrS[15] = 0x0000; /* MXCSR mask (hi16) */
+   /* addrS[13,12] are MXCSR -- not written */
+   /* addrS[15,14] are MXCSR_MASK -- not written */
 
    /* Copy in the FP registers, in ST order. */
    for (stno = 0; stno < 8; stno++) {
@@ -2021,94 +2023,95 @@
       dstS[6] = 0;
       dstS[7] = 0;
    }
-
-   /* That's the first 160 bytes of the image done. */
-   if (save_xmm_regs == True) {
-      /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
-         big-endian, these need to be byte-swapped. */
-      U128 *xmm = (U128 *)(addr + 160);
-
-      vassert(host_is_little_endian());
-
-#     define COPY_U128(_dst,_src)                       \
-         do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
-              _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
-         while (0)
-
-      COPY_U128( xmm[0],  gst->guest_YMM0 );
-      COPY_U128( xmm[1],  gst->guest_YMM1 );
-      COPY_U128( xmm[2],  gst->guest_YMM2 );
-      COPY_U128( xmm[3],  gst->guest_YMM3 );
-      COPY_U128( xmm[4],  gst->guest_YMM4 );
-      COPY_U128( xmm[5],  gst->guest_YMM5 );
-      COPY_U128( xmm[6],  gst->guest_YMM6 );
-      COPY_U128( xmm[7],  gst->guest_YMM7 );
-      COPY_U128( xmm[8],  gst->guest_YMM8 );
-      COPY_U128( xmm[9],  gst->guest_YMM9 );
-      COPY_U128( xmm[10], gst->guest_YMM10 );
-      COPY_U128( xmm[11], gst->guest_YMM11 );
-      COPY_U128( xmm[12], gst->guest_YMM12 );
-      COPY_U128( xmm[13], gst->guest_YMM13 );
-      COPY_U128( xmm[14], gst->guest_YMM14 );
-      COPY_U128( xmm[15], gst->guest_YMM15 );
-#  undef COPY_U128
-   } else {
-      /* We let the generated IR to copy remaining %xmm0 .. %xmm15, so as to
-       make Memcheck's definedness flow for the non-XMM parts independent from
-       that of the all the other control and status words in the structure.
-       This avoids the false positives shown in #291310. */
-   }
 }
 
 
-static
-VexEmNote do_fxrstor ( VexGuestAMD64State* gst, HWord addr,
-                       Bool rstor_xmm_regs )
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+/* XSAVE component 1 is the SSE state. */
+void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS 
+        ( VexGuestAMD64State* gst, HWord addr )
+{
+   UShort* addrS = (UShort*)addr;
+   UInt    mxcsr;
+
+   /* The only non-register parts of the SSE state are MXCSR and
+      MXCSR_MASK. */
+   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
+
+   addrS[12] = toUShort(mxcsr);  /* MXCSR */
+   addrS[13] = toUShort(mxcsr >> 16);
+
+   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
+   addrS[15] = 0x0000; /* MXCSR mask (hi16) */
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+/* Do FXSAVE from the supplied VexGuestAMD64State structure and store
+   the result at the given address which represents a buffer of at
+   least 416 bytes.
+
+   This function is not called from generated code.  FXSAVE is dealt
+   with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
+   functions above plus some in-line IR.  This function is merely a
+   convenience function for VEX's users.
+*/
+void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
+                                /*OUT*/HWord fp_state )
+{
+   /* Do the x87 part */
+   amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
+
+   /* And now the SSE part, except for the registers themselves. */
+   amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
+
+   /* That's the first 160 bytes of the image done. */
+   /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
+      big-endian, these need to be byte-swapped. */
+   U128 *xmm = (U128 *)(fp_state + 160);
+   vassert(host_is_little_endian());
+
+#  define COPY_U128(_dst,_src)                       \
+      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
+           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
+      while (0)
+
+   COPY_U128( xmm[0],  gst->guest_YMM0 );
+   COPY_U128( xmm[1],  gst->guest_YMM1 );
+   COPY_U128( xmm[2],  gst->guest_YMM2 );
+   COPY_U128( xmm[3],  gst->guest_YMM3 );
+   COPY_U128( xmm[4],  gst->guest_YMM4 );
+   COPY_U128( xmm[5],  gst->guest_YMM5 );
+   COPY_U128( xmm[6],  gst->guest_YMM6 );
+   COPY_U128( xmm[7],  gst->guest_YMM7 );
+   COPY_U128( xmm[8],  gst->guest_YMM8 );
+   COPY_U128( xmm[9],  gst->guest_YMM9 );
+   COPY_U128( xmm[10], gst->guest_YMM10 );
+   COPY_U128( xmm[11], gst->guest_YMM11 );
+   COPY_U128( xmm[12], gst->guest_YMM12 );
+   COPY_U128( xmm[13], gst->guest_YMM13 );
+   COPY_U128( xmm[14], gst->guest_YMM14 );
+   COPY_U128( xmm[15], gst->guest_YMM15 );
+#  undef COPY_U128
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Supporting functions for XRSTOR/FXRSTOR.                ---*/
+/*---------------------------------------------------------------*/
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state, reads guest mem) */
+VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
+             ( VexGuestAMD64State* gst, HWord addr )
 {
    Fpu_State tmp;
-   VexEmNote warnX87 = EmNote_NONE;
-   VexEmNote warnXMM = EmNote_NONE;
    UShort*   addrS   = (UShort*)addr;
    UChar*    addrC   = (UChar*)addr;
    UShort    fp_tags;
    Int       r, stno, i;
 
-   if (rstor_xmm_regs == True) {
-      /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
-         to be byte-swapped. */
-      U128 *xmm = (U128 *)(addr + 160);
-
-      vassert(host_is_little_endian());
-
-#     define COPY_U128(_dst,_src)                       \
-         do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
-              _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
-         while (0)
-
-      COPY_U128( gst->guest_YMM0, xmm[0] );
-      COPY_U128( gst->guest_YMM1, xmm[1] );
-      COPY_U128( gst->guest_YMM2, xmm[2] );
-      COPY_U128( gst->guest_YMM3, xmm[3] );
-      COPY_U128( gst->guest_YMM4, xmm[4] );
-      COPY_U128( gst->guest_YMM5, xmm[5] );
-      COPY_U128( gst->guest_YMM6, xmm[6] );
-      COPY_U128( gst->guest_YMM7, xmm[7] );
-      COPY_U128( gst->guest_YMM8, xmm[8] );
-      COPY_U128( gst->guest_YMM9, xmm[9] );
-      COPY_U128( gst->guest_YMM10, xmm[10] );
-      COPY_U128( gst->guest_YMM11, xmm[11] );
-      COPY_U128( gst->guest_YMM12, xmm[12] );
-      COPY_U128( gst->guest_YMM13, xmm[13] );
-      COPY_U128( gst->guest_YMM14, xmm[14] );
-      COPY_U128( gst->guest_YMM15, xmm[15] );
-
-#  undef COPY_U128
-   } else {
-      /* Don't restore %xmm0 .. %xmm15, for the same reasons that
-         do_fxsave(save_xmm_regs = False) doesn't save them.  See
-         comment in that function for details. */
-   }
-
    /* Copy the x87 registers out of the image, into a temporary
       Fpu_State struct. */
    for (i = 0; i < 14; i++) tmp.env[i] = 0;
@@ -2137,16 +2140,75 @@
    tmp.env[FP_ENV_TAG] = fp_tags;
 
    /* Now write 'tmp' into the guest state. */
-   warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
+   VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
 
-   { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
-                | ((((UInt)addrS[13]) & 0xFFFF) << 16);
-     ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
+   return warnX87;
+}
 
-     warnXMM = (VexEmNote)(w64 >> 32);
 
-     gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
-   }
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state, reads guest mem) */
+VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
+             ( VexGuestAMD64State* gst, HWord addr )
+{
+   UShort* addrS = (UShort*)addr;
+   UInt    w32   = (((UInt)addrS[12]) & 0xFFFF)
+                   | ((((UInt)addrS[13]) & 0xFFFF) << 16);
+   ULong   w64   = amd64g_check_ldmxcsr( (ULong)w32 );
+
+   VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
+
+   gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
+   return warnXMM;
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+/* Do FXRSTOR from the supplied address, writing the values read into
+   the given VexGuestAMD64State structure.
+
+   This function is not called from generated code.  FXRSTOR is dealt
+   with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
+   functions above plus some in-line IR.  This function is merely a
+   convenience function for VEX's users.
+*/
+VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
+                                      /*MOD*/VexGuestAMD64State* gst )
+{
+   /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
+      to be byte-swapped. */
+   U128 *xmm = (U128 *)(fp_state + 160);
+
+   vassert(host_is_little_endian());
+
+#  define COPY_U128(_dst,_src)                       \
+      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
+           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
+      while (0)
+
+   COPY_U128( gst->guest_YMM0, xmm[0] );
+   COPY_U128( gst->guest_YMM1, xmm[1] );
+   COPY_U128( gst->guest_YMM2, xmm[2] );
+   COPY_U128( gst->guest_YMM3, xmm[3] );
+   COPY_U128( gst->guest_YMM4, xmm[4] );
+   COPY_U128( gst->guest_YMM5, xmm[5] );
+   COPY_U128( gst->guest_YMM6, xmm[6] );
+   COPY_U128( gst->guest_YMM7, xmm[7] );
+   COPY_U128( gst->guest_YMM8, xmm[8] );
+   COPY_U128( gst->guest_YMM9, xmm[9] );
+   COPY_U128( gst->guest_YMM10, xmm[10] );
+   COPY_U128( gst->guest_YMM11, xmm[11] );
+   COPY_U128( gst->guest_YMM12, xmm[12] );
+   COPY_U128( gst->guest_YMM13, xmm[13] );
+   COPY_U128( gst->guest_YMM14, xmm[14] );
+   COPY_U128( gst->guest_YMM15, xmm[15] );
+
+#  undef COPY_U128
+
+   VexEmNote warnXMM
+      = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
+   VexEmNote warnX87
+      = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
 
    /* Prefer an X87 emwarn over an XMM one, if both exist. */
    if (warnX87 != EmNote_NONE)
@@ -2156,24 +2218,9 @@
 }
 
 
-/* CALLED FROM GENERATED CODE */
-/* DIRTY HELPER (reads guest state, writes guest mem) */
-/* NOTE: only handles 32-bit format (no REX.W on the insn) */
-/* NOTE: does not save XMM registers - see do_fxsave() for details */
-void amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM ( VexGuestAMD64State* gst,
-                                                HWord addr )
-{
-   do_fxsave( gst, addr, False );
-}
-
-/* CALLED FROM GENERATED CODE */
-/* DIRTY HELPER (writes guest state, reads guest mem) */
-VexEmNote amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM ( VexGuestAMD64State* gst,
-                                                      HWord addr )
-{
-   return do_fxrstor( gst, addr, False );
-}
-
+/*---------------------------------------------------------------*/
+/*--- Supporting functions for FSAVE/FRSTOR.                  ---*/
+/*---------------------------------------------------------------*/
 
 /* DIRTY HELPER (writes guest state) */
 /* Initialise the x87 FPU state as per 'finit'. */
@@ -2465,28 +2512,9 @@
    return ew;
 }
 
-/* VISIBLE TO LIBVEX CLIENT */
-/* Do FXSAVE from the supplied VexGuestAMD64tate structure and store the
-   result at the given address which represents a buffer of at least 416
-   bytes. Saves also XMM registers. */
-void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
-                                /*OUT*/HWord fp_state )
-{
-   do_fxsave( gst, fp_state, True );
-}
-
-/* VISIBLE TO LIBVEX CLIENT */
-/* Do FXRSTOR from the supplied address and store read values to the given
-   VexGuestAMD64State structure. Restores also XMM registers. */
-VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
-                                      /*MOD*/VexGuestAMD64State* gst )
-{
-   return do_fxrstor( gst, fp_state, True );
-}
-
 
 /*---------------------------------------------------------------*/
-/*--- Misc integer helpers, including rotates and CPUID.      ---*/
+/*--- CPUID helpers.                                          ---*/
 /*---------------------------------------------------------------*/
 
 /* Claim to be the following CPU, which is probably representative of
@@ -2845,6 +2873,14 @@
 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
    capable.  Plus (kludge!) it "supports" HTM.
 
+   Also with the following change: claim that XSaveOpt is not
+   available, by making cpuid(eax=0xD,ecx=1).eax[0] return 0, compared
+   to 1 on the real CPU.  Consequently, programs that correctly observe
+   these CPUID values should only try to use 3 of the 8 XSave-family
+   instructions: XGETBV, XSAVE and XRSTOR.  In particular this avoids
+   having to implement the compacted or optimised save/restore
+   variants.
+
    vendor_id       : GenuineIntel
    cpu family      : 6
    model           : 42
@@ -2955,7 +2991,7 @@
          switch (old_ecx) {
             case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
                                       0x00000340, 0x00000000); break;
-            case 0x00000001: SET_ABCD(0x00000001, 0x00000000,
+            case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
                                       0x00000000, 0x00000000); break;
             case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
                                       0x00000000, 0x00000000); break;
@@ -3004,6 +3040,176 @@
 }
 
 
+/* Claim to be the following CPU (4 x ...), which is AVX2 capable.
+
+   With the following change: claim that XSaveOpt is not available, by
+   making cpuid(eax=0xD,ecx=1).eax[0] return 0, compared to 1 on the real
+   CPU.  Consequently, programs that correctly observe these CPUID
+   values should only try to use 3 of the 8 XSave-family instructions:
+   XGETBV, XSAVE and XRSTOR.  In particular this avoids having to
+   implement the compacted or optimised save/restore variants.
+
+   vendor_id       : GenuineIntel
+   cpu family      : 6
+   model           : 60
+   model name      : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
+   stepping        : 3
+   microcode       : 0x1c
+   cpu MHz         : 919.957
+   cache size      : 8192 KB
+   physical id     : 0
+   siblings        : 4
+   core id         : 3
+   cpu cores       : 4
+   apicid          : 6
+   initial apicid  : 6
+   fpu             : yes
+   fpu_exception   : yes
+   cpuid level     : 13
+   wp              : yes
+   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
+                     cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
+                     tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
+                     arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
+                     aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
+                     vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
+                     sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
+                     avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
+                     tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
+                     bmi1 avx2 smep bmi2 erms invpcid xsaveopt
+   bugs            :
+   bogomips        : 5786.68
+   clflush size    : 64
+   cache_alignment : 64
+   address sizes   : 39 bits physical, 48 bits virtual
+   power management:
+*/
+void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
+{
+#  define SET_ABCD(_a,_b,_c,_d)                \
+      do { st->guest_RAX = (ULong)(_a);        \
+           st->guest_RBX = (ULong)(_b);        \
+           st->guest_RCX = (ULong)(_c);        \
+           st->guest_RDX = (ULong)(_d);        \
+      } while (0)
+
+   UInt old_eax = (UInt)st->guest_RAX;
+   UInt old_ecx = (UInt)st->guest_RCX;
+
+   switch (old_eax) {
+      case 0x00000000:
+         SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
+         break;
+      case 0x00000001:
+         SET_ABCD(0x000306c3, 0x02100800, 0x7ffafbff, 0xbfebfbff);
+         break;
+      case 0x00000002:
+         SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
+         break;
+      case 0x00000003:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x00000004:
+         switch (old_ecx) {
+            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
+                                      0x0000003f, 0x00000000); break;
+            case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
+                                      0x0000003f, 0x00000000); break;
+            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
+                                      0x000001ff, 0x00000000); break;
+            case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
+                                      0x00001fff, 0x00000006); break;
+            default:         SET_ABCD(0x00000000, 0x00000000,
+                                      0x00000000, 0x00000000); break;
+         }
+         break;
+      case 0x00000005:
+         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
+         break;
+      case 0x00000006:
+         SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
+         break;
+      case 0x00000007:
+         switch (old_ecx) {
+            case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
+                                      0x00000000, 0x00000000); break;
+            default:         SET_ABCD(0x00000000, 0x00000000,
+                                      0x00000000, 0x00000000); break;
+         }
+         break;
+      case 0x00000008:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x00000009:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x0000000a:
+         SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
+         break;
+      case 0x0000000b:
+         switch (old_ecx) {
+            case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
+                                      0x00000100, 0x00000002); break;
+            case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
+                                      0x00000201, 0x00000002); break;
+            default:         SET_ABCD(0x00000000, 0x00000000,
+                                      old_ecx,    0x00000002); break;
+         }
+         break;
+      case 0x0000000c:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x0000000d:
+         switch (old_ecx) {
+            case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
+                                      0x00000340, 0x00000000); break;
+            case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
+                                      0x00000000, 0x00000000); break;
+            case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
+                                      0x00000000, 0x00000000); break;
+            default:         SET_ABCD(0x00000000, 0x00000000,
+                                      0x00000000, 0x00000000); break;
+         }
+         break;
+      case 0x80000000:
+         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x80000001:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
+         break;
+      case 0x80000002:
+         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
+         break;
+      case 0x80000003:
+         SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
+         break;
+      case 0x80000004:
+         SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
+         break;
+      case 0x80000005:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x80000006:
+         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
+         break;
+      case 0x80000007:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
+         break;
+      case 0x80000008:
+         SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      default:
+         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
+         break;
+   }
+#  undef SET_ABCD
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Misc integer helpers, including rotates and crypto.     ---*/
+/*---------------------------------------------------------------*/
+
 ULong amd64g_calculate_RCR ( ULong arg, 
                              ULong rot_amt, 
                              ULong rflags_in, 
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index 908b84e..a594832 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -349,6 +349,13 @@
    vpanic("doScalarWidening(amd64)");
 }
 
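+/* PUT |value| at guest state offset |gstOffB|, but only if |guard|
+   evaluates True at run time; otherwise re-PUT the old value, leaving
+   the guest state unchanged. */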
+static
+void putGuarded ( Int gstOffB, IRExpr* guard, IRExpr* value )
+{
+   IRType ty = typeOfIRExpr(irsb->tyenv, value);
+   stmt( IRStmt_Put(gstOffB,
+                    IRExpr_ITE(guard, value, IRExpr_Get(gstOffB, ty))) );
+}
 
 
 /*------------------------------------------------------------*/
@@ -5195,6 +5202,52 @@
 }
 
 
+/* Generate a dirty helper call that initialises the x87 state a la
+   FINIT.  If |guard| is NULL, it is done unconditionally.  Otherwise
+   |guard| is used as a guarding condition.
+*/
+static void gen_FINIT_SEQUENCE ( IRExpr* guard )
+{
+   /* Uses dirty helper: 
+         void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
+   IRDirty* d  = unsafeIRDirty_0_N ( 
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_FINIT", 
+                    &amd64g_dirtyhelper_FINIT,
+                    mkIRExprVec_1( IRExpr_BBPTR() )
+                 );
+
+   /* declare we're writing guest state */
+   d->nFxState = 5;
+   vex_bzero(&d->fxState, sizeof(d->fxState));
+
+   d->fxState[0].fx     = Ifx_Write;
+   d->fxState[0].offset = OFFB_FTOP;
+   d->fxState[0].size   = sizeof(UInt);
+
+   d->fxState[1].fx     = Ifx_Write;
+   d->fxState[1].offset = OFFB_FPREGS;
+   d->fxState[1].size   = 8 * sizeof(ULong);
+
+   d->fxState[2].fx     = Ifx_Write;
+   d->fxState[2].offset = OFFB_FPTAGS;
+   d->fxState[2].size   = 8 * sizeof(UChar);
+
+   d->fxState[3].fx     = Ifx_Write;
+   d->fxState[3].offset = OFFB_FPROUND;
+   d->fxState[3].size   = sizeof(ULong);
+
+   d->fxState[4].fx     = Ifx_Write;
+   d->fxState[4].offset = OFFB_FC3210;
+   d->fxState[4].size   = sizeof(ULong);
+
+   if (guard)
+      d->guard = guard;
+
+   stmt( IRStmt_Dirty(d) );
+}
+
+
 /* ------------------------------------------------------- */
 /* Given all that stack-mangling junk, we can now go ahead
    and describe FP instructions. 
@@ -6309,41 +6362,7 @@
                break;
 
             case 0xE3: {
-               /* Uses dirty helper: 
-                     void amd64g_do_FINIT ( VexGuestAMD64State* ) */
-               IRDirty* d  = unsafeIRDirty_0_N ( 
-                                0/*regparms*/, 
-                                "amd64g_dirtyhelper_FINIT", 
-                                &amd64g_dirtyhelper_FINIT,
-                                mkIRExprVec_1( IRExpr_BBPTR() )
-                             );
-
-               /* declare we're writing guest state */
-               d->nFxState = 5;
-               vex_bzero(&d->fxState, sizeof(d->fxState));
-
-               d->fxState[0].fx     = Ifx_Write;
-               d->fxState[0].offset = OFFB_FTOP;
-               d->fxState[0].size   = sizeof(UInt);
-
-               d->fxState[1].fx     = Ifx_Write;
-               d->fxState[1].offset = OFFB_FPREGS;
-               d->fxState[1].size   = 8 * sizeof(ULong);
-
-               d->fxState[2].fx     = Ifx_Write;
-               d->fxState[2].offset = OFFB_FPTAGS;
-               d->fxState[2].size   = 8 * sizeof(UChar);
-
-               d->fxState[3].fx     = Ifx_Write;
-               d->fxState[3].offset = OFFB_FPROUND;
-               d->fxState[3].size   = sizeof(ULong);
-
-               d->fxState[4].fx     = Ifx_Write;
-               d->fxState[4].offset = OFFB_FC3210;
-               d->fxState[4].size   = sizeof(ULong);
-
-               stmt( IRStmt_Dirty(d) );
-
+               gen_FINIT_SEQUENCE(NULL/*no guarding condition*/);
                DIP("fninit\n");
                break;
             }
@@ -9875,6 +9894,10 @@
    gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
 }
 
+static void gen_SEGV_if_not_64_aligned ( IRTemp effective_addr ) {
+   gen_SEGV_if_not_XX_aligned(effective_addr, 64-1);
+}
+
 /* Helper for deciding whether a given insn (starting at the opcode
    byte) may validly be used with a LOCK prefix.  The following insns
    may be used with LOCK when their destination operand is in memory.
@@ -11550,6 +11573,495 @@
 }
 
 
+static void gen_XSAVE_SEQUENCE ( IRTemp addr, IRTemp rfbm )
+{
+   /* ------ rfbm[0] gates the x87 state ------ */
+
+   /* Uses dirty helper: 
+         void amd64g_dirtyhelper_XSAVE_COMPONENT_0
+                 ( VexGuestAMD64State*, ULong )
+   */
+   IRDirty* d0 = unsafeIRDirty_0_N (
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_XSAVE_COMPONENT_0",
+                    &amd64g_dirtyhelper_XSAVE_COMPONENT_0,
+                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
+                 );
+   d0->guard = binop(Iop_CmpEQ64, binop(Iop_And64, mkexpr(rfbm), mkU64(1)),
+                     mkU64(1));
+
+   /* Declare we're writing memory.  Really, bytes 24 through 31
+      (MXCSR and MXCSR_MASK) aren't written, but we can't express more
+      than 1 memory area here, so just mark the whole thing as
+      written. */
+   d0->mFx   = Ifx_Write;
+   d0->mAddr = mkexpr(addr);
+   d0->mSize = 160;
+
+   /* declare we're reading guest state */
+   d0->nFxState = 5;
+   vex_bzero(&d0->fxState, sizeof(d0->fxState));
+
+   d0->fxState[0].fx     = Ifx_Read;
+   d0->fxState[0].offset = OFFB_FTOP;
+   d0->fxState[0].size   = sizeof(UInt);
+
+   d0->fxState[1].fx     = Ifx_Read;
+   d0->fxState[1].offset = OFFB_FPREGS;
+   d0->fxState[1].size   = 8 * sizeof(ULong);
+
+   d0->fxState[2].fx     = Ifx_Read;
+   d0->fxState[2].offset = OFFB_FPTAGS;
+   d0->fxState[2].size   = 8 * sizeof(UChar);
+
+   d0->fxState[3].fx     = Ifx_Read;
+   d0->fxState[3].offset = OFFB_FPROUND;
+   d0->fxState[3].size   = sizeof(ULong);
+
+   d0->fxState[4].fx     = Ifx_Read;
+   d0->fxState[4].offset = OFFB_FC3210;
+   d0->fxState[4].size   = sizeof(ULong);
+
+   stmt( IRStmt_Dirty(d0) );
+
+   /* ------ rfbm[1] gates the SSE state ------ */
+
+   IRTemp rfbm_1    = newTemp(Ity_I64);
+   IRTemp rfbm_1or2 = newTemp(Ity_I64);
+   assign(rfbm_1,    binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
+   assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
+
+   IRExpr* guard_1    = binop(Iop_CmpEQ64, mkexpr(rfbm_1),    mkU64(2));
+   IRExpr* guard_1or2 = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));
+
+   /* Uses dirty helper: 
+         void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
+                 ( VexGuestAMD64State*, ULong )
+      This creates only MXCSR and MXCSR_MASK.  We need to do this if
+      either component 1 (SSE) or component 2 (AVX) is requested.
+      Hence the
+      guard condition is a bit more complex.
+   */
+   IRDirty* d1 = unsafeIRDirty_0_N (
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS",
+                    &amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS,
+                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
+                 );
+   d1->guard = guard_1or2;
+
+   /* Declare we're writing memory: MXCSR and MXCSR_MASK.  Note that
+      the code for rfbm[0] just above claims a write of 0 .. 159, so
+      this duplicates it.  But it at least correctly connects 24 .. 31 to
+      the MXCSR guest state representation (SSEROUND field). */
+   d1->mFx   = Ifx_Write;
+   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
+   d1->mSize = 8;
+
+   /* declare we're reading guest state */
+   d1->nFxState = 1;
+   vex_bzero(&d1->fxState, sizeof(d1->fxState));
+
+   d1->fxState[0].fx     = Ifx_Read;
+   d1->fxState[0].offset = OFFB_SSEROUND;
+   d1->fxState[0].size   = sizeof(ULong);
+
+   /* Call the helper.  This creates MXCSR and MXCSR_MASK but nothing
+      else.  We do the actual register array, XMM[0..15], separately,
+      in order that any undefinedness in the XMM registers is tracked
+      separately by Memcheck and does not "infect" the in-memory
+      shadow for the other parts of the image. */
+   stmt( IRStmt_Dirty(d1) );
+
+   /* And now the XMMs themselves. */
+   UInt reg;
+   for (reg = 0; reg < 16; reg++) {
+      stmt( IRStmt_StoreG(
+               Iend_LE,
+               binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16)),
+               getXMMReg(reg),
+               guard_1
+      ));
+   }
+
+   /* ------ rfbm[2] gates the AVX state ------ */
+   /* Component 2 is just a bunch of register saves, so we'll do it
+      inline, just to be simple and to be Memcheck friendly. */
+
+   IRTemp rfbm_2 = newTemp(Ity_I64);
+   assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
+
+   IRExpr* guard_2 = binop(Iop_CmpEQ64, mkexpr(rfbm_2), mkU64(4));
+
+   for (reg = 0; reg < 16; reg++) {
+      stmt( IRStmt_StoreG(
+               Iend_LE,
+               binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16)),
+               getYMMRegLane128(reg,1),
+               guard_2
+      ));
+   }
+}
+
+
+static Long dis_XSAVE ( const VexAbiInfo* vbi,
+                        Prefix pfx, Long delta, Int sz )
+{
+   /* Note that the presence or absence of REX.W (indicated here by
+      |sz|) slightly affects the written format: whether the saved FPU
+      IP and DP pointers are 64 or 32 bits.  But the helper function
+      we call simply writes zero bits in the relevant fields, which
+      are 64 bits regardless of what REX.W is, and so it's good enough
+      (iow, equally broken) in both cases. */
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   UChar  modrm = getUChar(delta);
+   vassert(!epartIsReg(modrm)); /* ensured by caller */
+   vassert(sz == 4 || sz == 8); /* ditto */
+
+   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+   delta += alen;
+   gen_SEGV_if_not_64_aligned(addr);
+
+   DIP("%sxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+   /* VEX's caller is assumed to have checked this. */
+   const ULong aSSUMED_XCR0_VALUE = 7;
+
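+   /* RFBM (the requested-feature bitmap) is EDX:EAX of the insn,
+      masked with our assumed XCR0 value of 7 (x87, SSE, AVX). */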
+   IRTemp rfbm = newTemp(Ity_I64);
+   assign(rfbm,
+          binop(Iop_And64,
+                binop(Iop_Or64,
+                      binop(Iop_Shl64,
+                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
+                      unop(Iop_32Uto64, getIRegRAX(4))),
+                mkU64(aSSUMED_XCR0_VALUE)));
+
+   gen_XSAVE_SEQUENCE(addr, rfbm);
+
+   /* Finally, we need to update XSTATE_BV in the XSAVE header area, by
+      OR-ing the RFBM value into it. */
+   IRTemp addr_plus_512 = newTemp(Ity_I64);
+   assign(addr_plus_512, binop(Iop_Add64, mkexpr(addr), mkU64(512)));
+   storeLE( mkexpr(addr_plus_512),
+            binop(Iop_Or8,
+                  unop(Iop_64to8, mkexpr(rfbm)),
+                  loadLE(Ity_I8, mkexpr(addr_plus_512))) );
+
+   return delta;
+}
+
+
+static Long dis_FXSAVE ( const VexAbiInfo* vbi,
+                         Prefix pfx, Long delta, Int sz )
+{
+   /* See comment in dis_XSAVE about the significance of REX.W. */
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   UChar  modrm = getUChar(delta);
+   vassert(!epartIsReg(modrm)); /* ensured by caller */
+   vassert(sz == 4 || sz == 8); /* ditto */
+
+   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+   delta += alen;
+   gen_SEGV_if_not_16_aligned(addr);
+
+   DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+   /* FXSAVE is just XSAVE with components 0 and 1 selected.  Set rfbm
+      to 0b011, generate the XSAVE sequence accordingly, and let iropt
+      fold out the unused (AVX) parts. */
+   IRTemp rfbm = newTemp(Ity_I64);
+   assign(rfbm, mkU64(3));
+   gen_XSAVE_SEQUENCE(addr, rfbm);
+
+   return delta;
+}
+
+
+static void gen_XRSTOR_SEQUENCE ( IRTemp addr, IRTemp xstate_bv, IRTemp rfbm )
+{
+   /* ------ rfbm[0] gates the x87 state ------ */
+
+   /* If rfbm[0] == 1, we have to write the x87 state.  If
+      xstate_bv[0] == 1, we will read it from the memory image, else
+      we'll set it to initial values.  Doing this with a helper
+      function and getting the definedness flow annotations correct is
+      too difficult, so generate stupid but simple code: first set the
+      registers to initial values, regardless of xstate_bv[0].  Then,
+      conditionally restore from the memory image. */
+
+   IRTemp rfbm_0       = newTemp(Ity_I64);
+   IRTemp xstate_bv_0  = newTemp(Ity_I64);
+   IRTemp restore_0    = newTemp(Ity_I64);
+   assign(rfbm_0,      binop(Iop_And64, mkexpr(rfbm), mkU64(1)));
+   assign(xstate_bv_0, binop(Iop_And64, mkexpr(xstate_bv), mkU64(1)));
+   assign(restore_0,   binop(Iop_And64, mkexpr(rfbm_0), mkexpr(xstate_bv_0)));
+
+   gen_FINIT_SEQUENCE( binop(Iop_CmpNE64, mkexpr(rfbm_0), mkU64(0)) );
+
+   /* Uses dirty helper: 
+         VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
+                      ( VexGuestAMD64State*, ULong )
+      (the VexEmNote return value is simply ignored here)
+   */
+   IRDirty* d0 = unsafeIRDirty_0_N (
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_0",
+                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_0,
+                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
+                 );
+   d0->guard = binop(Iop_CmpNE64, mkexpr(restore_0), mkU64(0));
+
+   /* Declare we're reading memory.  Really, bytes 24 through 31
+      (MXCSR and MXCSR_MASK) aren't read, but we can't express more
+      than 1 memory area here, so just mark the whole thing as
+      read. */
+   d0->mFx   = Ifx_Read;
+   d0->mAddr = mkexpr(addr);
+   d0->mSize = 160;
+
+   /* declare we're writing guest state */
+   d0->nFxState = 5;
+   vex_bzero(&d0->fxState, sizeof(d0->fxState));
+
+   d0->fxState[0].fx     = Ifx_Write;
+   d0->fxState[0].offset = OFFB_FTOP;
+   d0->fxState[0].size   = sizeof(UInt);
+
+   d0->fxState[1].fx     = Ifx_Write;
+   d0->fxState[1].offset = OFFB_FPREGS;
+   d0->fxState[1].size   = 8 * sizeof(ULong);
+
+   d0->fxState[2].fx     = Ifx_Write;
+   d0->fxState[2].offset = OFFB_FPTAGS;
+   d0->fxState[2].size   = 8 * sizeof(UChar);
+
+   d0->fxState[3].fx     = Ifx_Write;
+   d0->fxState[3].offset = OFFB_FPROUND;
+   d0->fxState[3].size   = sizeof(ULong);
+
+   d0->fxState[4].fx     = Ifx_Write;
+   d0->fxState[4].offset = OFFB_FC3210;
+   d0->fxState[4].size   = sizeof(ULong);
+
+   stmt( IRStmt_Dirty(d0) );
+
+   /* ------ rfbm[1] gates the SSE state ------ */
+
+   /* Same scheme as component 0: first zero it out, and then possibly
+      restore from the memory area. */
+   IRTemp rfbm_1       = newTemp(Ity_I64);
+   IRTemp xstate_bv_1  = newTemp(Ity_I64);
+   IRTemp restore_1    = newTemp(Ity_I64);
+   assign(rfbm_1,      binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
+   assign(xstate_bv_1, binop(Iop_And64, mkexpr(xstate_bv), mkU64(2)));
+   assign(restore_1,   binop(Iop_And64, mkexpr(rfbm_1), mkexpr(xstate_bv_1)));
+   IRExpr* rfbm_1e     = binop(Iop_CmpNE64, mkexpr(rfbm_1),    mkU64(0));
+   IRExpr* restore_1e  = binop(Iop_CmpNE64, mkexpr(restore_1), mkU64(0));
+
+   IRTemp rfbm_1or2       = newTemp(Ity_I64);
+   IRTemp xstate_bv_1or2  = newTemp(Ity_I64);
+   IRTemp restore_1or2    = newTemp(Ity_I64);
+   assign(rfbm_1or2,      binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
+   assign(xstate_bv_1or2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(6)));
+   assign(restore_1or2,   binop(Iop_And64, mkexpr(rfbm_1or2),
+                                           mkexpr(xstate_bv_1or2)));
+   IRExpr* rfbm_1or2e     = binop(Iop_CmpNE64, mkexpr(rfbm_1or2),    mkU64(0));
+   IRExpr* restore_1or2e  = binop(Iop_CmpNE64, mkexpr(restore_1or2), mkU64(0));
+
+   /* The areas in question are: SSEROUND, and the XMM register array. */
+   putGuarded(OFFB_SSEROUND, rfbm_1or2e, mkU64(Irrm_NEAREST));
+
+   UInt reg;
+   for (reg = 0; reg < 16; reg++) {
+      putGuarded(xmmGuestRegOffset(reg), rfbm_1e, mkV128(0));
+   }
+
+   /* And now possibly restore from MXCSR/MXCSR_MASK */
+   /* Uses dirty helper: 
+         VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
+                      ( VexGuestAMD64State*, ULong )
+      (the VexEmNote return value is simply ignored here)
+      This restores from only MXCSR and MXCSR_MASK.  We need to do
+      this if either component 1 (SSE) or component 2 (AVX) is requested.
+      Hence the guard condition is a bit more complex.
+   */
+   IRDirty* d1 = unsafeIRDirty_0_N (
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS",
+                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS,
+                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
+                ) ;
+   d1->guard = restore_1or2e;
+
+   /* Declare we're reading memory: MXCSR and MXCSR_MASK.  Note that
+      the code for rfbm[0] just above claims a read of 0 .. 159, so
+      this duplicates it.  But it at least correctly connects 24 .. 31 to
+      the MXCSR guest state representation (SSEROUND field). */
+   d1->mFx   = Ifx_Read;
+   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
+   d1->mSize = 8;
+
+   /* declare we're writing guest state */
+   d1->nFxState = 1;
+   vex_bzero(&d1->fxState, sizeof(d1->fxState));
+
+   d1->fxState[0].fx     = Ifx_Write;
+   d1->fxState[0].offset = OFFB_SSEROUND;
+   d1->fxState[0].size   = sizeof(ULong);
+
+   /* Call the helper.  This creates SSEROUND but nothing
+      else.  We do the actual register array, XMM[0..15], separately,
+      in order that any undefinedness in the XMM registers is tracked
+      separately by Memcheck and does not "infect" the in-guest-state
+      shadow for the other parts of the image. */
+   stmt( IRStmt_Dirty(d1) );
+
+   /* And now the XMMs themselves.  For each register, we PUT either
+      its old value, or the value loaded from memory.  One convenient
+      way to do that is with a conditional load whose default value is
+      the old value of the register. */
+   for (reg = 0; reg < 16; reg++) {
+      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16));
+      IRExpr* alt = getXMMReg(reg);
+      IRTemp  loadedValue = newTemp(Ity_V128);
+      stmt( IRStmt_LoadG(Iend_LE,
+                         ILGop_IdentV128, 
+                         loadedValue, ea, alt, restore_1e) );
+      putXMMReg(reg, mkexpr(loadedValue));
+   }
+
+   /* ------ rfbm[2] gates the AVX state ------ */
+   /* Component 2 is just a bunch of register loads, so we'll do it
+      inline, just to be simple and to be Memcheck friendly. */
+
+   /* Same scheme as component 0: first zero it out, and then possibly
+      restore from the memory area. */
+   IRTemp rfbm_2      = newTemp(Ity_I64);
+   IRTemp xstate_bv_2 = newTemp(Ity_I64);
+   IRTemp restore_2   = newTemp(Ity_I64);
+   assign(rfbm_2,      binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
+   assign(xstate_bv_2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(4)));
+   assign(restore_2,   binop(Iop_And64, mkexpr(rfbm_2), mkexpr(xstate_bv_2)));
+
+   IRExpr* rfbm_2e    = binop(Iop_CmpNE64, mkexpr(rfbm_2),    mkU64(0));
+   IRExpr* restore_2e = binop(Iop_CmpNE64, mkexpr(restore_2), mkU64(0));
+
+   for (reg = 0; reg < 16; reg++) {
+      putGuarded(ymmGuestRegLane128offset(reg, 1), rfbm_2e, mkV128(0));
+   }
+
+   for (reg = 0; reg < 16; reg++) {
+      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16));
+      IRExpr* alt = getYMMRegLane128(reg, 1);
+      IRTemp  loadedValue = newTemp(Ity_V128);
+      stmt( IRStmt_LoadG(Iend_LE,
+                         ILGop_IdentV128, 
+                         loadedValue, ea, alt, restore_2e) );
+      putYMMRegLane128(reg, 1, mkexpr(loadedValue));
+   }
+}
+
+
+static Long dis_XRSTOR ( const VexAbiInfo* vbi,
+                         Prefix pfx, Long delta, Int sz )
+{
+   /* As with XSAVE above, we ignore the value of REX.W since we're
+      not bothering with the FPU DP and IP fields. */
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   UChar  modrm = getUChar(delta);
+   vassert(!epartIsReg(modrm)); /* ensured by caller */
+   vassert(sz == 4 || sz == 8); /* ditto */
+
+   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+   delta += alen;
+   gen_SEGV_if_not_64_aligned(addr);
+
+   DIP("%sxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+   /* VEX's caller is assumed to have checked this. */
+   const ULong aSSUMED_XCR0_VALUE = 7;
+
+   IRTemp rfbm = newTemp(Ity_I64);
+   assign(rfbm,
+          binop(Iop_And64,
+                binop(Iop_Or64,
+                      binop(Iop_Shl64,
+                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
+                      unop(Iop_32Uto64, getIRegRAX(4))),
+                mkU64(aSSUMED_XCR0_VALUE)));
+
+   IRTemp xstate_bv = newTemp(Ity_I64);
+   assign(xstate_bv, loadLE(Ity_I64,
+                            binop(Iop_Add64, mkexpr(addr), mkU64(512+0))));
+
+   IRTemp xcomp_bv = newTemp(Ity_I64);
+   assign(xcomp_bv, loadLE(Ity_I64,
+                           binop(Iop_Add64, mkexpr(addr), mkU64(512+8))));
+
+   IRTemp xsavehdr_23_16 = newTemp(Ity_I64);
+   assign( xsavehdr_23_16, 
+           loadLE(Ity_I64,
+                  binop(Iop_Add64, mkexpr(addr), mkU64(512+16))));
+
+   /* We must fault if 
+      * xcomp_bv[63] == 1, since this simulated CPU does not support
+        the compaction extension.
+      * xstate_bv sets a bit outside of XCR0 (which we assume to be 7).
+      * any of the xsave header bytes 23 .. 8 are nonzero.  This seems to
+        imply that xcomp_bv must be zero.
+      xcomp_bv is header bytes 15 .. 8 and xstate_bv is header bytes 7 .. 0
+   */
+   IRTemp fault_if_nonzero = newTemp(Ity_I64);
+   assign(fault_if_nonzero,
+          binop(Iop_Or64,
+                binop(Iop_And64, mkexpr(xstate_bv), mkU64(~aSSUMED_XCR0_VALUE)),
+                binop(Iop_Or64, mkexpr(xcomp_bv), mkexpr(xsavehdr_23_16))));
+   stmt( IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(fault_if_nonzero), mkU64(0)),
+                     Ijk_SigSEGV,
+                     IRConst_U64(guest_RIP_curr_instr),
+                     OFFB_RIP
+   ));
+
+   /* We are guaranteed now that both xstate_bv and rfbm are in the
+      range 0 .. 7.  Generate the restore sequence proper. */
+   gen_XRSTOR_SEQUENCE(addr, xstate_bv, rfbm);
+
+   return delta;
+}
+
+
+static Long dis_FXRSTOR ( const VexAbiInfo* vbi,
+                          Prefix pfx, Long delta, Int sz )
+{
+   /* As with FXSAVE above we ignore the value of REX.W since we're
+      not bothering with the FPU DP and IP fields. */
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   UChar  modrm = getUChar(delta);
+   vassert(!epartIsReg(modrm)); /* ensured by caller */
+   vassert(sz == 4 || sz == 8); /* ditto */
+
+   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+   delta += alen;
+   gen_SEGV_if_not_16_aligned(addr);
+
+   DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+   /* FXRSTOR is just XRSTOR with components 0 and 1 selected, and
+      with components 0 and 1 marked as present in XSTATE_BV in the
+      XSAVE header.  So set both rfbm and xstate_bv to 0b011,
+      generate the XRSTOR sequence accordingly, and let iropt fold
+      out the unused (AVX) parts. */
+   IRTemp three = newTemp(Ity_I64);
+   assign(three, mkU64(3));
+   gen_XRSTOR_SEQUENCE(addr, three/*xstate_bv*/, three/*rfbm*/);
+
+   return delta;
+}
+
+
 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
 {
    vassert(imm8 >= 0 && imm8 <= 7);
@@ -11794,6 +12306,7 @@
 __attribute__((noinline))
 static
 Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
+                        const VexArchInfo* archinfo,
                         const VexAbiInfo* vbi,
                         Prefix pfx, Int sz, Long deltaIN,
                         DisResult* dres )
@@ -13620,166 +14133,34 @@
          delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
          goto decode_success;
       }
-      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
-         Note that the presence or absence of REX.W slightly affects the
-         written format: whether the saved FPU IP and DP pointers are 64
-         or 32 bits.  But the helper function we call simply writes zero
-         bits in the relevant fields (which are 64 bits regardless of
-         what REX.W is) and so it's good enough (iow, equally broken) in
-         both cases. */
+      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
           && !epartIsReg(getUChar(delta))
           && gregOfRexRM(pfx,getUChar(delta)) == 0) {
-          IRDirty* d;
-         modrm = getUChar(delta);
-         vassert(!epartIsReg(modrm));
-
-         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
-         delta += alen;
-         gen_SEGV_if_not_16_aligned(addr);
-
-         DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
-
-         /* Uses dirty helper: 
-              void amd64g_do_FXSAVE_ALL_EXCEPT_XMM ( VexGuestAMD64State*,
-                                                     ULong ) */
-         d = unsafeIRDirty_0_N ( 
-                0/*regparms*/, 
-                "amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM",
-                &amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM,
-                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
-             );
-
-         /* declare we're writing memory */
-         d->mFx   = Ifx_Write;
-         d->mAddr = mkexpr(addr);
-         d->mSize = 464; /* according to recent Intel docs */
-
-         /* declare we're reading guest state */
-         d->nFxState = 6;
-         vex_bzero(&d->fxState, sizeof(d->fxState));
-
-         d->fxState[0].fx     = Ifx_Read;
-         d->fxState[0].offset = OFFB_FTOP;
-         d->fxState[0].size   = sizeof(UInt);
-
-         d->fxState[1].fx     = Ifx_Read;
-         d->fxState[1].offset = OFFB_FPREGS;
-         d->fxState[1].size   = 8 * sizeof(ULong);
-
-         d->fxState[2].fx     = Ifx_Read;
-         d->fxState[2].offset = OFFB_FPTAGS;
-         d->fxState[2].size   = 8 * sizeof(UChar);
-
-         d->fxState[3].fx     = Ifx_Read;
-         d->fxState[3].offset = OFFB_FPROUND;
-         d->fxState[3].size   = sizeof(ULong);
-
-         d->fxState[4].fx     = Ifx_Read;
-         d->fxState[4].offset = OFFB_FC3210;
-         d->fxState[4].size   = sizeof(ULong);
-
-         d->fxState[5].fx     = Ifx_Read;
-         d->fxState[5].offset = OFFB_SSEROUND;
-         d->fxState[5].size   = sizeof(ULong);
-
-         /* Call the helper.  This creates all parts of the in-memory
-            image except for the XMM[0..15] array, which we do
-            separately, in order that any undefinedness in the XMM
-            registers is tracked separately by Memcheck and does not
-            "infect" the in-memory shadow for the other parts of the
-            image (FPTOP, FPREGS, FPTAGS, FPROUND, FC3210,
-            SSEROUND). */
-         stmt( IRStmt_Dirty(d) );
-
-         /* And now the XMMs themselves. */
-         UInt xmm;
-         for (xmm = 0; xmm < 16; xmm++) {
-            storeLE( binop(Iop_Add64, mkexpr(addr), mkU64(160 + xmm * 16)),
-                     getXMMReg(xmm) );
-         }
-
+         delta = dis_FXSAVE(vbi, pfx, delta, sz);
          goto decode_success;
       }
-      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
-         As with FXSAVE above we ignore the value of REX.W since we're
-         not bothering with the FPU DP and IP fields. */
+      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
           && !epartIsReg(getUChar(delta))
           && gregOfRexRM(pfx,getUChar(delta)) == 1) {
-         IRDirty* d;
-         modrm = getUChar(delta);
-         vassert(!epartIsReg(modrm));
-
-         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
-         delta += alen;
-         gen_SEGV_if_not_16_aligned(addr);
-
-         DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
-
-         /* Uses dirty helper: 
-              VexEmNote amd64g_do_FXRSTOR_ALL_EXCEPT_XMM ( VexGuestAMD64State*,
-                                                           ULong )
-            NOTE:
-              the VexEmNote value is simply ignored
-         */
-         d = unsafeIRDirty_0_N ( 
-                0/*regparms*/, 
-                "amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM", 
-                &amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM,
-                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
-             );
-
-         /* declare we're reading memory */
-         d->mFx   = Ifx_Read;
-         d->mAddr = mkexpr(addr);
-         d->mSize = 464; /* according to recent Intel docs */
-
-         /* declare we're writing guest state */
-         d->nFxState = 6;
-         vex_bzero(&d->fxState, sizeof(d->fxState));
-
-         d->fxState[0].fx     = Ifx_Write;
-         d->fxState[0].offset = OFFB_FTOP;
-         d->fxState[0].size   = sizeof(UInt);
-
-         d->fxState[1].fx     = Ifx_Write;
-         d->fxState[1].offset = OFFB_FPREGS;
-         d->fxState[1].size   = 8 * sizeof(ULong);
-
-         d->fxState[2].fx     = Ifx_Write;
-         d->fxState[2].offset = OFFB_FPTAGS;
-         d->fxState[2].size   = 8 * sizeof(UChar);
-
-         d->fxState[3].fx     = Ifx_Write;
-         d->fxState[3].offset = OFFB_FPROUND;
-         d->fxState[3].size   = sizeof(ULong);
-
-         d->fxState[4].fx     = Ifx_Write;
-         d->fxState[4].offset = OFFB_FC3210;
-         d->fxState[4].size   = sizeof(ULong);
-
-         d->fxState[5].fx     = Ifx_Write;
-         d->fxState[5].offset = OFFB_SSEROUND;
-         d->fxState[5].size   = sizeof(ULong);
-
-         /* Call the helper.  This reads all parts of the in-memory
-            image except for the XMM[0..15] array, which we do
-            separately, in order that any undefinedness in the XMM
-            registers is tracked separately by Memcheck and does not
-            "infect" the in-guest-state shadow for the other parts of the
-            image (FPTOP, FPREGS, FPTAGS, FPROUND, FC3210,
-            SSEROUND). */
-         stmt( IRStmt_Dirty(d) );
-
-         /* And now the XMMs themselves. */
-         UInt xmm;
-         for (xmm = 0; xmm < 16; xmm++) {
-            putXMMReg(xmm, loadLE(Ity_V128,
-                                  binop(Iop_Add64, mkexpr(addr),
-                                                   mkU64(160 + xmm * 16))));
-         }
-
+         delta = dis_FXRSTOR(vbi, pfx, delta, sz);
+         goto decode_success;
+      }
+      /* 0F AE /4 = XSAVE mem -- write x87, SSE, AVX state to memory */
+      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
+          && !epartIsReg(getUChar(delta))
+          && gregOfRexRM(pfx,getUChar(delta)) == 4
+          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
+         delta = dis_XSAVE(vbi, pfx, delta, sz);
+         goto decode_success;
+      }
+      /* 0F AE /5 = XRSTOR mem -- read x87, SSE, AVX state from memory */
+      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
+          && !epartIsReg(getUChar(delta))
+          && gregOfRexRM(pfx,getUChar(delta)) == 5
+          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
+         delta = dis_XRSTOR(vbi, pfx, delta, sz);
          goto decode_success;
       }
       break;
@@ -21524,31 +21905,20 @@
       const HChar* fName = NULL;
       void*        fAddr = NULL;
 
-      /* JRS 2014-11-11: this a really horrible temp kludge to work
-         around the fact that the Yosemite (OSX 10.10)
-         /usr/lib/system/libdyld.dylib expects XSAVE/XRSTOR to be
-         implemented, because amd64g_dirtyhelper_CPUID_avx_and_cx16
-         claims they are supported, but so far they aren't.  So cause
-         it to fall back to a simpler CPU.  The cleaner approach of
-         setting CPUID(eax=1).OSXSAVE=0 and .XSAVE=0 isn't desirable
-         since it will (per the official Intel guidelines) lead to
-         software concluding that AVX isn't supported.
-
-         This is also a kludge in that putting these ifdefs here checks
-         the build (host) architecture, when really we're checking the
-         guest architecture. */
-      Bool this_is_yosemite = False;
-#     if defined(VGP_amd64_darwin) && DARWIN_VERS == DARWIN_10_10
-      this_is_yosemite = True;
-#     endif
-
       if (haveF2orF3(pfx)) goto decode_failure;
+
       /* This isn't entirely correct, CPUID should depend on the VEX
          capabilities, not on the underlying CPU. See bug #324882. */
-      if (!this_is_yosemite &&
-          (archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
+      if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
           (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
-          (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
+          (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX2)) {
+         fName = "amd64g_dirtyhelper_CPUID_avx2";
+         fAddr = &amd64g_dirtyhelper_CPUID_avx2;
+         /* This is a Core-i7-4910-like machine */
+      }
+      else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
+               (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
+               (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
          fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
          fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
          /* This is a Core-i5-2300-like machine */
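
The cascade now prefers the AVX2 CPUID image when the host advertises
VEX_HWCAPS_AMD64_AVX2 and falls back to the AVX image otherwise.  Per
the commit message, both images claim XSAVEOPT is absent; that bit
lives in CPUID leaf 0xD, sub-leaf 1, EAX bit 0.  A minimal sketch of
how a guest program could observe this -- <cpuid.h> and its
__cpuid_count macro are a GCC/Clang convenience, assumed here, not
part of the patch:

   #include <cpuid.h>

   /* Under the simulated AVX and AVX2 CPUIDs this returns 0. */
   static int guest_has_xsaveopt ( void )
   {
      unsigned int a, b, c, d;
      __cpuid_count(0xD, 1, a, b, c, d);  /* leaf 0xD, sub-leaf 1 */
      return a & 1u;                      /* EAX bit 0 = XSAVEOPT */
   }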
@@ -22050,7 +22420,8 @@
       facility in 64 bit mode. */
    {
       Bool decode_OK = False;
-      delta = dis_ESC_0F__SSE2 ( &decode_OK, vbi, pfx, sz, deltaIN, dres );
+      delta = dis_ESC_0F__SSE2 ( &decode_OK,
+                                 archinfo, vbi, pfx, sz, deltaIN, dres );
       if (decode_OK)
          return delta;
    }
diff --git a/priv/host_amd64_defs.c b/priv/host_amd64_defs.c
index 5e139e3..76c74c3 100644
--- a/priv/host_amd64_defs.c
+++ b/priv/host_amd64_defs.c
@@ -910,6 +910,28 @@
    vassert(sz == 4 || sz == 8 || sz == 16);
    return i;
 }
+AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
+                                   HReg src, AMD64AMode* addr )
+{
+   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag                = Ain_SseCStore;
+   i->Ain.SseCStore.cond = cond;
+   i->Ain.SseCStore.src  = src;
+   i->Ain.SseCStore.addr = addr;
+   vassert(cond != Acc_ALWAYS);
+   return i;
+}
+AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
+                                  AMD64AMode* addr, HReg dst )
+{
+   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag               = Ain_SseCLoad;
+   i->Ain.SseCLoad.cond = cond;
+   i->Ain.SseCLoad.addr = addr;
+   i->Ain.SseCLoad.dst  = dst;
+   vassert(cond != Acc_ALWAYS);
+   return i;
+}
 AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
 {
    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
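
Instruction selection builds these nodes directly.  A hedged usage
sketch with invented operands -- |vsrc| stands for some vector-class
HReg; the rest is existing API:

   /* store vsrc to 0(%rdi), but only if the Z condition holds */
   AMD64AMode* am = AMD64AMode_IR(0, hregAMD64_RDI());
   addInstr(env, AMD64Instr_SseCStore(Acc_Z, vsrc, am));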
@@ -1268,6 +1290,24 @@
             ppAMD64AMode(i->Ain.SseLdSt.addr);
          }
          return;
+      case Ain_SseCStore:
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.SseCStore.cond));
+         vex_printf("movups ");
+         ppHRegAMD64(i->Ain.SseCStore.src);
+         vex_printf(", ");
+         ppAMD64AMode(i->Ain.SseCStore.addr);
+         vex_printf(" }");
+         return;
+      case Ain_SseCLoad:
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.SseCLoad.cond));
+         vex_printf("movups ");
+         ppAMD64AMode(i->Ain.SseCLoad.addr);
+         vex_printf(", ");
+         ppHRegAMD64(i->Ain.SseCLoad.dst);
+         vex_printf(" }");
+         return;
       case Ain_SseLdzLO:
          vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
          ppAMD64AMode(i->Ain.SseLdzLO.addr);
@@ -1566,6 +1606,14 @@
          addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
                        i->Ain.SseLdSt.reg);
          return;
+      case Ain_SseCStore:
+         addRegUsage_AMD64AMode(u, i->Ain.SseCStore.addr);
+         addHRegUse(u, HRmRead, i->Ain.SseCStore.src);
+         return;
+      case Ain_SseCLoad:
+         addRegUsage_AMD64AMode(u, i->Ain.SseCLoad.addr);
+         addHRegUse(u, HRmModify, i->Ain.SseCLoad.dst);
+         return;
       case Ain_SseLdzLO:
          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
@@ -1799,6 +1847,14 @@
          mapReg(m, &i->Ain.SseLdSt.reg);
          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
          break;
+      case Ain_SseCStore:
+         mapRegs_AMD64AMode(m, i->Ain.SseCStore.addr);
+         mapReg(m, &i->Ain.SseCStore.src);
+         break;
+      case Ain_SseCLoad:
+         mapRegs_AMD64AMode(m, i->Ain.SseCLoad.addr);
+         mapReg(m, &i->Ain.SseCLoad.dst);
+         break;
       case Ain_SseLdzLO:
          mapReg(m, &i->Ain.SseLdzLO.reg);
          mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
@@ -2366,7 +2422,7 @@
    UChar* p = &buf[0];
    UChar* ptmp;
    Int    j;
-   vassert(nbuf >= 32);
+   vassert(nbuf >= 64);
    vassert(mode64 == True);
 
    /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
@@ -2823,13 +2879,33 @@
                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
                break;
             case RLPri_2Int:
-               vassert(0); //ATC
+               goto bad; //ATC
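+               // (the emission code below is currently unreachable;
+               //  retained against the day this case gets a test case)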
                // movabsq $0x5555555555555555, %rax
                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
                // movq %rax, %rdx
                *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
+               break;
+            case RLPri_V128SpRel:
+               if (i->Ain.Call.rloc.spOff == 0) {
+                  // We could accept any |spOff| here, but that's more
+                  // hassle and the only value we're ever going to get
+                  // is zero (I believe).  Hence take the easy path :)
+                  // We need a scratch register -- r11 can be it.
+                  // movabsq $0x5555555555555555, %r11
+                  *p++ = 0x49; *p++ = 0xBB;
+                  p = emit64(p, 0x5555555555555555ULL);
+                  // movq %r11, 0(%rsp)
+                  *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
+                  // movq %r11, 8(%rsp)
+                  *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
+                  *p++ = 0x08;
+                  break;
+               }
+               goto bad; //ATC for all other spOff values
+            case RLPri_V256SpRel:
+               goto bad; //ATC
             case RLPri_None: case RLPri_INVALID: default:
-               vassert(0);
+               vassert(0); // should never get here
          }
 
          // after:
@@ -3081,7 +3157,7 @@
    }
 
    case Ain_CStore: {
-      /* AFAICS this is identical to Ain_CStore except that the opcode
+      /* AFAICS this is identical to Ain_CLoad except that the opcode
          is 0x89 instead of 0x8B. */
       vassert(i->Ain.CStore.cond != Acc_ALWAYS);
 
@@ -3418,6 +3494,60 @@
                            i->Ain.SseLdSt.addr);
       goto done;
 
+   case Ain_SseCStore: {
+      vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
+
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* jmp fwds if !condition */
+      *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
+      ptmp = p; /* fill in this bit later */
+      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+
+      /* Now the store. */
+      *p++ = clearWBit(
+             rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
+                            i->Ain.SseCStore.addr));
+      *p++ = 0x0F; 
+      *p++ = toUChar(0x11);
+      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
+                           i->Ain.SseCStore.addr);
+
+      /* Fix up the conditional branch */
+      Int delta = p - ptmp;
+      vassert(delta > 0 && delta < 40);
+      *ptmp = toUChar(delta-1);
+      goto done;
+   }
+
+   case Ain_SseCLoad: {
+      vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
+
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* jmp fwds if !condition */
+      *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
+      ptmp = p; /* fill in this bit later */
+      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+
+      /* Now the load. */
+      *p++ = clearWBit(
+             rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
+                            i->Ain.SseCLoad.addr));
+      *p++ = 0x0F; 
+      *p++ = toUChar(0x10);
+      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
+                           i->Ain.SseCLoad.addr);
+
+      /* Fix up the conditional branch */
+      Int delta = p - ptmp;
+      vassert(delta > 0 && delta < 40);
+      *ptmp = toUChar(delta-1);
+      goto done;
+   }
+
    case Ain_SseLdzLO:
       vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
       /* movs[sd] amode, %xmm-dst */
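
Both conditional cases assemble to a short forward Jcc over an
unconditional movups, with the rel8 displacement backpatched through
ptmp.  For example, SseCStore with cond = Acc_Z, src = %xmm2 and
addr = (%rdi) comes out, in effect, as (bytes sketched from the
emitter above, not from a test run):

   75 04          jnz    .Lskip          ; 0x70 + (Acc_Z ^ 1), rel8
   40 0F 11 17    movups %xmm2, (%rdi)   ; REX with W cleared, 0F 11 /r
   .Lskip: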
@@ -3726,7 +3856,7 @@
    /*NOTREACHED*/
    
   done:
-   vassert(p - &buf[0] <= 32);
+   vassert(p - &buf[0] <= 64);
    return p - &buf[0];
 }
 
diff --git a/priv/host_amd64_defs.h b/priv/host_amd64_defs.h
index f76cd83..523d18d 100644
--- a/priv/host_amd64_defs.h
+++ b/priv/host_amd64_defs.h
@@ -390,6 +390,8 @@
       Ain_SseSDSS,     /* scalar float32 to/from float64 */
       Ain_SseLdSt,     /* SSE load/store 32/64/128 bits, no alignment
                           constraints, upper 96/64/0 bits arbitrary */
+      Ain_SseCStore,   /* SSE conditional store, 128 bit only, any alignment */
+      Ain_SseCLoad,    /* SSE conditional load, 128 bit only, any alignment */
       Ain_SseLdzLO,    /* SSE load low 32/64 bits, zero remainder of reg */
       Ain_Sse32Fx4,    /* SSE binary, 32Fx4 */
       Ain_Sse32FLo,    /* SSE binary, 32F in lowest lane only */
@@ -642,6 +644,16 @@
             AMD64AMode* addr;
          } SseLdSt;
          struct {
+            AMD64CondCode cond; /* may not be Acc_ALWAYS */
+            HReg          src;
+            AMD64AMode*   addr;
+         } SseCStore;
+         struct {
+            AMD64CondCode cond; /* may not be Acc_ALWAYS */
+            AMD64AMode*   addr;
+            HReg          dst;
+         } SseCLoad;
+         struct {
             Int         sz; /* 4 or 8 only */
             HReg        reg;
             AMD64AMode* addr;
@@ -751,6 +763,8 @@
 extern AMD64Instr* AMD64Instr_SseSF2SI   ( Int szS, Int szD, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_SseSDSS    ( Bool from64, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_SseLdSt    ( Bool isLoad, Int sz, HReg, AMD64AMode* );
+extern AMD64Instr* AMD64Instr_SseCStore  ( AMD64CondCode, HReg, AMD64AMode* );
+extern AMD64Instr* AMD64Instr_SseCLoad   ( AMD64CondCode, AMD64AMode*, HReg );
 extern AMD64Instr* AMD64Instr_SseLdzLO   ( Int sz, HReg, AMD64AMode* );
 extern AMD64Instr* AMD64Instr_Sse32Fx4   ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_Sse32FLo   ( AMD64SseOp, HReg, HReg );
diff --git a/priv/host_amd64_isel.c b/priv/host_amd64_isel.c
index 9d9d78e..8be498b 100644
--- a/priv/host_amd64_isel.c
+++ b/priv/host_amd64_isel.c
@@ -4298,21 +4298,35 @@
 
       UChar szB = 0; /* invalid */
       switch (lg->cvt) {
-         case ILGop_Ident32: szB = 4; break;
-         case ILGop_Ident64: szB = 8; break;
+         case ILGop_Ident32:   szB = 4;  break;
+         case ILGop_Ident64:   szB = 8;  break;
+         case ILGop_IdentV128: szB = 16; break;
          default: break;
       }
       if (szB == 0)
          goto stmt_fail;
 
-      AMD64AMode* amAddr = iselIntExpr_AMode(env, lg->addr);
-      HReg rAlt  = iselIntExpr_R(env, lg->alt);
-      HReg rDst  = lookupIRTemp(env, lg->dst);
+      AMD64AMode* amAddr
+         = iselIntExpr_AMode(env, lg->addr);
+      HReg rAlt
+         = szB == 16 ? iselVecExpr(env, lg->alt)
+                     : iselIntExpr_R(env, lg->alt);
+      HReg rDst
+         = lookupIRTemp(env, lg->dst);
+
       /* Get the alt value into the dst.  We'll do a conditional load
          which overwrites it -- or not -- with loaded data. */
-      addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
+      if (szB == 16) {
+         addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
+      } else {
+         addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
+      }
       AMD64CondCode cc = iselCondCode(env, lg->guard);
-      addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
+      if (szB == 16) {
+         addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
+      } else {
+         addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
+      }
       return;
    }
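
On the IR side, a front end can now express a guarded 128-bit load as
follows (a sketch; |t_addr|, |t_alt| and |t_guard| are invented temps
of types I64, V128 and I1 respectively):

   IRTemp dst = newTemp(Ity_V128);
   stmt( IRStmt_LoadG(Iend_LE, ILGop_IdentV128, dst,
                      mkexpr(t_addr), mkexpr(t_alt), mkexpr(t_guard)) );

which the code above selects into a vector move of the alt value
followed by an SseCLoad.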
 
@@ -4324,17 +4338,26 @@
 
       UChar szB = 0; /* invalid */
       switch (typeOfIRExpr(env->type_env, sg->data)) {
-         case Ity_I32: szB = 4; break;
-         case Ity_I64: szB = 8; break;
+         case Ity_I32:  szB = 4;  break;
+         case Ity_I64:  szB = 8;  break;
+         case Ity_V128: szB = 16; break;
          default: break;
       }
       if (szB == 0)
          goto stmt_fail;
 
-      AMD64AMode*   amAddr = iselIntExpr_AMode(env, sg->addr);
-      HReg          rSrc   = iselIntExpr_R(env, sg->data);
-      AMD64CondCode cc     = iselCondCode(env, sg->guard);
-      addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
+      AMD64AMode* amAddr
+         = iselIntExpr_AMode(env, sg->addr);
+      HReg rSrc
+         = szB == 16 ? iselVecExpr(env, sg->data)
+                     : iselIntExpr_R(env, sg->data);
+      AMD64CondCode cc
+         = iselCondCode(env, sg->guard);
+      if (szB == 16) {
+         addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
+      } else {
+         addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
+      }
       return;
    }
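
The guarded-store counterpart needs no conversion op at all (same
invented names as above):

   stmt( IRStmt_StoreG(Iend_LE, mkexpr(t_addr),
                       mkexpr(t_data) /* :: Ity_V128 */,
                       mkexpr(t_guard)) );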
 
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index 305e915..dd9d7ba 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -1470,13 +1470,14 @@
 void ppIRLoadGOp ( IRLoadGOp cvt )
 {
    switch (cvt) {
-      case ILGop_INVALID: vex_printf("ILGop_INVALID"); break;      
-      case ILGop_Ident64: vex_printf("Ident64"); break;      
-      case ILGop_Ident32: vex_printf("Ident32"); break;      
-      case ILGop_16Uto32: vex_printf("16Uto32"); break;      
-      case ILGop_16Sto32: vex_printf("16Sto32"); break;      
-      case ILGop_8Uto32:  vex_printf("8Uto32"); break;      
-      case ILGop_8Sto32:  vex_printf("8Sto32"); break;      
+      case ILGop_INVALID:   vex_printf("ILGop_INVALID"); break;      
+      case ILGop_IdentV128: vex_printf("IdentV128"); break;      
+      case ILGop_Ident64:   vex_printf("Ident64"); break;      
+      case ILGop_Ident32:   vex_printf("Ident32"); break;      
+      case ILGop_16Uto32:   vex_printf("16Uto32"); break;      
+      case ILGop_16Sto32:   vex_printf("16Sto32"); break;      
+      case ILGop_8Uto32:    vex_printf("8Uto32"); break;      
+      case ILGop_8Sto32:    vex_printf("8Sto32"); break;      
       default: vpanic("ppIRLoadGOp");
    }
 }
@@ -3525,6 +3526,8 @@
                        /*OUT*/IRType* t_res, /*OUT*/IRType* t_arg )
 {
    switch (cvt) {
+      case ILGop_IdentV128:
+         *t_res = Ity_V128; *t_arg = Ity_V128; break;
       case ILGop_Ident64:
          *t_res = Ity_I64; *t_arg = Ity_I64; break;
       case ILGop_Ident32:
diff --git a/priv/ir_opt.c b/priv/ir_opt.c
index 52cef9b..c9b2c3b 100644
--- a/priv/ir_opt.c
+++ b/priv/ir_opt.c
@@ -1264,6 +1264,7 @@
       case Iop_Xor64: return IRExpr_Const(IRConst_U64(0));
       case Iop_XorV128:
       case Iop_AndV128: return IRExpr_Const(IRConst_V128(0));
+      case Iop_XorV256:
       case Iop_AndV256: return IRExpr_Const(IRConst_V256(0));
       default: vpanic("mkZeroOfPrimopResultType: bad primop");
    }
@@ -2285,6 +2286,7 @@
             case Iop_Xor32:
             case Iop_Xor64:
             case Iop_XorV128:
+            case Iop_XorV256:
-               /* Xor8/16/32/64/V128(t,t) ==> 0, for some IRTemp t */
+               /* Xor8/16/32/64/V128/V256(t,t) ==> 0, for some IRTemp t */
                if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                   e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
@@ -2887,6 +2889,8 @@
       typeOfIRLoadGOp(lg->cvt, &cvtRes, &cvtArg);
       IROp cvtOp = Iop_INVALID;
       switch (lg->cvt) {
+         case ILGop_IdentV128:
+         case ILGop_Ident64:
          case ILGop_Ident32: break;
          case ILGop_8Uto32:  cvtOp = Iop_8Uto32;  break;
          case ILGop_8Sto32:  cvtOp = Iop_8Sto32;  break;
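
With Iop_XorV256 on both lists, 256-bit xor now folds like its
narrower siblings -- XorV256(t,t) for any IRTemp t becomes an
all-zeroes V256 constant -- and ILGop_IdentV128, like the other
identity conversions, needs no cvtOp when a guarded load whose guard
is known to be true is rewritten as a plain load.  Sketched in IR
notation:

   t8 = XorV256(t7,t7)   ==>   t8 = all-zeroes V256 constant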
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index 67edc8a..e61291c 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -2592,6 +2592,7 @@
 typedef
    enum {
       ILGop_INVALID=0x1D00,
+      ILGop_IdentV128, /* 128 bit vector, no conversion */
       ILGop_Ident64,   /* 64 bit, no conversion */
       ILGop_Ident32,   /* 32 bit, no conversion */
       ILGop_16Uto32,   /* 16 bit load, Z-widen to 32 */