diff --git a/priv/guest-x86/ghelpers.c b/priv/guest-x86/ghelpers.c
index 855d753..06b6088 100644
--- a/priv/guest-x86/ghelpers.c
+++ b/priv/guest-x86/ghelpers.c
@@ -1581,6 +1581,20 @@
    vex_state->guest_FPROUND = (UInt)Irrm_NEAREST;
    vex_state->guest_FC3210  = 0;
 
+   vex_state->guest_SSEROUND = (UInt)Irrm_NEAREST;
+
+   /* Zero the 4 UInt lanes of an XMM reg; expands to exactly one statement. */
+#  define SSEZERO(_xmm) do { (_xmm)[0]=(_xmm)[1]=(_xmm)[2]=(_xmm)[3]=0; } while (0)
+   SSEZERO(vex_state->guest_XMM0);
+   SSEZERO(vex_state->guest_XMM1);
+   SSEZERO(vex_state->guest_XMM2);
+   SSEZERO(vex_state->guest_XMM3);
+   SSEZERO(vex_state->guest_XMM4);
+   SSEZERO(vex_state->guest_XMM5);
+   SSEZERO(vex_state->guest_XMM6);
+   SSEZERO(vex_state->guest_XMM7);
+#  undef SSEZERO
+
    vex_state->guest_CS = 0;
    vex_state->guest_DS = 0;
    vex_state->guest_ES = 0;
diff --git a/priv/guest-x86/toIR.c b/priv/guest-x86/toIR.c
index 6dd1aed..409cc24 100644
--- a/priv/guest-x86/toIR.c
+++ b/priv/guest-x86/toIR.c
@@ -103,26 +103,48 @@
 /*--- Offsets of various parts of the x86 guest state.     ---*/
 /*------------------------------------------------------------*/
 
-#define OFFB_FPREGS   offsetof(VexGuestX86State,guest_FPREG[0])
-#define OFFB_FPTAGS   offsetof(VexGuestX86State,guest_FPTAG[0])
-#define OFFB_EAX      offsetof(VexGuestX86State,guest_EAX)
-#define OFFB_EBX      offsetof(VexGuestX86State,guest_EBX)
-#define OFFB_ECX      offsetof(VexGuestX86State,guest_ECX)
-#define OFFB_EDX      offsetof(VexGuestX86State,guest_EDX)
-#define OFFB_EIP      offsetof(VexGuestX86State,guest_EIP)
+#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
+#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
+#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
+#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
+#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
+#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
+#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
+#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
 
-#define OFFB_CC_OP    offsetof(VexGuestX86State,guest_CC_OP)
-#define OFFB_CC_DEP1  offsetof(VexGuestX86State,guest_CC_DEP1)
-#define OFFB_CC_DEP2  offsetof(VexGuestX86State,guest_CC_DEP2)
-#define OFFB_CC_NDEP  offsetof(VexGuestX86State,guest_CC_NDEP)
+#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
 
-#define OFFB_DFLAG    offsetof(VexGuestX86State,guest_DFLAG)
-#define OFFB_IDFLAG   offsetof(VexGuestX86State,guest_IDFLAG)
-#define OFFB_FTOP     offsetof(VexGuestX86State,guest_FTOP)
-#define OFFB_FC3210   offsetof(VexGuestX86State,guest_FC3210)
-#define OFFB_FPROUND  offsetof(VexGuestX86State,guest_FPROUND)
+#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
+#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
+#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
+#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
 
-#define OFFB_EMWARN   offsetof(VexGuestX86State,guest_EMWARN)
+#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
+#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
+#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
+#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
+#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
+#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
+#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
+
+#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
+#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
+#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
+#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
+#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
+#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
+
+#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
+#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
+#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
+#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
+#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
+#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
+#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
+#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
+#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
+
+#define OFFB_EMWARN    offsetof(VexGuestX86State,guest_EMWARN)
 
 
 /*------------------------------------------------------------*/
@@ -461,24 +483,24 @@
 
    if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
       switch (archreg) {
-         case R_EAX: return offsetof(VexGuestX86State,guest_EAX);
-         case R_EBX: return offsetof(VexGuestX86State,guest_EBX);
-         case R_ECX: return offsetof(VexGuestX86State,guest_ECX);
-         case R_EDX: return offsetof(VexGuestX86State,guest_EDX);
-         case R_ESI: return offsetof(VexGuestX86State,guest_ESI);
-         case R_EDI: return offsetof(VexGuestX86State,guest_EDI);
-         case R_ESP: return offsetof(VexGuestX86State,guest_ESP);
-         case R_EBP: return offsetof(VexGuestX86State,guest_EBP);
+         case R_EAX: return OFFB_EAX;
+         case R_EBX: return OFFB_EBX;
+         case R_ECX: return OFFB_ECX;
+         case R_EDX: return OFFB_EDX;
+         case R_ESI: return OFFB_ESI;
+         case R_EDI: return OFFB_EDI;
+         case R_ESP: return OFFB_ESP;
+         case R_EBP: return OFFB_EBP;
          default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
       }
    }
 
    vassert(archreg >= 4 && archreg < 8 && sz == 1);
    switch (archreg-4) {
-      case R_EAX: return 1+ offsetof(VexGuestX86State,guest_EAX);
-      case R_EBX: return 1+ offsetof(VexGuestX86State,guest_EBX);
-      case R_ECX: return 1+ offsetof(VexGuestX86State,guest_ECX);
-      case R_EDX: return 1+ offsetof(VexGuestX86State,guest_EDX);
+      case R_EAX: return 1+ OFFB_EAX;
+      case R_EBX: return 1+ OFFB_EBX;
+      case R_ECX: return 1+ OFFB_ECX;
+      case R_EDX: return 1+ OFFB_EDX;
       default: vpanic("integerGuestRegOffset(x86,le)(1h)");
    }
 
@@ -489,16 +511,31 @@
 static Int segmentGuestRegOffset ( UInt sreg )
 {
    switch (sreg) {
-      case R_ES: return offsetof(VexGuestX86State,guest_ES);
-      case R_CS: return offsetof(VexGuestX86State,guest_CS);
-      case R_SS: return offsetof(VexGuestX86State,guest_SS);
-      case R_DS: return offsetof(VexGuestX86State,guest_DS);
-      case R_FS: return offsetof(VexGuestX86State,guest_FS);
-      case R_GS: return offsetof(VexGuestX86State,guest_GS);
+      case R_ES: return OFFB_ES;
+      case R_CS: return OFFB_CS;
+      case R_SS: return OFFB_SS;
+      case R_DS: return OFFB_DS;
+      case R_FS: return OFFB_FS;
+      case R_GS: return OFFB_GS;
       default: vpanic("segmentGuestRegOffset(x86)");
    }
 }
 
+/* Map an XMM register number (0 .. 7) to the byte offset of the
+   matching guest_XMMn field of VexGuestX86State.  Any other register
+   number ends in vpanic. */
+static Int xmmGuestRegOffset ( UInt xmmreg )
+{
+   static const Int xmm_offsets[8]
+      = { OFFB_XMM0, OFFB_XMM1, OFFB_XMM2, OFFB_XMM3,
+          OFFB_XMM4, OFFB_XMM5, OFFB_XMM6, OFFB_XMM7 };
+
+   if (xmmreg > 7)
+      vpanic("xmmGuestRegOffset");
+
+   return xmm_offsets[xmmreg];
+}
+
 static IRExpr* getIReg ( Int sz, UInt archreg )
 {
    vassert(sz == 1 || sz == 2 || sz == 4);
@@ -510,8 +547,10 @@
 /* Ditto, but write to a reg instead. */
 static void putIReg ( Int sz, UInt archreg, IRExpr* e )
 {
+   IRType ty = typeOfIRExpr(irbb->tyenv, e);  /* IR type of the value being stored */
    vassert(sz == 1 || sz == 2 || sz == 4);
    vassert(archreg < 8);
+   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8); /* NB: not cross-checked against sz */
    stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
 }
 
@@ -528,6 +567,17 @@
 }
 #endif
 
+/* Get / put an entire guest XMM register, always as an Ity_V128 value. */
+static IRExpr* getXMMReg ( UInt xmmreg )
+{
+   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
+}
+static void putXMMReg ( UInt xmmreg, IRExpr* e )
+{
+   vassert(typeOfIRExpr(irbb->tyenv,e) == Ity_V128);
+   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
+}
+
 static void assign ( IRTemp dst, IRExpr* e )
 {
    stmt( IRStmt_Tmp(dst, e) );
@@ -1258,33 +1308,33 @@
 
 
 
-static Char* nameGrp1 ( Int opc_aux )
+static HChar* nameGrp1 ( Int opc_aux )
 {
-   static Char* grp1_names[8] 
+   static HChar* grp1_names[8] 
      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
    return grp1_names[opc_aux];
 }
 
-static Char* nameGrp2 ( Int opc_aux )
+static HChar* nameGrp2 ( Int opc_aux )
 {
-   static Char* grp2_names[8] 
+   static HChar* grp2_names[8] 
      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
    return grp2_names[opc_aux];
 }
 
-static Char* nameGrp4 ( Int opc_aux )
+static HChar* nameGrp4 ( Int opc_aux )
 {
-   static Char* grp4_names[8] 
+   static HChar* grp4_names[8] 
      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
    return grp4_names[opc_aux];
 }
 
-static Char* nameGrp5 ( Int opc_aux )
+static HChar* nameGrp5 ( Int opc_aux )
 {
-   static Char* grp5_names[8] 
+   static HChar* grp5_names[8] 
      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
    return grp5_names[opc_aux];
@@ -1298,14 +1348,14 @@
 //--    return grp8_names[opc_aux];
 //-- }
 
-static Char* nameIReg ( Int size, Int reg )
+static HChar* nameIReg ( Int size, Int reg )
 {
-   static Char* ireg32_names[8] 
+   static HChar* ireg32_names[8] 
      = { "%eax", "%ecx", "%edx", "%ebx", 
          "%esp", "%ebp", "%esi", "%edi" };
-   static Char* ireg16_names[8] 
+   static HChar* ireg16_names[8] 
      = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
-   static Char* ireg8_names[8] 
+   static HChar* ireg8_names[8] 
      = { "%al", "%cl", "%dl", "%bl", 
          "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
    if (reg < 0 || reg > 7) goto bad;
@@ -1319,7 +1369,7 @@
    return NULL; /*notreached*/
 }
 
-static Char* nameSReg ( UInt sreg )
+static HChar* nameSReg ( UInt sreg )
 {
    switch (sreg) {
       case R_ES: return "%es";
@@ -1332,21 +1382,22 @@
    }
 }
 
-static Char* nameMMXReg ( Int mmxreg )
+static HChar* nameMMXReg ( Int mmxreg )
 {
-   static Char* mmx_names[8] 
+   static HChar* mmx_names[8] 
      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
    return mmx_names[mmxreg];
 }
 
-//-- const Char* VG_(name_of_xmm_reg) ( Int xmmreg )
-//-- {
-//--    static const Char* xmm_names[8] 
-//--      = { "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
-//--    if (xmmreg < 0 || xmmreg > 7) VG_(core_panic)("name_of_xmm_reg");
-//--    return xmm_names[xmmreg];
-//-- }
+/* Printable name of XMM register 0 .. 7; any other number is a panic. */
+static HChar* nameXMMReg ( Int xmmreg )
+{
+   static HChar* xmm_names[8] = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+                                  "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
+   if (xmmreg < 0 || xmmreg > 7) vpanic("nameXMMReg(x86,guest)");
+   return xmm_names[xmmreg];
+}
  
 static Char* nameMMXGran ( UChar gran )
 {
@@ -1855,7 +1906,7 @@
                    UInt        delta0,
                    Char*       t_x86opc )
 {
-   UChar   dis_buf[50];
+   HChar   dis_buf[50];
    Int     len;
    IRType  ty   = szToITy(size);
    IRTemp  dst1 = newTemp(ty);
@@ -1964,7 +2015,7 @@
                    UInt        delta0,
                    Char*       t_x86opc )
 {
-   UChar   dis_buf[50];
+   HChar   dis_buf[50];
    Int     len;
    IRType  ty   = szToITy(size);
    IRTemp  dst1 = newTemp(ty);
@@ -2068,7 +2119,7 @@
 {
    Int len;
    UChar rm = getIByte(delta0);
-   UChar dis_buf[50];
+   HChar dis_buf[50];
 
    if (epartIsReg(rm)) {
       putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
@@ -2112,7 +2163,7 @@
 {
    Int len;
    UChar rm = getIByte(delta0);
-   UChar dis_buf[50];
+   HChar dis_buf[50];
 
    if (epartIsReg(rm)) {
       putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
@@ -2186,7 +2237,7 @@
    /* E refers to memory */    
    {
       Int    len;
-      UChar  dis_buf[50];
+      HChar  dis_buf[50];
       IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
 
       putIReg(szd, gregOfRM(rm),
@@ -2299,7 +2350,7 @@
                 Int am_sz, Int d_sz, Int sz, UInt d32 )
 {
    Int     len;
-   UChar   dis_buf[50];
+   HChar   dis_buf[50];
    IRType  ty   = szToITy(sz);
    IRTemp  dst1 = newTemp(ty);
    IRTemp  src  = newTemp(ty);
@@ -2382,7 +2433,7 @@
                 Char* shift_expr_txt )
 {
    /* delta on entry points at the modrm byte. */
-   UChar  dis_buf[50];
+   HChar  dis_buf[50];
    Int    len;
    Bool   isShift, isRotate, isRotateRC;
    IRType ty    = szToITy(sz);
@@ -2635,7 +2686,7 @@
 //--       And eip on entry points at the modrm byte. */
 //--    Int   t1, t2, t_fetched, t_mask;
 //--    UInt  pair;
-//--    Char  dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UInt  v_mask;
 //-- 
 //--    /* There is no 1-byte form of this instruction, AFAICS. */
@@ -2776,10 +2827,10 @@
 static 
 UInt dis_Grp3 ( UChar sorb, Int sz, UInt delta )
 {
-   UInt  d32;
-   UChar modrm;
-   UChar dis_buf[50];
-   Int len;
+   UInt    d32;
+   UChar   modrm;
+   HChar   dis_buf[50];
+   Int     len;
    IRTemp  addr;
    IRType  ty = szToITy(sz);
    IRTemp  t1 = newTemp(ty);
@@ -2906,9 +2957,9 @@
 static
 UInt dis_Grp4 ( UChar sorb, UInt delta )
 {
-  Int alen;
+   Int   alen;
    UChar modrm;
-   UChar dis_buf[50];
+   HChar dis_buf[50];
    IRType ty = Ity_I8;
    IRTemp t1 = newTemp(ty);
    IRTemp t2 = newTemp(ty);
@@ -2967,7 +3018,7 @@
 {
    Int     len;
    UChar   modrm;
-   UChar   dis_buf[50];
+   HChar   dis_buf[50];
    IRTemp  addr = IRTemp_INVALID;
    IRType  ty = szToITy(sz);
    IRTemp  t1 = newTemp(ty);
@@ -3267,7 +3318,7 @@
                    UInt        delta0 )
 {
    Int    alen;
-   UChar  dis_buf[50];
+   HChar  dis_buf[50];
    UChar  rm = getIByte(delta0);
    IRType ty = szToITy(size);
    IRTemp te = newTemp(ty);
@@ -3309,7 +3360,7 @@
                       Int         litsize )
 {
    Int    d32, alen;
-   Char   dis_buf[50];
+   HChar  dis_buf[50];
    UChar  rm = getIByte(delta);
    IRType ty = szToITy(size);
    IRTemp te = newTemp(ty);
@@ -3638,7 +3689,7 @@
 {
    Int    len;
    UInt   r_src, r_dst;
-   UChar  dis_buf[50];
+   HChar  dis_buf[50];
    IRTemp t1, t2;
 
    /* On entry, delta points at the second byte of the insn (the modrm
@@ -4452,23 +4503,23 @@
                d->nFxState = 5;
 
                d->fxState[0].fx     = Ifx_Write;
-               d->fxState[0].offset = offsetof(VexGuestX86State,guest_FTOP);
+               d->fxState[0].offset = OFFB_FTOP;
                d->fxState[0].size   = sizeof(UInt);
 
                d->fxState[1].fx     = Ifx_Write;
-               d->fxState[1].offset = offsetof(VexGuestX86State,guest_FPREG);
+               d->fxState[1].offset = OFFB_FPREGS;
                d->fxState[1].size   = 8 * sizeof(ULong);
 
                d->fxState[2].fx     = Ifx_Write;
-               d->fxState[2].offset = offsetof(VexGuestX86State,guest_FPTAG);
+               d->fxState[2].offset = OFFB_FPTAGS;
                d->fxState[2].size   = 8 * sizeof(UChar);
 
                d->fxState[3].fx     = Ifx_Write;
-               d->fxState[3].offset = offsetof(VexGuestX86State,guest_FPROUND);
+               d->fxState[3].offset = OFFB_FPROUND;
                d->fxState[3].size   = sizeof(UInt);
 
                d->fxState[4].fx     = Ifx_Write;
-               d->fxState[4].offset = offsetof(VexGuestX86State,guest_FC3210);
+               d->fxState[4].offset = OFFB_FC3210;
                d->fxState[4].size   = sizeof(UInt);
 
                stmt( IRStmt_Dirty(d) );
@@ -4509,23 +4560,23 @@
                d->nFxState = 5;
 
                d->fxState[0].fx     = Ifx_Read;
-               d->fxState[0].offset = offsetof(VexGuestX86State,guest_FTOP);
+               d->fxState[0].offset = OFFB_FTOP;
                d->fxState[0].size   = sizeof(UInt);
 
                d->fxState[1].fx     = Ifx_Read;
-               d->fxState[1].offset = offsetof(VexGuestX86State,guest_FPREG);
+               d->fxState[1].offset = OFFB_FPREGS;
                d->fxState[1].size   = 8 * sizeof(ULong);
 
                d->fxState[2].fx     = Ifx_Read;
-               d->fxState[2].offset = offsetof(VexGuestX86State,guest_FPTAG);
+               d->fxState[2].offset = OFFB_FPTAGS;
                d->fxState[2].size   = 8 * sizeof(UChar);
 
                d->fxState[3].fx     = Ifx_Read;
-               d->fxState[3].offset = offsetof(VexGuestX86State,guest_FPROUND);
+               d->fxState[3].offset = OFFB_FPROUND;
                d->fxState[3].size   = sizeof(UInt);
 
                d->fxState[4].fx     = Ifx_Read;
-               d->fxState[4].offset = offsetof(VexGuestX86State,guest_FC3210);
+               d->fxState[4].offset = OFFB_FC3210;
                d->fxState[4].size   = sizeof(UInt);
 
                stmt( IRStmt_Dirty(d) );
@@ -4879,7 +4930,7 @@
                                Char* name,
                                Bool  show_granularity )
 {
-   Char    dis_buf[50];
+   HChar   dis_buf[50];
    UChar   modrm = getIByte(delta);
    Bool    isReg = epartIsReg(modrm);
    IRExpr* argL  = NULL;
@@ -5014,7 +5065,7 @@
 {
    Int   len;
    UChar modrm;
-   UChar dis_buf[50];
+   HChar dis_buf[50];
    UChar opc = getIByte(delta);
    delta++;
 
@@ -5337,7 +5388,7 @@
    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
       for printing it.   And eip on entry points at the modrm byte. */
    Int len;
-   UChar dis_buf[50];
+   HChar dis_buf[50];
 
    IRType ty       = szToITy(sz);
    IRTemp gsrc     = newTemp(ty);
@@ -5463,7 +5514,7 @@
 static
 UInt dis_bt_G_E ( UChar sorb, Int sz, UInt delta, BtOp op )
 {
-   Char   dis_buf[50];
+   HChar  dis_buf[50];
    UChar  modrm;
    Int    len;
    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0, 
@@ -5594,7 +5645,7 @@
 {
    Bool   isReg;
    UChar  modrm;
-   Char   dis_buf[50];
+   HChar  dis_buf[50];
    
    IRType ty  = szToITy(sz);
    IRTemp src = newTemp(ty);
@@ -5760,7 +5811,7 @@
                        Int         size, 
                        UInt        delta0 )
 {
-   UChar dis_buf[50];
+   HChar dis_buf[50];
    Int   len;
 
    IRType ty    = szToITy(size);
@@ -5813,7 +5864,7 @@
 //--                      Addr        eip0 )
 //-- {
 //--    Int   tal, tah, junkl, junkh, destl, desth, srcl, srch, accl, acch;
-//--    UChar dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar rm;
 //--    UInt  pair;
 //-- 
@@ -5899,7 +5950,7 @@
                     UInt        delta0 )
 {
    UChar rm  = getIByte(delta0);
-   UChar dis_buf[50];
+   HChar dis_buf[50];
    Int   len;
 
    IRType ty   = szToITy(sz);
@@ -5950,7 +6001,7 @@
 {
    Int   len;
    UChar rm = getIByte(delta0);
-   UChar dis_buf[50];
+   HChar dis_buf[50];
 
    //   Int tmpd = newTemp(cb);
    //Int tmpt = newTemp(cb);
@@ -6009,7 +6060,7 @@
 //--                      Addr        eip0 )
 //-- {
 //--    UChar rm  = getUChar(eip0);
-//--    UChar dis_buf[50];
+//--    HChar dis_buf[50];
 //-- 
 //--    if (epartIsReg(rm)) {
 //--       Int tmpv = newTemp(cb);
@@ -6054,7 +6105,7 @@
                      UInt  delta0 )
 {
    UChar rm = getIByte(delta0);
-   //UChar dis_buf[50];
+   //HChar dis_buf[50];
 
    vassert(sz == 2 || sz == 4);
 
@@ -6099,7 +6150,7 @@
 //--                                Char* name,
 //--                                Bool show_granularity )
 //-- {
-//--     Char dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    Bool  isReg = epartIsReg(modrm);
 //-- 
@@ -6141,7 +6192,7 @@
 //--                                     Char* name,
 //--                                     Bool show_granularity )
 //-- {
-//--     Char dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    UChar imm8;
 //--    Bool  isReg = epartIsReg(modrm);
@@ -6196,7 +6247,7 @@
 //--                            UChar opc2, 
 //--                            UChar opc3 )
 //-- {
-//--     Char dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    Bool  isReg = epartIsReg(modrm);
 //-- 
@@ -6241,7 +6292,7 @@
 //--                            UChar opc1, 
 //--                            UChar opc2 )
 //-- {
-//--     Char dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    Bool  isReg = epartIsReg(modrm);
 //-- 
@@ -6285,7 +6336,7 @@
 //--                                 UChar opc1, 
 //--                                 UChar opc2 )
 //-- {
-//--     Char dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    UChar imm8;
 //--    Bool  isReg = epartIsReg(modrm);
@@ -6333,7 +6384,7 @@
 //--                                 UChar opc2,
 //--                                 UChar opc3 )
 //-- {
-//--     Char dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    UChar imm8;
 //--    Bool  isReg = epartIsReg(modrm);
@@ -6381,7 +6432,7 @@
 //--                                   UChar insn1, 
 //--                                   UChar insn2 )
 //-- {
-//--     Char dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    Bool  isReg = epartIsReg(modrm);
 //--    UInt  pair;
@@ -6431,7 +6482,7 @@
 //--                                   UChar insn0, 
 //--                                   UChar insn1 )
 //-- {
-//--     Char dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    Bool  isReg = epartIsReg(modrm);
 //--    UInt  pair;
@@ -6484,7 +6535,7 @@
 //--                        UChar opc1, 
 //--                        UChar opc2 )
 //-- {
-//--    UChar dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    if (epartIsReg(modrm)) {
 //--       /* Completely internal SSE insn. */
@@ -6524,7 +6575,7 @@
 //--                          UChar opc1, 
 //--                          UChar opc2 )
 //-- {
-//--    UChar dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    if (epartIsReg(modrm)) {
 //--       /* Completely internal SSE insn. */
@@ -6565,7 +6616,7 @@
 //--                        UChar opc2, 
 //--                        UChar opc3 )
 //-- {
-//--    UChar dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    if (epartIsReg(modrm)) {
 //--       /* Completely internal SSE insn. */
@@ -6606,7 +6657,7 @@
 //--                          UChar opc2, 
 //--                          UChar opc3 )
 //-- {
-//--    UChar dis_buf[50];
+//--    HChar dis_buf[50];
 //--    UChar modrm = getUChar(eip);
 //--    if (epartIsReg(modrm)) {
 //--       /* Completely internal SSE insn. */
@@ -6668,6 +6719,47 @@
    jmp_treg(Ijk_Ret,t2);
 }
 
+/* ------ SSE/SSE2/SSE3 helpers ----- */
+
+/* Overwrite bits 63:0 of guest XMM register 'xmmreg' with e64, leaving
+   bits 127:64 as they are (read-modify-write of the whole register). */
+static void putXMMRegLO64( Int xmmreg, IRExpr* e64 )
+{
+   IRExpr* oldHi = unop(Iop_128HIto64, getXMMReg(xmmreg));
+   IRExpr* newV  = binop(Iop_64HLto128, oldHi, e64);
+
+   putXMMReg( xmmreg, newV );
+}
+
+/* Overwrite bits 127:64 of guest XMM register 'xmmreg' with e64, leaving
+   bits 63:0 as they are (read-modify-write of the whole register). */
+static void putXMMRegHI64( Int xmmreg, IRExpr* e64 )
+{
+   IRExpr* oldLo = unop(Iop_128to64, getXMMReg(xmmreg));
+   IRExpr* newV  = binop(Iop_64HLto128, e64, oldLo);
+
+   putXMMReg( xmmreg, newV );
+}
+
+/* Emit IR for G := G <op> E over all 128 bits; E is an XMM register or a
+   128-bit memory operand.  delta is at the modrm byte; returns new delta. */
+static UInt dis_SSE_E_to_G ( UChar sorb, UInt delta, 
+                             HChar* opname, IROp op )
+{
+   HChar   dis_buf[50];
+   Int     alen = 1;  /* register form: the modrm byte is the whole operand */
+   UChar   rm   = getIByte(delta);
+   Bool    isR  = epartIsReg(rm);
+   IRExpr* argE = isR ? getXMMReg(eregOfRM(rm))
+                      : loadLE(Ity_V128,
+                               mkexpr(disAMode(&alen, sorb, delta, dis_buf)));
+   putXMMReg( gregOfRM(rm), binop(op, getXMMReg(gregOfRM(rm)), argE) );
+   DIP("%s %s,%s\n", opname,
+                     isR ? nameXMMReg(eregOfRM(rm)) : dis_buf,
+                     nameXMMReg(gregOfRM(rm)) );
+   return delta+alen;
+}
+
 
 /*------------------------------------------------------------*/
 /*--- Disassemble a single instruction                     ---*/
@@ -6691,9 +6783,10 @@
    Int       alen;
    UChar     opc, modrm, abyte;
    UInt      d32;
-   UChar     dis_buf[50];
+   HChar     dis_buf[50];
    Int       am_sz, d_sz;
    DisResult whatNext = Dis_Continue;
+   UChar*    insn; /* used in SSE decoders */
 
    //Char  loc_buf[M_VG_ERRTXT];
 
@@ -6787,14 +6880,87 @@
          break;
    }
 
-//--    /* ---------------------------------------------------- */
-//--    /* --- The SSE/SSE2 decoder.                        --- */
-//--    /* ---------------------------------------------------- */
-//-- 
-//--    /* If it looks like this CPU might support SSE, try decoding SSE
-//--       insns.  */
-//--    if (VG_(have_ssestate)) {
-//--    UChar* insn = (UChar*)eip;
+   /* ---------------------------------------------------- */
+   /* --- The SSE decoder.                             --- */
+   /* ---------------------------------------------------- */
+
+   /* Note, this doesn't handle SSE2 or SSE3. */
+
+   insn = (UChar*)&guest_code[delta];
+
+   /* 0F 12 = MOVLPS -- move from mem to low half of XMM.  With a register
+      E-part this opcode is MOVHLPS instead, which is not decoded here. */
+   if (insn[0] == 0x0F && insn[1] == 0x12 && !epartIsReg(insn[2])) {
+      delta += 2;
+      addr = disAMode ( &alen, sorb, delta, dis_buf );
+      delta += alen;
+      putXMMRegLO64( gregOfRM(insn[2]),
+                     loadLE(Ity_I64, mkexpr(addr)) );
+
+      DIP("movlps %s, %s\n", 
+          dis_buf, nameXMMReg( gregOfRM(insn[2]) ));
+
+      goto decode_success;
+   }
+
+   /* 0F 13 = MOVLPS -- move from low half of XMM to mem.  This opcode has
+      no register-to-register form, so only a memory E-part is accepted. */
+   if (insn[0] == 0x0F && insn[1] == 0x13 && !epartIsReg(insn[2])) {
+      delta += 2;
+      addr = disAMode ( &alen, sorb, delta, dis_buf );
+      delta += alen;
+      storeLE( mkexpr(addr),
+               unop(Iop_128to64, getXMMReg( gregOfRM(insn[2]) )) );
+
+      DIP("movlps %s, %s\n", 
+          nameXMMReg( gregOfRM(insn[2]) ),
+          dis_buf);
+
+      goto decode_success;
+   }
+
+   /* 0F 16 = MOVHPS -- move from mem to high half of XMM.  With a register
+      E-part this opcode is MOVLHPS instead, which is not decoded here. */
+   if (insn[0] == 0x0F && insn[1] == 0x16 && !epartIsReg(insn[2])) {
+      vassert(sz == 4);
+      delta += 2;
+      addr = disAMode ( &alen, sorb, delta, dis_buf );
+      delta += alen;
+      putXMMRegHI64( gregOfRM(insn[2]),
+                     loadLE(Ity_I64, mkexpr(addr)) );
+
+      DIP("movhps %s, %s\n", 
+          dis_buf, nameXMMReg( gregOfRM(insn[2]) ));
+
+      goto decode_success;
+   }
+
+   /* 0F 17 = MOVHPS -- move from high half of XMM to mem.  This opcode has
+      no register-to-register form, so only a memory E-part is accepted. */
+   if (insn[0] == 0x0F && insn[1] == 0x17 && !epartIsReg(insn[2])) {
+      vassert(sz == 4);
+      delta += 2;
+      addr = disAMode ( &alen, sorb, delta, dis_buf );
+      delta += alen;
+      storeLE( mkexpr(addr),
+               unop(Iop_128HIto64, getXMMReg( gregOfRM(insn[2]) )) );
+
+      DIP("movhps %s, %s\n", 
+          nameXMMReg( gregOfRM(insn[2]) ),
+          dis_buf);
+
+      goto decode_success;
+   }
+
+   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
+   if (insn[0] == 0x0F && insn[1] == 0x58) {
+      vassert(sz == 4);
+      delta = dis_SSE_E_to_G( sorb, delta+2, "addps", Iop_Add32Fx4 );
+      goto decode_success;
+   }
+
+
+
 //-- 
 //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
 //--    if (insn[0] == 0x0F && insn[1] == 0xAE 
diff --git a/priv/ir/irdefs.c b/priv/ir/irdefs.c
index 8105796..632fc8e 100644
--- a/priv/ir/irdefs.c
+++ b/priv/ir/irdefs.c
@@ -55,6 +55,7 @@
     case Ity_I64:     vex_printf( "I64"); break;
     case Ity_F32:     vex_printf( "F32"); break;
     case Ity_F64:     vex_printf( "F64"); break;
+    case Ity_V128:    vex_printf( "V128"); break;
     default: vex_printf("ty = 0x%x\n", (Int)ty);
              vpanic("ppIRType");
   }
@@ -217,7 +218,13 @@
       case Iop_ReinterpF64asI64: vex_printf("ReinterpF64asI64"); return;
       case Iop_ReinterpI64asF64: vex_printf("ReinterpI64asF64"); return;
 
-      default:           vpanic("ppIROp(1)");
+      case Iop_Add32Fx4: vex_printf("Add32Fx4"); return;
+
+      case Iop_64HLto128: vex_printf("64HLto128"); return;
+      case Iop_128to64:   vex_printf("128to64");   return;
+      case Iop_128HIto64: vex_printf("128HIto64"); return;
+
+      default: vpanic("ppIROp(1)");
    }
   
    switch (op - base) {
@@ -237,9 +244,9 @@
       vex_printf("BIND-%d", e->Iex.Binder.binder);
       break;
     case Iex_Get:
-      vex_printf( "GET(%d,", e->Iex.Get.offset);
+      vex_printf( "GET:" );
       ppIRType(e->Iex.Get.ty);
-      vex_printf(")");
+      vex_printf("(%d)", e->Iex.Get.offset);
       break;
     case Iex_GetI:
       vex_printf( "GETI" );
@@ -1059,6 +1066,13 @@
       case Iop_F32toF64: UNARY(Ity_F64,Ity_F32);
       case Iop_F64toF32: UNARY(Ity_F32,Ity_F64);
 
+      case Iop_64HLto128: BINARY(Ity_V128, Ity_I64,Ity_I64);
+      case Iop_128to64: case Iop_128HIto64: 
+         UNARY(Ity_I64, Ity_V128);
+
+      case Iop_Add32Fx4:
+         BINARY(Ity_V128, Ity_V128,Ity_V128);
+
       default:
          ppIROp(op);
          vpanic("typeOfPrimop");
@@ -1187,6 +1201,7 @@
       case Ity_INVALID: case Ity_I1:
       case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64: 
       case Ity_F32: case Ity_F64:
+      case Ity_V128:
          return True;
       default: 
          return False;
@@ -1713,12 +1728,13 @@
 Int sizeofIRType ( IRType ty )
 {
    switch (ty) {
-      case Ity_I8:  return 1;
-      case Ity_I16: return 2;
-      case Ity_I32: return 4;
-      case Ity_I64: return 8;
-      case Ity_F32: return 4;
-      case Ity_F64: return 8;
+      case Ity_I8:   return 1;
+      case Ity_I16:  return 2;
+      case Ity_I32:  return 4;
+      case Ity_I64:  return 8;
+      case Ity_F32:  return 4;
+      case Ity_F64:  return 8;
+      case Ity_V128: return 16;
       default: vex_printf("\n"); ppIRType(ty); vex_printf("\n");
                vpanic("sizeofIRType");
    }
diff --git a/priv/main/vex_main.c b/priv/main/vex_main.c
index b6e70b6..4fb4860 100644
--- a/priv/main/vex_main.c
+++ b/priv/main/vex_main.c
@@ -125,6 +125,7 @@
    vassert(1 == sizeof(Bool));
    vassert(4 == sizeof(Addr32));
    vassert(8 == sizeof(Addr64));
+   vassert(16 == sizeof(U128));
 
    vassert(sizeof(void*) == 4 || sizeof(void*) == 8);
    vassert(sizeof(void*) == sizeof(int*));
diff --git a/pub/libvex_basictypes.h b/pub/libvex_basictypes.h
index cf800c6..ef06af2 100644
--- a/pub/libvex_basictypes.h
+++ b/pub/libvex_basictypes.h
@@ -59,6 +59,9 @@
 typedef  unsigned long long int   ULong;
 typedef    signed long long int   Long;
 
+/* Always 128 bits. */
+typedef  UInt  U128[4];
+
 
 typedef  float   Float;    /* IEEE754 single-precision (32-bit) value */
 typedef  double  Double;   /* IEEE754 double-precision (64-bit) value */
diff --git a/pub/libvex_guest_x86.h b/pub/libvex_guest_x86.h
index e3ddfed..c80fafd 100644
--- a/pub/libvex_guest_x86.h
+++ b/pub/libvex_guest_x86.h
@@ -130,6 +130,16 @@
       UChar guest_FPTAG[8];
       UInt  guest_FPROUND;
       UInt  guest_FC3210;
+      /* SSE */
+      UInt  guest_SSEROUND;
+      U128  guest_XMM0;
+      U128  guest_XMM1;
+      U128  guest_XMM2;
+      U128  guest_XMM3;
+      U128  guest_XMM4;
+      U128  guest_XMM5;
+      U128  guest_XMM6;
+      U128  guest_XMM7;
       /* Segment registers. */
       UShort guest_CS;
       UShort guest_DS;
@@ -140,7 +150,7 @@
       /* Emulation warnings */
       UInt   guest_EMWARN;
       /* Padding to make it have an 8-aligned size */
-      /* UInt   padding; */
+      UInt   padding;
    }
    VexGuestX86State;
 
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index c37adde..73be625 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -46,10 +46,16 @@
 /* ------------------ Types ------------------ */
 
 typedef 
-   enum { Ity_INVALID=0x10FFF,
-          Ity_I1=0x11000, 
-          Ity_I8, Ity_I16, Ity_I32, Ity_I64,
-          Ity_F32, Ity_F64
+   enum { 
+      Ity_INVALID=0x10FFF,
+      Ity_I1=0x11000, 
+      Ity_I8, 
+      Ity_I16, 
+      Ity_I32, 
+      Ity_I64,
+      Ity_F32,   /* IEEE 754 float */
+      Ity_F64,   /* IEEE 754 double */
+      Ity_V128   /* 128-bit SIMD */
    }
    IRType;
 
@@ -60,11 +66,15 @@
 /* ------------------ Constants ------------------ */
 
 typedef
-   enum { Ico_U1=0x12000,
-          Ico_U8, Ico_U16, Ico_U32, Ico_U64,
-          Ico_F64, /* 64-bit IEEE754 floating */
-          Ico_F64i /* 64-bit unsigned int to be interpreted literally
-                      as a IEEE754 double value. */
+   enum { 
+      Ico_U1=0x12000,
+      Ico_U8, 
+      Ico_U16, 
+      Ico_U32, 
+      Ico_U64,
+      Ico_F64, /* 64-bit IEEE754 floating */
+      Ico_F64i /* 64-bit unsigned int to be interpreted literally
+                  as a IEEE754 double value. */
    }
    IRConstTag;
 
@@ -303,7 +313,50 @@
 
       /* Reinterpretation.  Take an F64 and produce an I64 with 
          the same bit pattern, or vice versa. */
-      Iop_ReinterpF64asI64, Iop_ReinterpI64asF64
+      Iop_ReinterpF64asI64, Iop_ReinterpI64asF64,
+
+      /* ------------------ 128-bit SIMD. ------------------ */
+
+      /* 128-bit ops */
+      Iop_And128, Iop_Or128, Iop_Xor128, Iop_Andn128,
+
+      /* --- 32x4 vector FP --- */
+
+      /* binary */
+      Iop_Add32Fx4, Iop_Sub32Fx4, Iop_Mul32Fx4, Iop_Div32Fx4, 
+      Iop_Max32Fx4, Iop_Min32Fx4,
+
+      /* unary */
+      Iop_Recip32Fx4, Iop_Sqrt32Fx4, Iop_RSqrt32Fx4,
+      Iop_ItoF32x4, /* first arg is IRRoundingMode (Ity_I32) */
+      Iop_FtoI32x4, /* first arg is IRRoundingMode (Ity_I32) */
+
+      /* --- 32x4 lowest-lane-only scalar FP --- */
+
+      /* In binary cases, upper 3/4 is copied from first operand.  In
+	 unary cases, upper 3/4 is copied from the operand. */
+
+      /* binary */
+      Iop_Add32F0x4, Iop_Sub32F0x4, Iop_Mul32F0x4, Iop_Div32F0x4, 
+      Iop_Max32F0x4, Iop_Min32F0x4,
+
+      /* unary */
+      Iop_Recip32F0x4, Iop_Sqrt32F0x4, Iop_RSqrt32F0x4,
+      Iop_ItoF320x4, /* first arg is IRRoundingMode (Ity_I32) */
+      Iop_FtoI320x4, /* first arg is IRRoundingMode (Ity_I32) */
+
+      /* --- pack / unpack --- */
+
+      /* 64 <-> 128 bit pack/unpack */
+      Iop_128to64,     // :: V128 -> I64, low half
+      Iop_128HIto64,   // :: V128 -> I64, high half
+      Iop_64HLto128,   // :: (I64,I64) -> V128
+
+      /* 128 -> 32 bit unpack */
+      Iop_128W3to32,   // :: V128 -> I32, bits 127-96
+      Iop_128W2to32,   // :: V128 -> I32, bits 95-64
+      Iop_128W1to32,   // :: V128 -> I32, bits 63-32
+      Iop_128W0to32    // :: V128 -> I32, bits 31-0
    }
    IROp;
 
