diff --git a/priv/host-x86/hdefs.c b/priv/host-x86/hdefs.c
index d61f216..c6d472f 100644
--- a/priv/host-x86/hdefs.c
+++ b/priv/host-x86/hdefs.c
@@ -1855,6 +1855,8 @@
                         subopc_imm = 0; opc_imma = 0x05; break;
          case Xalu_SUB: opc = 0x2B; opc_rr = 0x29; 
                         subopc_imm = 5; opc_imma = 0x2D; break;
+         case Xalu_SBB: opc = 0x1B; opc_rr = 0x19; 
+                        subopc_imm = 3; opc_imma = 0x1D; break;
          case Xalu_AND: opc = 0x23; opc_rr = 0x21; 
                         subopc_imm = 4; opc_imma = 0x25; break;
          case Xalu_XOR: opc = 0x33; opc_rr = 0x31; 
diff --git a/priv/host-x86/isel.c b/priv/host-x86/isel.c
index a913691..7d14df2 100644
--- a/priv/host-x86/isel.c
+++ b/priv/host-x86/isel.c
@@ -2450,6 +2450,33 @@
    if (e->tag == Iex_Unop) {
    switch (e->Iex.Unop.op) {
 
+      case Iop_CmpNEZ32x4: {
+         /* Slow path forced by SSE1: every 32-bit lane is tested with
+            integer instructions through a 16-byte area at 0(%esp).  Per lane:
+               movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
+               sbbl %r, %r               (now %r = 1Sto32(CF))
+               movl %r, lane
+            sbbl reads the CF that negl produced, so their order matters. */
+         Int       i;
+         X86AMode* am;
+         X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());   /* 0(%esp) */
+         HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
+         HReg      dst  = newVRegV(env);
+         HReg      r32  = newVRegI(env);   /* integer temp holding one lane */
+         sub_from_esp(env, 16);   /* presumably reserves the 16-byte area */
+         addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
+         for (i = 0; i < 4; i++) {
+            am = X86AMode_IR(i*4, hregX86_ESP());   /* lane i at 4*i(%esp) */
+            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
+            addInstr(env, X86Instr_Unary32(Xun_NEG, X86RM_Reg(r32)));
+            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
+            addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
+         }
+         addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
+         add_to_esp(env, 16);   /* presumably releases the area again */
+         return dst;
+      }
+
       case Iop_Recip32Fx4: op = Xsse_RCPF;   goto do_32Fx4_unary;
       case Iop_RSqrt32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
       case Iop_Sqrt32Fx4:  op = Xsse_SQRTF;  goto do_32Fx4_unary;
diff --git a/priv/ir/irdefs.c b/priv/ir/irdefs.c
index 1308c64..0f0306c 100644
--- a/priv/ir/irdefs.c
+++ b/priv/ir/irdefs.c
@@ -295,9 +295,10 @@
       case Iop_Set128lo32: vex_printf("Set128lo32"); return;
       case Iop_Set128lo64: vex_printf("Set128lo64"); return;
 
-      case Iop_And128:  vex_printf("And128"); return;
-      case Iop_Or128:   vex_printf("Or128");  return;
-      case Iop_Xor128:  vex_printf("Xor128"); return;
+      case Iop_And128:     vex_printf("And128"); return;
+      case Iop_Or128:      vex_printf("Or128");  return;
+      case Iop_Xor128:     vex_printf("Xor128"); return;
+      case Iop_CmpNEZ32x4: vex_printf("CmpNEZ32x4"); return;
 
       case Iop_Add8x16:   vex_printf("Add8x16"); return;
       case Iop_Add16x8:   vex_printf("Add16x8"); return;
@@ -1275,6 +1276,7 @@
       case Iop_RSqrt64Fx2: case Iop_RSqrt64F0x2:
       case Iop_Sqrt32Fx4:  case Iop_Sqrt32F0x4:
       case Iop_Sqrt64Fx2:  case Iop_Sqrt64F0x2:
+      case Iop_CmpNEZ32x4:
          UNARY(Ity_V128, Ity_V128);
 
       case Iop_ShlN16x8: case Iop_ShlN32x4: case Iop_ShlN64x2:
diff --git a/pub/libvex_guest_x86.h b/pub/libvex_guest_x86.h
index 2201194..ac95b04 100644
--- a/pub/libvex_guest_x86.h
+++ b/pub/libvex_guest_x86.h
@@ -124,34 +124,34 @@
 
 typedef
    struct {
-      UInt  guest_EAX;
+      UInt  guest_EAX;         /* 0 */
       UInt  guest_ECX;
       UInt  guest_EDX;
       UInt  guest_EBX;
       UInt  guest_ESP;
       UInt  guest_EBP;
       UInt  guest_ESI;
-      UInt  guest_EDI;
+      UInt  guest_EDI;         /* 28 */
       /* 4-word thunk used to calculate O S Z A C P flags. */
-      UInt  guest_CC_OP;
+      UInt  guest_CC_OP;       /* 32 */
       UInt  guest_CC_DEP1;
       UInt  guest_CC_DEP2;
-      UInt  guest_CC_NDEP;
+      UInt  guest_CC_NDEP;     /* 44 */
       /* The D flag is stored here, encoded as either -1 or +1 */
-      UInt  guest_DFLAG;
+      UInt  guest_DFLAG;       /* 48 */
       /* Bit 21 (ID) of eflags stored here, as either 0 or 1. */
-      UInt  guest_IDFLAG;
+      UInt  guest_IDFLAG;      /* 52 */
       /* EIP */
-      UInt  guest_EIP;
+      UInt  guest_EIP;         /* 56 */
       /* FPU */
-      UInt  guest_FTOP;
-      ULong guest_FPREG[8];
-      UChar guest_FPTAG[8];
-      UInt  guest_FPROUND;
-      UInt  guest_FC3210;
+      UInt  guest_FTOP;        /* 60 */
+      ULong guest_FPREG[8];    /* 64 */
+      UChar guest_FPTAG[8];   /* 128 */
+      UInt  guest_FPROUND;    /* 136 */
+      UInt  guest_FC3210;     /* 140 */
       /* SSE */
-      UInt  guest_SSEROUND;
-      U128  guest_XMM0;
+      UInt  guest_SSEROUND;   /* 144 */
+      U128  guest_XMM0;       /* 148 */
       U128  guest_XMM1;
       U128  guest_XMM2;
       U128  guest_XMM3;
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index d8d8cc5..d923962 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -377,21 +377,27 @@
 
       /* --- pack / unpack --- */
 
-      /* 64 <-> 128 bit pack/unpack */
+      /* 64 <-> 128 bit */
       Iop_128to64,     // :: V128 -> I64, low half
       Iop_128HIto64,   // :: V128 -> I64, high half
       Iop_64HLto128,   // :: (I64,I64) -> V128
 
-      Iop_32Uto128,
       Iop_64Uto128,
-      Iop_Set128lo32,
       Iop_Set128lo64,
 
+      /* 32 <-> 128 bit */
+      Iop_32Uto128,
+      Iop_128to32,     // :: V128 -> I32, lowest lane
+      Iop_Set128lo32,  // :: (V128,I32) -> V128
+
       /* ------------------ 128-bit SIMD Integer. ------------------ */
 
       /* BITWISE OPS */
       Iop_And128, Iop_Or128, Iop_Xor128, 
 
+      /* MISC (32x4 integer cmp != 0) */
+      Iop_CmpNEZ32x4,
+
       /* ADDITION (normal / unsigned sat / signed sat) */
       Iop_Add8x16,   Iop_Add16x8,   Iop_Add32x4,  Iop_Add64x2,
       Iop_QAdd8Ux16, Iop_QAdd16Ux8,
