Merge from branches/THUMB: new IR primops and associated
infrastructure, needed to represent NEON instructions.  Way more new
ones than I would like, but I can't see a way to avoid having them.



git-svn-id: svn://svn.valgrind.org/vex/trunk@2016 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index 4d35401..48519e4 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -313,48 +313,125 @@
       case Iop_I32UtoFx4: vex_printf("I32UtoFx4"); return;
       case Iop_I32StoFx4: vex_printf("I32StoFx4"); return;
 
+      case Iop_F32toF16x4: vex_printf("F32toF16x4"); return;
+      case Iop_F16toF32x4: vex_printf("F16toF32x4"); return;
+
+      case Iop_Rsqrte32Fx4: vex_printf("VRsqrte32Fx4"); return;
+      case Iop_Rsqrte32x4:  vex_printf("VRsqrte32x4"); return;
+      case Iop_Rsqrte32Fx2: vex_printf("VRsqrte32Fx2"); return;
+      case Iop_Rsqrte32x2:  vex_printf("VRsqrte32x2"); return;
+
       case Iop_QFtoI32Ux4_RZ: vex_printf("QFtoI32Ux4_RZ"); return;
       case Iop_QFtoI32Sx4_RZ: vex_printf("QFtoI32Sx4_RZ"); return;
 
+      case Iop_FtoI32Ux4_RZ: vex_printf("FtoI32Ux4_RZ"); return;
+      case Iop_FtoI32Sx4_RZ: vex_printf("FtoI32Sx4_RZ"); return;
+
+      case Iop_I32UtoFx2: vex_printf("I32UtoFx2"); return;
+      case Iop_I32StoFx2: vex_printf("I32StoFx2"); return;
+
+      case Iop_FtoI32Ux2_RZ: vex_printf("FtoI32Ux2_RZ"); return;
+      case Iop_FtoI32Sx2_RZ: vex_printf("FtoI32Sx2_RZ"); return;
+
       case Iop_RoundF32x4_RM: vex_printf("RoundF32x4_RM"); return;
       case Iop_RoundF32x4_RP: vex_printf("RoundF32x4_RP"); return;
       case Iop_RoundF32x4_RN: vex_printf("RoundF32x4_RN"); return;
       case Iop_RoundF32x4_RZ: vex_printf("RoundF32x4_RZ"); return;
 
+      case Iop_Abs8x8: vex_printf("Abs8x8"); return;
+      case Iop_Abs16x4: vex_printf("Abs16x4"); return;
+      case Iop_Abs32x2: vex_printf("Abs32x2"); return;
       case Iop_Add8x8: vex_printf("Add8x8"); return;
       case Iop_Add16x4: vex_printf("Add16x4"); return;
       case Iop_Add32x2: vex_printf("Add32x2"); return;
       case Iop_QAdd8Ux8: vex_printf("QAdd8Ux8"); return;
       case Iop_QAdd16Ux4: vex_printf("QAdd16Ux4"); return;
+      case Iop_QAdd32Ux2: vex_printf("QAdd32Ux2"); return;
+      case Iop_QAdd64Ux1: vex_printf("QAdd64Ux1"); return;
       case Iop_QAdd8Sx8: vex_printf("QAdd8Sx8"); return;
       case Iop_QAdd16Sx4: vex_printf("QAdd16Sx4"); return;
+      case Iop_QAdd32Sx2: vex_printf("QAdd32Sx2"); return;
+      case Iop_QAdd64Sx1: vex_printf("QAdd64Sx1"); return;
+      case Iop_PwAdd8x8: vex_printf("PwAdd8x8"); return;
+      case Iop_PwAdd16x4: vex_printf("PwAdd16x4"); return;
+      case Iop_PwAdd32x2: vex_printf("PwAdd32x2"); return;
+      case Iop_PwAdd32Fx2: vex_printf("PwAdd32Fx2"); return;
+      case Iop_PwAddL8Ux8: vex_printf("PwAddL8Ux8"); return;
+      case Iop_PwAddL16Ux4: vex_printf("PwAddL16Ux4"); return;
+      case Iop_PwAddL32Ux2: vex_printf("PwAddL32Ux2"); return;
+      case Iop_PwAddL8Sx8: vex_printf("PwAddL8Sx8"); return;
+      case Iop_PwAddL16Sx4: vex_printf("PwAddL16Sx4"); return;
+      case Iop_PwAddL32Sx2: vex_printf("PwAddL32Sx2"); return;
       case Iop_Sub8x8: vex_printf("Sub8x8"); return;
       case Iop_Sub16x4: vex_printf("Sub16x4"); return;
       case Iop_Sub32x2: vex_printf("Sub32x2"); return;
       case Iop_QSub8Ux8: vex_printf("QSub8Ux8"); return;
       case Iop_QSub16Ux4: vex_printf("QSub16Ux4"); return;
+      case Iop_QSub32Ux2: vex_printf("QSub32Ux2"); return;
+      case Iop_QSub64Ux1: vex_printf("QSub64Ux1"); return;
       case Iop_QSub8Sx8: vex_printf("QSub8Sx8"); return;
       case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return;
+      case Iop_QSub32Sx2: vex_printf("QSub32Sx2"); return;
+      case Iop_QSub64Sx1: vex_printf("QSub64Sx1"); return;
+      case Iop_Mul8x8: vex_printf("Mul8x8"); return;
       case Iop_Mul16x4: vex_printf("Mul16x4"); return;
       case Iop_Mul32x2: vex_printf("Mul32x2"); return;
-      case Iop_Mul32x4: vex_printf("Mul32x4"); return;
+      case Iop_Mul32Fx2: vex_printf("Mul32Fx2"); return;
+      case Iop_PolynomialMul8x8: vex_printf("PolynomialMul8x8"); return;
       case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return;
       case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return;
+      case Iop_QDMulHi16Sx4: vex_printf("QDMulHi16Sx4"); return;
+      case Iop_QDMulHi32Sx2: vex_printf("QDMulHi32Sx2"); return;
+      case Iop_QRDMulHi16Sx4: vex_printf("QRDMulHi16Sx4"); return;
+      case Iop_QRDMulHi32Sx2: vex_printf("QRDMulHi32Sx2"); return;
+      case Iop_QDMulLong16Sx4: vex_printf("QDMulLong16Sx4"); return;
+      case Iop_QDMulLong32Sx2: vex_printf("QDMulLong32Sx2"); return;
       case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return;
       case Iop_Avg16Ux4: vex_printf("Avg16Ux4"); return;
+      case Iop_Max8Sx8: vex_printf("Max8Sx8"); return;
       case Iop_Max16Sx4: vex_printf("Max16Sx4"); return;
+      case Iop_Max32Sx2: vex_printf("Max32Sx2"); return;
       case Iop_Max8Ux8: vex_printf("Max8Ux8"); return;
+      case Iop_Max16Ux4: vex_printf("Max16Ux4"); return;
+      case Iop_Max32Ux2: vex_printf("Max32Ux2"); return;
+      case Iop_Min8Sx8: vex_printf("Min8Sx8"); return;
       case Iop_Min16Sx4: vex_printf("Min16Sx4"); return;
+      case Iop_Min32Sx2: vex_printf("Min32Sx2"); return;
       case Iop_Min8Ux8: vex_printf("Min8Ux8"); return;
+      case Iop_Min16Ux4: vex_printf("Min16Ux4"); return;
+      case Iop_Min32Ux2: vex_printf("Min32Ux2"); return;
+      case Iop_PwMax8Sx8: vex_printf("PwMax8Sx8"); return;
+      case Iop_PwMax16Sx4: vex_printf("PwMax16Sx4"); return;
+      case Iop_PwMax32Sx2: vex_printf("PwMax32Sx2"); return;
+      case Iop_PwMax8Ux8: vex_printf("PwMax8Ux8"); return;
+      case Iop_PwMax16Ux4: vex_printf("PwMax16Ux4"); return;
+      case Iop_PwMax32Ux2: vex_printf("PwMax32Ux2"); return;
+      case Iop_PwMin8Sx8: vex_printf("PwMin8Sx8"); return;
+      case Iop_PwMin16Sx4: vex_printf("PwMin16Sx4"); return;
+      case Iop_PwMin32Sx2: vex_printf("PwMin32Sx2"); return;
+      case Iop_PwMin8Ux8: vex_printf("PwMin8Ux8"); return;
+      case Iop_PwMin16Ux4: vex_printf("PwMin16Ux4"); return;
+      case Iop_PwMin32Ux2: vex_printf("PwMin32Ux2"); return;
       case Iop_CmpEQ8x8: vex_printf("CmpEQ8x8"); return;
       case Iop_CmpEQ16x4: vex_printf("CmpEQ16x4"); return;
       case Iop_CmpEQ32x2: vex_printf("CmpEQ32x2"); return;
+      case Iop_CmpGT8Ux8: vex_printf("CmpGT8Ux8"); return;
+      case Iop_CmpGT16Ux4: vex_printf("CmpGT16Ux4"); return;
+      case Iop_CmpGT32Ux2: vex_printf("CmpGT32Ux2"); return;
       case Iop_CmpGT8Sx8: vex_printf("CmpGT8Sx8"); return;
       case Iop_CmpGT16Sx4: vex_printf("CmpGT16Sx4"); return;
       case Iop_CmpGT32Sx2: vex_printf("CmpGT32Sx2"); return;
+      case Iop_Cnt8x8: vex_printf("Cnt8x8"); return;
+      case Iop_Clz8Sx8: vex_printf("Clz8Sx8"); return;
+      case Iop_Clz16Sx4: vex_printf("Clz16Sx4"); return;
+      case Iop_Clz32Sx2: vex_printf("Clz32Sx2"); return;
+      case Iop_Cls8Sx8: vex_printf("Cls8Sx8"); return;
+      case Iop_Cls16Sx4: vex_printf("Cls16Sx4"); return;
+      case Iop_Cls32Sx2: vex_printf("Cls32Sx2"); return;
       case Iop_ShlN8x8: vex_printf("ShlN8x8"); return;
       case Iop_ShlN16x4: vex_printf("ShlN16x4"); return;
       case Iop_ShlN32x2: vex_printf("ShlN32x2"); return;
+      case Iop_ShrN8x8: vex_printf("ShrN8x8"); return;
       case Iop_ShrN16x4: vex_printf("ShrN16x4"); return;
       case Iop_ShrN32x2: vex_printf("ShrN32x2"); return;
       case Iop_SarN8x8: vex_printf("SarN8x8"); return;
@@ -369,15 +446,62 @@
       case Iop_InterleaveLO8x8: vex_printf("InterleaveLO8x8"); return;
       case Iop_InterleaveLO16x4: vex_printf("InterleaveLO16x4"); return;
       case Iop_InterleaveLO32x2: vex_printf("InterleaveLO32x2"); return;
+      case Iop_CatOddLanes8x8: vex_printf("CatOddLanes8x8"); return;
       case Iop_CatOddLanes16x4: vex_printf("CatOddLanes16x4"); return;
+      case Iop_CatEvenLanes8x8: vex_printf("CatEvenLanes8x8"); return;
       case Iop_CatEvenLanes16x4: vex_printf("CatEvenLanes16x4"); return;
+      case Iop_InterleaveOddLanes8x8: vex_printf("InterleaveOddLanes8x8"); return;
+      case Iop_InterleaveOddLanes16x4: vex_printf("InterleaveOddLanes16x4"); return;
+      case Iop_InterleaveEvenLanes8x8: vex_printf("InterleaveEvenLanes8x8"); return;
+      case Iop_InterleaveEvenLanes16x4: vex_printf("InterleaveEvenLanes16x4"); return;
+      case Iop_Shl8x8: vex_printf("Shl8x8"); return;
+      case Iop_Shl16x4: vex_printf("Shl16x4"); return;
+      case Iop_Shl32x2: vex_printf("Shl32x2"); return;
+      case Iop_Shr8x8: vex_printf("Shr8x8"); return;
+      case Iop_Shr16x4: vex_printf("Shr16x4"); return;
+      case Iop_Shr32x2: vex_printf("Shr32x2"); return;
+      case Iop_QShl8x8: vex_printf("QShl8x8"); return;
+      case Iop_QShl16x4: vex_printf("QShl16x4"); return;
+      case Iop_QShl32x2: vex_printf("QShl32x2"); return;
+      case Iop_QShl64x1: vex_printf("QShl64x1"); return;
+      case Iop_QSal8x8: vex_printf("QSal8x8"); return;
+      case Iop_QSal16x4: vex_printf("QSal16x4"); return;
+      case Iop_QSal32x2: vex_printf("QSal32x2"); return;
+      case Iop_QSal64x1: vex_printf("QSal64x1"); return;
+      case Iop_QShlN8x8: vex_printf("QShlN8x8"); return;
+      case Iop_QShlN16x4: vex_printf("QShlN16x4"); return;
+      case Iop_QShlN32x2: vex_printf("QShlN32x2"); return;
+      case Iop_QShlN64x1: vex_printf("QShlN64x1"); return;
+      case Iop_QShlN8Sx8: vex_printf("QShlN8Sx8"); return;
+      case Iop_QShlN16Sx4: vex_printf("QShlN16Sx4"); return;
+      case Iop_QShlN32Sx2: vex_printf("QShlN32Sx2"); return;
+      case Iop_QShlN64Sx1: vex_printf("QShlN64Sx1"); return;
+      case Iop_QSalN8x8: vex_printf("QSalN8x8"); return;
+      case Iop_QSalN16x4: vex_printf("QSalN16x4"); return;
+      case Iop_QSalN32x2: vex_printf("QSalN32x2"); return;
+      case Iop_QSalN64x1: vex_printf("QSalN64x1"); return;
+      case Iop_Sar8x8: vex_printf("Sar8x8"); return;
+      case Iop_Sar16x4: vex_printf("Sar16x4"); return;
+      case Iop_Sar32x2: vex_printf("Sar32x2"); return;
+      case Iop_Sal8x8: vex_printf("Sal8x8"); return;
+      case Iop_Sal16x4: vex_printf("Sal16x4"); return;
+      case Iop_Sal32x2: vex_printf("Sal32x2"); return;
+      case Iop_Sal64x1: vex_printf("Sal64x1"); return;
       case Iop_Perm8x8: vex_printf("Perm8x8"); return;
+      case Iop_Reverse16_8x8: vex_printf("Reverse16_8x8"); return;
+      case Iop_Reverse32_8x8: vex_printf("Reverse32_8x8"); return;
+      case Iop_Reverse32_16x4: vex_printf("Reverse32_16x4"); return;
+      case Iop_Reverse64_8x8: vex_printf("Reverse64_8x8"); return;
+      case Iop_Reverse64_16x4: vex_printf("Reverse64_16x4"); return;
+      case Iop_Reverse64_32x2: vex_printf("Reverse64_32x2"); return;
+      case Iop_Abs32Fx2: vex_printf("Abs32Fx2"); return;
 
       case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return;
       case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return;
       case Iop_CmpNEZ8x8:  vex_printf("CmpNEZ8x8"); return;
 
       case Iop_Add32Fx4:  vex_printf("Add32Fx4"); return;
+      case Iop_Add32Fx2:  vex_printf("Add32Fx2"); return;
       case Iop_Add32F0x4: vex_printf("Add32F0x4"); return;
       case Iop_Add64Fx2:  vex_printf("Add64Fx2"); return;
       case Iop_Add64F0x2: vex_printf("Add64F0x2"); return;
@@ -388,11 +512,17 @@
       case Iop_Div64F0x2: vex_printf("Div64F0x2"); return;
 
       case Iop_Max32Fx4:  vex_printf("Max32Fx4"); return;
+      case Iop_Max32Fx2:  vex_printf("Max32Fx2"); return;
+      case Iop_PwMax32Fx4:  vex_printf("PwMax32Fx4"); return;
+      case Iop_PwMax32Fx2:  vex_printf("PwMax32Fx2"); return;
       case Iop_Max32F0x4: vex_printf("Max32F0x4"); return;
       case Iop_Max64Fx2:  vex_printf("Max64Fx2"); return;
       case Iop_Max64F0x2: vex_printf("Max64F0x2"); return;
 
       case Iop_Min32Fx4:  vex_printf("Min32Fx4"); return;
+      case Iop_Min32Fx2:  vex_printf("Min32Fx2"); return;
+      case Iop_PwMin32Fx4:  vex_printf("PwMin32Fx4"); return;
+      case Iop_PwMin32Fx2:  vex_printf("PwMin32Fx2"); return;
       case Iop_Min32F0x4: vex_printf("Min32F0x4"); return;
       case Iop_Min64Fx2:  vex_printf("Min64Fx2"); return;
       case Iop_Min64F0x2: vex_printf("Min64F0x2"); return;
@@ -402,10 +532,18 @@
       case Iop_Mul64Fx2:  vex_printf("Mul64Fx2"); return;
       case Iop_Mul64F0x2: vex_printf("Mul64F0x2"); return;
 
+      case Iop_Recip32x2: vex_printf("Recip32x2"); return;
+      case Iop_Recip32Fx2:  vex_printf("Recip32Fx2"); return;
       case Iop_Recip32Fx4:  vex_printf("Recip32Fx4"); return;
+      case Iop_Recip32x4:  vex_printf("Recip32x4"); return;
       case Iop_Recip32F0x4: vex_printf("Recip32F0x4"); return;
       case Iop_Recip64Fx2:  vex_printf("Recip64Fx2"); return;
       case Iop_Recip64F0x2: vex_printf("Recip64F0x2"); return;
+      case Iop_Recps32Fx2:  vex_printf("VRecps32Fx2"); return;
+      case Iop_Recps32Fx4:  vex_printf("VRecps32Fx4"); return;
+      case Iop_Abs32Fx4:  vex_printf("Abs32Fx4"); return;
+      case Iop_Rsqrts32Fx4:  vex_printf("VRsqrts32Fx4"); return;
+      case Iop_Rsqrts32Fx2:  vex_printf("VRsqrts32Fx2"); return;
 
       case Iop_RSqrt32Fx4:  vex_printf("RSqrt32Fx4"); return;
       case Iop_RSqrt32F0x4: vex_printf("RSqrt32F0x4"); return;
@@ -418,6 +556,7 @@
       case Iop_Sqrt64F0x2: vex_printf("Sqrt64F0x2"); return;
 
       case Iop_Sub32Fx4:  vex_printf("Sub32Fx4"); return;
+      case Iop_Sub32Fx2:  vex_printf("Sub32Fx2"); return;
       case Iop_Sub32F0x4: vex_printf("Sub32F0x4"); return;
       case Iop_Sub64Fx2:  vex_printf("Sub64Fx2"); return;
       case Iop_Sub64F0x2: vex_printf("Sub64F0x2"); return;
@@ -432,6 +571,9 @@
       case Iop_CmpLT64Fx2: vex_printf("CmpLT64Fx2"); return;
       case Iop_CmpLE64Fx2: vex_printf("CmpLE64Fx2"); return;
       case Iop_CmpUN64Fx2: vex_printf("CmpUN64Fx2"); return;
+      case Iop_CmpGT32Fx2: vex_printf("CmpGT32Fx2"); return;
+      case Iop_CmpEQ32Fx2: vex_printf("CmpEQ32Fx2"); return;
+      case Iop_CmpGE32Fx2: vex_printf("CmpGE32Fx2"); return;
 
       case Iop_CmpEQ32F0x4: vex_printf("CmpEQ32F0x4"); return;
       case Iop_CmpLT32F0x4: vex_printf("CmpLT32F0x4"); return;
@@ -442,6 +584,9 @@
       case Iop_CmpLE64F0x2: vex_printf("CmpLE64F0x2"); return;
       case Iop_CmpUN64F0x2: vex_printf("CmpUN64F0x2"); return;
 
+      case Iop_Neg32Fx4: vex_printf("Neg32Fx4"); return;
+      case Iop_Neg32Fx2: vex_printf("Neg32Fx2"); return;
+
       case Iop_V128to64:   vex_printf("V128to64");   return;
       case Iop_V128HIto64: vex_printf("V128HIto64"); return;
       case Iop_64HLtoV128: vex_printf("64HLtoV128"); return;
@@ -456,6 +601,9 @@
       case Iop_Dup8x16: vex_printf("Dup8x16"); return;
       case Iop_Dup16x8: vex_printf("Dup16x8"); return;
       case Iop_Dup32x4: vex_printf("Dup32x4"); return;
+      case Iop_Dup8x8: vex_printf("Dup8x8"); return;
+      case Iop_Dup16x4: vex_printf("Dup16x4"); return;
+      case Iop_Dup32x2: vex_printf("Dup32x2"); return;
 
       case Iop_NotV128:    vex_printf("NotV128"); return;
       case Iop_AndV128:    vex_printf("AndV128"); return;
@@ -467,6 +615,10 @@
       case Iop_CmpNEZ32x4: vex_printf("CmpNEZ32x4"); return;
       case Iop_CmpNEZ64x2: vex_printf("CmpNEZ64x2"); return;
 
+      case Iop_Abs8x16: vex_printf("Abs8x16"); return;
+      case Iop_Abs16x8: vex_printf("Abs16x8"); return;
+      case Iop_Abs32x4: vex_printf("Abs32x4"); return;
+
       case Iop_Add8x16:   vex_printf("Add8x16"); return;
       case Iop_Add16x8:   vex_printf("Add16x8"); return;
       case Iop_Add32x4:   vex_printf("Add32x4"); return;
@@ -477,6 +629,17 @@
       case Iop_QAdd8Sx16: vex_printf("QAdd8Sx16"); return;
       case Iop_QAdd16Sx8: vex_printf("QAdd16Sx8"); return;
       case Iop_QAdd32Sx4: vex_printf("QAdd32Sx4"); return;
+      case Iop_QAdd64Ux2: vex_printf("QAdd64Ux2"); return;
+      case Iop_QAdd64Sx2: vex_printf("QAdd64Sx2"); return;
+      case Iop_PwAdd8x16: vex_printf("PwAdd8x16"); return;
+      case Iop_PwAdd16x8: vex_printf("PwAdd16x8"); return;
+      case Iop_PwAdd32x4: vex_printf("PwAdd32x4"); return;
+      case Iop_PwAddL8Ux16: vex_printf("PwAddL8Ux16"); return;
+      case Iop_PwAddL16Ux8: vex_printf("PwAddL16Ux8"); return;
+      case Iop_PwAddL32Ux4: vex_printf("PwAddL32Ux4"); return;
+      case Iop_PwAddL8Sx16: vex_printf("PwAddL8Sx16"); return;
+      case Iop_PwAddL16Sx8: vex_printf("PwAddL16Sx8"); return;
+      case Iop_PwAddL32Sx4: vex_printf("PwAddL32Sx4"); return;
 
       case Iop_Sub8x16:   vex_printf("Sub8x16"); return;
       case Iop_Sub16x8:   vex_printf("Sub16x8"); return;
@@ -488,12 +651,28 @@
       case Iop_QSub8Sx16: vex_printf("QSub8Sx16"); return;
       case Iop_QSub16Sx8: vex_printf("QSub16Sx8"); return;
       case Iop_QSub32Sx4: vex_printf("QSub32Sx4"); return;
+      case Iop_QSub64Ux2: vex_printf("QSub64Ux2"); return;
+      case Iop_QSub64Sx2: vex_printf("QSub64Sx2"); return;
 
+      case Iop_Mul8x16:    vex_printf("Mul8x16"); return;
       case Iop_Mul16x8:    vex_printf("Mul16x8"); return;
+      case Iop_Mul32x4:    vex_printf("Mul32x4"); return;
+      case Iop_Mull8Ux8:    vex_printf("Mull8Ux8"); return;
+      case Iop_Mull8Sx8:    vex_printf("Mull8Sx8"); return;
+      case Iop_Mull16Ux4:    vex_printf("Mull16Ux4"); return;
+      case Iop_Mull16Sx4:    vex_printf("Mull16Sx4"); return;
+      case Iop_Mull32Ux2:    vex_printf("Mull32Ux2"); return;
+      case Iop_Mull32Sx2:    vex_printf("Mull32Sx2"); return;
+      case Iop_PolynomialMul8x16: vex_printf("PolynomialMul8x16"); return;
+      case Iop_PolynomialMull8x8: vex_printf("PolynomialMull8x8"); return;
       case Iop_MulHi16Ux8: vex_printf("MulHi16Ux8"); return;
       case Iop_MulHi32Ux4: vex_printf("MulHi32Ux4"); return;
       case Iop_MulHi16Sx8: vex_printf("MulHi16Sx8"); return;
       case Iop_MulHi32Sx4: vex_printf("MulHi32Sx4"); return;
+      case Iop_QDMulHi16Sx8: vex_printf("QDMulHi16Sx8"); return;
+      case Iop_QDMulHi32Sx4: vex_printf("QDMulHi32Sx4"); return;
+      case Iop_QRDMulHi16Sx8: vex_printf("QRDMulHi16Sx8"); return;
+      case Iop_QRDMulHi32Sx4: vex_printf("QRDMulHi32Sx4"); return;
 
       case Iop_MullEven8Ux16: vex_printf("MullEven8Ux16"); return;
       case Iop_MullEven16Ux8: vex_printf("MullEven16Ux8"); return;
@@ -532,6 +711,14 @@
       case Iop_CmpGT16Ux8: vex_printf("CmpGT16Ux8"); return;
       case Iop_CmpGT32Ux4: vex_printf("CmpGT32Ux4"); return;
 
+      case Iop_Cnt8x16: vex_printf("Cnt8x16"); return;
+      case Iop_Clz8Sx16: vex_printf("Clz8Sx16"); return;
+      case Iop_Clz16Sx8: vex_printf("Clz16Sx8"); return;
+      case Iop_Clz32Sx4: vex_printf("Clz32Sx4"); return;
+      case Iop_Cls8Sx16: vex_printf("Cls8Sx16"); return;
+      case Iop_Cls16Sx8: vex_printf("Cls16Sx8"); return;
+      case Iop_Cls32Sx4: vex_printf("Cls32Sx4"); return;
+
       case Iop_ShlV128: vex_printf("ShlV128"); return;
       case Iop_ShrV128: vex_printf("ShrV128"); return;
 
@@ -546,16 +733,44 @@
       case Iop_SarN8x16: vex_printf("SarN8x16"); return;
       case Iop_SarN16x8: vex_printf("SarN16x8"); return;
       case Iop_SarN32x4: vex_printf("SarN32x4"); return;
+      case Iop_SarN64x2: vex_printf("SarN64x2"); return;
 
       case Iop_Shl8x16: vex_printf("Shl8x16"); return;
       case Iop_Shl16x8: vex_printf("Shl16x8"); return;
       case Iop_Shl32x4: vex_printf("Shl32x4"); return;
+      case Iop_Shl64x2: vex_printf("Shl64x2"); return;
+      case Iop_QSal8x16: vex_printf("QSal8x16"); return;
+      case Iop_QSal16x8: vex_printf("QSal16x8"); return;
+      case Iop_QSal32x4: vex_printf("QSal32x4"); return;
+      case Iop_QSal64x2: vex_printf("QSal64x2"); return;
+      case Iop_QShl8x16: vex_printf("QShl8x16"); return;
+      case Iop_QShl16x8: vex_printf("QShl16x8"); return;
+      case Iop_QShl32x4: vex_printf("QShl32x4"); return;
+      case Iop_QShl64x2: vex_printf("QShl64x2"); return;
+      case Iop_QSalN8x16: vex_printf("QSalN8x16"); return;
+      case Iop_QSalN16x8: vex_printf("QSalN16x8"); return;
+      case Iop_QSalN32x4: vex_printf("QSalN32x4"); return;
+      case Iop_QSalN64x2: vex_printf("QSalN64x2"); return;
+      case Iop_QShlN8x16: vex_printf("QShlN8x16"); return;
+      case Iop_QShlN16x8: vex_printf("QShlN16x8"); return;
+      case Iop_QShlN32x4: vex_printf("QShlN32x4"); return;
+      case Iop_QShlN64x2: vex_printf("QShlN64x2"); return;
+      case Iop_QShlN8Sx16: vex_printf("QShlN8Sx16"); return;
+      case Iop_QShlN16Sx8: vex_printf("QShlN16Sx8"); return;
+      case Iop_QShlN32Sx4: vex_printf("QShlN32Sx4"); return;
+      case Iop_QShlN64Sx2: vex_printf("QShlN64Sx2"); return;
       case Iop_Shr8x16: vex_printf("Shr8x16"); return;
       case Iop_Shr16x8: vex_printf("Shr16x8"); return;
       case Iop_Shr32x4: vex_printf("Shr32x4"); return;
+      case Iop_Shr64x2: vex_printf("Shr64x2"); return;
       case Iop_Sar8x16: vex_printf("Sar8x16"); return;
       case Iop_Sar16x8: vex_printf("Sar16x8"); return;
       case Iop_Sar32x4: vex_printf("Sar32x4"); return;
+      case Iop_Sar64x2: vex_printf("Sar64x2"); return;
+      case Iop_Sal8x16: vex_printf("Sal8x16"); return;
+      case Iop_Sal16x8: vex_printf("Sal16x8"); return;
+      case Iop_Sal32x4: vex_printf("Sal32x4"); return;
+      case Iop_Sal64x2: vex_printf("Sal64x2"); return;
       case Iop_Rol8x16: vex_printf("Rol8x16"); return;
       case Iop_Rol16x8: vex_printf("Rol16x8"); return;
       case Iop_Rol32x4: vex_printf("Rol32x4"); return;
@@ -566,6 +781,24 @@
       case Iop_QNarrow32Ux4: vex_printf("QNarrow32Ux4"); return;
       case Iop_QNarrow16Sx8: vex_printf("QNarrow16Sx8"); return;
       case Iop_QNarrow32Sx4: vex_printf("QNarrow32Sx4"); return;
+      case Iop_Shorten16x8: vex_printf("Shorten16x8"); return;
+      case Iop_Shorten32x4: vex_printf("Shorten32x4"); return;
+      case Iop_Shorten64x2: vex_printf("Shorten64x2"); return;
+      case Iop_QShortenU16Ux8: vex_printf("QShortenU16Ux8"); return;
+      case Iop_QShortenU32Ux4: vex_printf("QShortenU32Ux4"); return;
+      case Iop_QShortenU64Ux2: vex_printf("QShortenU64Ux2"); return;
+      case Iop_QShortenS16Sx8: vex_printf("QShortenS16Sx8"); return;
+      case Iop_QShortenS32Sx4: vex_printf("QShortenS32Sx4"); return;
+      case Iop_QShortenS64Sx2: vex_printf("QShortenS64Sx2"); return;
+      case Iop_QShortenU16Sx8: vex_printf("QShortenU16Sx8"); return;
+      case Iop_QShortenU32Sx4: vex_printf("QShortenU32Sx4"); return;
+      case Iop_QShortenU64Sx2: vex_printf("QShortenU64Sx2"); return;
+      case Iop_Longen8Ux8: vex_printf("Longen8Ux8"); return;
+      case Iop_Longen16Ux4: vex_printf("Longen16Ux4"); return;
+      case Iop_Longen32Ux2: vex_printf("Longen32Ux2"); return;
+      case Iop_Longen8Sx8: vex_printf("Longen8Sx8"); return;
+      case Iop_Longen16Sx4: vex_printf("Longen16Sx4"); return;
+      case Iop_Longen32Sx2: vex_printf("Longen32Sx2"); return;
 
       case Iop_InterleaveHI8x16: vex_printf("InterleaveHI8x16"); return;
       case Iop_InterleaveHI16x8: vex_printf("InterleaveHI16x8"); return;
@@ -576,7 +809,51 @@
       case Iop_InterleaveLO32x4: vex_printf("InterleaveLO32x4"); return;
       case Iop_InterleaveLO64x2: vex_printf("InterleaveLO64x2"); return;
 
+      case Iop_CatOddLanes8x16: vex_printf("CatOddLanes8x16"); return;
+      case Iop_CatOddLanes16x8: vex_printf("CatOddLanes16x8"); return;
+      case Iop_CatOddLanes32x4: vex_printf("CatOddLanes32x4"); return;
+      case Iop_CatEvenLanes8x16: vex_printf("CatEvenLanes8x16"); return;
+      case Iop_CatEvenLanes16x8: vex_printf("CatEvenLanes16x8"); return;
+      case Iop_CatEvenLanes32x4: vex_printf("CatEvenLanes32x4"); return;
+
+      case Iop_InterleaveOddLanes8x16: vex_printf("InterleaveOddLanes8x16"); return;
+      case Iop_InterleaveOddLanes16x8: vex_printf("InterleaveOddLanes16x8"); return;
+      case Iop_InterleaveOddLanes32x4: vex_printf("InterleaveOddLanes32x4"); return;
+      case Iop_InterleaveEvenLanes8x16: vex_printf("InterleaveEvenLanes8x16"); return;
+      case Iop_InterleaveEvenLanes16x8: vex_printf("InterleaveEvenLanes16x8"); return;
+      case Iop_InterleaveEvenLanes32x4: vex_printf("InterleaveEvenLanes32x4"); return;
+
+      case Iop_GetElem8x16: vex_printf("GetElem8x16"); return;
+      case Iop_GetElem16x8: vex_printf("GetElem16x8"); return;
+      case Iop_GetElem32x4: vex_printf("GetElem32x4"); return;
+      case Iop_GetElem64x2: vex_printf("GetElem64x2"); return;
+
+      case Iop_GetElem8x8: vex_printf("GetElem8x8"); return;
+      case Iop_GetElem16x4: vex_printf("GetElem16x4"); return;
+      case Iop_GetElem32x2: vex_printf("GetElem32x2"); return;
+      case Iop_SetElem8x8: vex_printf("SetElem8x8"); return;
+      case Iop_SetElem16x4: vex_printf("SetElem16x4"); return;
+      case Iop_SetElem32x2: vex_printf("SetElem32x2"); return;
+
+      case Iop_Extract64: vex_printf("Extract64"); return;
+      case Iop_ExtractV128: vex_printf("ExtractV128"); return;
+
       case Iop_Perm8x16: vex_printf("Perm8x16"); return;
+      case Iop_Reverse16_8x16: vex_printf("Reverse16_8x16"); return;
+      case Iop_Reverse32_8x16: vex_printf("Reverse32_8x16"); return;
+      case Iop_Reverse32_16x8: vex_printf("Reverse32_16x8"); return;
+      case Iop_Reverse64_8x16: vex_printf("Reverse64_8x16"); return;
+      case Iop_Reverse64_16x8: vex_printf("Reverse64_16x8"); return;
+      case Iop_Reverse64_32x4: vex_printf("Reverse64_32x4"); return;
+
+      case Iop_F32ToFixed32Ux4_RZ: vex_printf("F32ToFixed32Ux4_RZ"); return;
+      case Iop_F32ToFixed32Sx4_RZ: vex_printf("F32ToFixed32Sx4_RZ"); return;
+      case Iop_Fixed32UToF32x4_RN: vex_printf("Fixed32UToF32x4_RN"); return;
+      case Iop_Fixed32SToF32x4_RN: vex_printf("Fixed32SToF32x4_RN"); return;
+      case Iop_F32ToFixed32Ux2_RZ: vex_printf("F32ToFixed32Ux2_RZ"); return;
+      case Iop_F32ToFixed32Sx2_RZ: vex_printf("F32ToFixed32Sx2_RZ"); return;
+      case Iop_Fixed32UToF32x2_RN: vex_printf("Fixed32UToF32x2_RN"); return;
+      case Iop_Fixed32SToF32x2_RN: vex_printf("Fixed32SToF32x2_RN"); return;
 
       default: vpanic("ppIROp(1)");
    }
@@ -1182,6 +1459,21 @@
    vec[7] = NULL;
    return vec;
 }
+IRExpr** mkIRExprVec_8 ( IRExpr* arg1, IRExpr* arg2, IRExpr* arg3,
+                         IRExpr* arg4, IRExpr* arg5, IRExpr* arg6,
+                         IRExpr* arg7, IRExpr* arg8 ) {
+   IRExpr** vec = LibVEX_Alloc(9 * sizeof(IRExpr*));
+   vec[0] = arg1;
+   vec[1] = arg2;
+   vec[2] = arg3;
+   vec[3] = arg4;
+   vec[4] = arg5;
+   vec[5] = arg6;
+   vec[6] = arg7;
+   vec[7] = arg8;
+   vec[8] = NULL;
+   return vec;
+}
 
 
 /* Constructors -- IRDirty */
@@ -1625,29 +1917,67 @@
       case Iop_CmpORD64S:
       case Iop_Avg8Ux8: case Iop_Avg16Ux4:
       case Iop_Add8x8: case Iop_Add16x4: case Iop_Add32x2:
+      case Iop_Add32Fx2: case Iop_Sub32Fx2:
       case Iop_CmpEQ8x8: case Iop_CmpEQ16x4: case Iop_CmpEQ32x2:
       case Iop_CmpGT8Sx8: case Iop_CmpGT16Sx4: case Iop_CmpGT32Sx2:
+      case Iop_CmpGT8Ux8: case Iop_CmpGT16Ux4: case Iop_CmpGT32Ux2:
+      case Iop_CmpGT32Fx2: case Iop_CmpEQ32Fx2: case Iop_CmpGE32Fx2:
       case Iop_InterleaveHI8x8: case Iop_InterleaveLO8x8:
       case Iop_InterleaveHI16x4: case Iop_InterleaveLO16x4:
       case Iop_InterleaveHI32x2: case Iop_InterleaveLO32x2:
+      case Iop_CatOddLanes8x8: case Iop_CatEvenLanes8x8:
       case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4:
+      case Iop_InterleaveOddLanes8x8: case Iop_InterleaveEvenLanes8x8:
+      case Iop_InterleaveOddLanes16x4: case Iop_InterleaveEvenLanes16x4:
       case Iop_Perm8x8:
-      case Iop_Max8Ux8: case Iop_Max16Sx4:
-      case Iop_Min8Ux8: case Iop_Min16Sx4:
-      case Iop_Mul16x4: case Iop_Mul32x2:
+      case Iop_Max8Ux8: case Iop_Max16Ux4: case Iop_Max32Ux2:
+      case Iop_Max8Sx8: case Iop_Max16Sx4: case Iop_Max32Sx2:
+      case Iop_Max32Fx2: case Iop_Min32Fx2:
+      case Iop_PwMax32Fx2: case Iop_PwMin32Fx2:
+      case Iop_Min8Ux8: case Iop_Min16Ux4: case Iop_Min32Ux2:
+      case Iop_Min8Sx8: case Iop_Min16Sx4: case Iop_Min32Sx2:
+      case Iop_PwMax8Ux8: case Iop_PwMax16Ux4: case Iop_PwMax32Ux2:
+      case Iop_PwMax8Sx8: case Iop_PwMax16Sx4: case Iop_PwMax32Sx2:
+      case Iop_PwMin8Ux8: case Iop_PwMin16Ux4: case Iop_PwMin32Ux2:
+      case Iop_PwMin8Sx8: case Iop_PwMin16Sx4: case Iop_PwMin32Sx2:
+      case Iop_Mul8x8: case Iop_Mul16x4: case Iop_Mul32x2:
+      case Iop_Mul32Fx2:
+      case Iop_PolynomialMul8x8:
       case Iop_MulHi16Sx4: case Iop_MulHi16Ux4:
+      case Iop_QDMulHi16Sx4: case Iop_QDMulHi32Sx2:
+      case Iop_QRDMulHi16Sx4: case Iop_QRDMulHi32Sx2:
       case Iop_QAdd8Sx8: case Iop_QAdd16Sx4:
+      case Iop_QAdd32Sx2: case Iop_QAdd64Sx1:
       case Iop_QAdd8Ux8: case Iop_QAdd16Ux4:
+      case Iop_QAdd32Ux2: case Iop_QAdd64Ux1:
+      case Iop_PwAdd8x8: case Iop_PwAdd16x4: case Iop_PwAdd32x2:
+      case Iop_PwAdd32Fx2:
       case Iop_QNarrow32Sx2:
       case Iop_QNarrow16Sx4: case Iop_QNarrow16Ux4:
       case Iop_Sub8x8: case Iop_Sub16x4: case Iop_Sub32x2:
       case Iop_QSub8Sx8: case Iop_QSub16Sx4:
+      case Iop_QSub32Sx2: case Iop_QSub64Sx1:
       case Iop_QSub8Ux8: case Iop_QSub16Ux4:
+      case Iop_QSub32Ux2: case Iop_QSub64Ux1:
+      case Iop_Shl8x8: case Iop_Shl16x4: case Iop_Shl32x2:
+      case Iop_Shr8x8: case Iop_Shr16x4: case Iop_Shr32x2:
+      case Iop_Sar8x8: case Iop_Sar16x4: case Iop_Sar32x2:
+      case Iop_Sal8x8: case Iop_Sal16x4: case Iop_Sal32x2: case Iop_Sal64x1:
+      case Iop_QShl8x8: case Iop_QShl16x4: case Iop_QShl32x2: case Iop_QShl64x1:
+      case Iop_QSal8x8: case Iop_QSal16x4: case Iop_QSal32x2: case Iop_QSal64x1:
+      case Iop_Recps32Fx2:
+      case Iop_Rsqrts32Fx2:
          BINARY(Ity_I64,Ity_I64, Ity_I64);
 
       case Iop_ShlN32x2: case Iop_ShlN16x4: case Iop_ShlN8x8:
-      case Iop_ShrN32x2: case Iop_ShrN16x4:
+      case Iop_ShrN32x2: case Iop_ShrN16x4: case Iop_ShrN8x8:
       case Iop_SarN32x2: case Iop_SarN16x4: case Iop_SarN8x8:
+      case Iop_QShlN8x8: case Iop_QShlN16x4:
+      case Iop_QShlN32x2: case Iop_QShlN64x1:
+      case Iop_QShlN8Sx8: case Iop_QShlN16Sx4:
+      case Iop_QShlN32Sx2: case Iop_QShlN64Sx1:
+      case Iop_QSalN8x8: case Iop_QSalN16x4:
+      case Iop_QSalN32x2: case Iop_QSalN64x1:
          BINARY(Ity_I64,Ity_I8, Ity_I64);
 
       case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
@@ -1668,6 +1998,22 @@
 
       case Iop_Not64:
       case Iop_CmpNEZ32x2: case Iop_CmpNEZ16x4: case Iop_CmpNEZ8x8:
+      case Iop_Cnt8x8:
+      case Iop_Clz8Sx8: case Iop_Clz16Sx4: case Iop_Clz32Sx2:
+      case Iop_Cls8Sx8: case Iop_Cls16Sx4: case Iop_Cls32Sx2:
+      case Iop_PwAddL8Ux8: case Iop_PwAddL16Ux4: case Iop_PwAddL32Ux2:
+      case Iop_PwAddL8Sx8: case Iop_PwAddL16Sx4: case Iop_PwAddL32Sx2:
+      case Iop_Reverse64_8x8: case Iop_Reverse64_16x4: case Iop_Reverse64_32x2:
+      case Iop_Reverse32_8x8: case Iop_Reverse32_16x4:
+      case Iop_Reverse16_8x8:
+      case Iop_FtoI32Sx2_RZ: case Iop_FtoI32Ux2_RZ:
+      case Iop_I32StoFx2: case Iop_I32UtoFx2:
+      case Iop_Recip32x2: case Iop_Recip32Fx2:
+      case Iop_Abs32Fx2:
+      case Iop_Rsqrte32Fx2:
+      case Iop_Rsqrte32x2:
+      case Iop_Neg32Fx2:
+      case Iop_Abs8x8: case Iop_Abs16x4: case Iop_Abs32x2:
          UNARY(Ity_I64, Ity_I64);
 
       case Iop_CmpEQ8: case Iop_CmpNE8:
@@ -1853,16 +2199,31 @@
       case Iop_I32StoFx4:
       case Iop_QFtoI32Ux4_RZ:
       case Iop_QFtoI32Sx4_RZ:
+      case Iop_FtoI32Ux4_RZ:
+      case Iop_FtoI32Sx4_RZ:
       case Iop_RoundF32x4_RM:
       case Iop_RoundF32x4_RP:
       case Iop_RoundF32x4_RN:
       case Iop_RoundF32x4_RZ:
+      case Iop_Abs32Fx4:
+      case Iop_Rsqrte32Fx4:
+      case Iop_Rsqrte32x4:
          UNARY(Ity_V128, Ity_V128);
 
       case Iop_64HLtoV128: BINARY(Ity_I64,Ity_I64, Ity_V128);
-      case Iop_V128to64: case Iop_V128HIto64: 
+      case Iop_V128to64: case Iop_V128HIto64:
+      case Iop_Shorten16x8: case Iop_Shorten32x4: case Iop_Shorten64x2:
+      case Iop_QShortenU16Ux8: case Iop_QShortenU32Ux4: case Iop_QShortenU64Ux2:
+      case Iop_QShortenS16Sx8: case Iop_QShortenS32Sx4: case Iop_QShortenS64Sx2:
+      case Iop_QShortenU16Sx8: case Iop_QShortenU32Sx4: case Iop_QShortenU64Sx2:
+      case Iop_F32toF16x4:
          UNARY(Ity_V128, Ity_I64);
 
+      case Iop_Longen8Ux8: case Iop_Longen16Ux4: case Iop_Longen32Ux2:
+      case Iop_Longen8Sx8: case Iop_Longen16Sx4: case Iop_Longen32Sx2:
+      case Iop_F16toF32x4:
+         UNARY(Ity_I64, Ity_V128);
+
       case Iop_V128to32:    UNARY(Ity_V128, Ity_I32);
       case Iop_32UtoV128:   UNARY(Ity_I32, Ity_V128);
       case Iop_64UtoV128:   UNARY(Ity_I64, Ity_V128);
@@ -1872,6 +2233,9 @@
       case Iop_Dup8x16: UNARY(Ity_I8, Ity_V128);
       case Iop_Dup16x8: UNARY(Ity_I16, Ity_V128);
       case Iop_Dup32x4: UNARY(Ity_I32, Ity_V128);
+      case Iop_Dup8x8:  UNARY(Ity_I8, Ity_I64);
+      case Iop_Dup16x4: UNARY(Ity_I16, Ity_I64);
+      case Iop_Dup32x2: UNARY(Ity_I32, Ity_I64);
 
       case Iop_CmpEQ32Fx4: case Iop_CmpLT32Fx4:
       case Iop_CmpEQ64Fx2: case Iop_CmpLT64Fx2:
@@ -1887,6 +2251,7 @@
       case Iop_Div32Fx4: case Iop_Div32F0x4:
       case Iop_Div64Fx2: case Iop_Div64F0x2:
       case Iop_Max32Fx4: case Iop_Max32F0x4:
+      case Iop_PwMax32Fx4: case Iop_PwMin32Fx4:
       case Iop_Max64Fx2: case Iop_Max64F0x2:
       case Iop_Min32Fx4: case Iop_Min32F0x4:
       case Iop_Min64Fx2: case Iop_Min64F0x2:
@@ -1897,15 +2262,23 @@
       case Iop_AndV128: case Iop_OrV128: case Iop_XorV128:
       case Iop_Add8x16:   case Iop_Add16x8:   
       case Iop_Add32x4:   case Iop_Add64x2:
-      case Iop_QAdd8Ux16: case Iop_QAdd16Ux8: case Iop_QAdd32Ux4:
-      case Iop_QAdd8Sx16: case Iop_QAdd16Sx8: case Iop_QAdd32Sx4:
+      case Iop_QAdd8Ux16: case Iop_QAdd16Ux8:
+      case Iop_QAdd32Ux4: //case Iop_QAdd64Ux2:
+      case Iop_QAdd8Sx16: case Iop_QAdd16Sx8:
+      case Iop_QAdd32Sx4: case Iop_QAdd64Sx2:
+      case Iop_PwAdd8x16: case Iop_PwAdd16x8: case Iop_PwAdd32x4:
       case Iop_Sub8x16:   case Iop_Sub16x8:
       case Iop_Sub32x4:   case Iop_Sub64x2:
-      case Iop_QSub8Ux16: case Iop_QSub16Ux8: case Iop_QSub32Ux4:
-      case Iop_QSub8Sx16: case Iop_QSub16Sx8: case Iop_QSub32Sx4:
-      case Iop_Mul16x8: case Iop_Mul32x4:
+      case Iop_QSub8Ux16: case Iop_QSub16Ux8:
+      case Iop_QSub32Ux4: //case Iop_QSub64Ux2:
+      case Iop_QSub8Sx16: case Iop_QSub16Sx8:
+      case Iop_QSub32Sx4: case Iop_QSub64Sx2:
+      case Iop_Mul8x16: case Iop_Mul16x8: case Iop_Mul32x4:
+      case Iop_PolynomialMul8x16:
       case Iop_MulHi16Ux8: case Iop_MulHi32Ux4: 
       case Iop_MulHi16Sx8: case Iop_MulHi32Sx4: 
+      case Iop_QDMulHi16Sx8: case Iop_QDMulHi32Sx4:
+      case Iop_QRDMulHi16Sx8: case Iop_QRDMulHi32Sx4:
       case Iop_MullEven8Ux16: case Iop_MullEven16Ux8:
       case Iop_MullEven8Sx16: case Iop_MullEven16Sx8:
       case Iop_Avg8Ux16: case Iop_Avg16Ux8: case Iop_Avg32Ux4:
@@ -1918,22 +2291,40 @@
       case Iop_CmpGT8Sx16: case Iop_CmpGT16Sx8: case Iop_CmpGT32Sx4:
       case Iop_CmpGT64Sx2:
       case Iop_CmpGT8Ux16: case Iop_CmpGT16Ux8: case Iop_CmpGT32Ux4:
-      case Iop_Shl8x16: case Iop_Shl16x8: case Iop_Shl32x4:
-      case Iop_Shr8x16: case Iop_Shr16x8: case Iop_Shr32x4:
-      case Iop_Sar8x16: case Iop_Sar16x8: case Iop_Sar32x4:
+      case Iop_Shl8x16: case Iop_Shl16x8: case Iop_Shl32x4: case Iop_Shl64x2:
+      case Iop_QShl8x16: case Iop_QShl16x8: case Iop_QShl32x4: case Iop_QShl64x2:
+      case Iop_QSal8x16: case Iop_QSal16x8: case Iop_QSal32x4: case Iop_QSal64x2:
+      case Iop_Shr8x16: case Iop_Shr16x8: case Iop_Shr32x4: case Iop_Shr64x2:
+      case Iop_Sar8x16: case Iop_Sar16x8: case Iop_Sar32x4: case Iop_Sar64x2:
+      case Iop_Sal8x16: case Iop_Sal16x8: case Iop_Sal32x4: case Iop_Sal64x2:
       case Iop_Rol8x16: case Iop_Rol16x8: case Iop_Rol32x4:
       case Iop_QNarrow16Ux8: case Iop_QNarrow32Ux4:
       case Iop_QNarrow16Sx8: case Iop_QNarrow32Sx4:
       case Iop_Narrow16x8:   case Iop_Narrow32x4:
       case Iop_InterleaveHI8x16: case Iop_InterleaveHI16x8:
       case Iop_InterleaveHI32x4: case Iop_InterleaveHI64x2:
-      case Iop_InterleaveLO8x16: case Iop_InterleaveLO16x8: 
+      case Iop_InterleaveLO8x16: case Iop_InterleaveLO16x8:
       case Iop_InterleaveLO32x4: case Iop_InterleaveLO64x2:
+      case Iop_CatOddLanes8x16: case Iop_CatEvenLanes8x16:
+      case Iop_CatOddLanes16x8: case Iop_CatEvenLanes16x8:
+      case Iop_CatOddLanes32x4: case Iop_CatEvenLanes32x4:
+      case Iop_InterleaveOddLanes8x16: case Iop_InterleaveEvenLanes8x16:
+      case Iop_InterleaveOddLanes16x8: case Iop_InterleaveEvenLanes16x8:
+      case Iop_InterleaveOddLanes32x4: case Iop_InterleaveEvenLanes32x4:
       case Iop_Perm8x16:
+      case Iop_Recps32Fx4:
+      case Iop_Rsqrts32Fx4:
          BINARY(Ity_V128,Ity_V128, Ity_V128);
 
+      case Iop_PolynomialMull8x8:
+      case Iop_Mull8Ux8: case Iop_Mull8Sx8:
+      case Iop_Mull16Ux4: case Iop_Mull16Sx4:
+      case Iop_Mull32Ux2: case Iop_Mull32Sx2:
+         BINARY(Ity_I64, Ity_I64, Ity_V128);
+
       case Iop_NotV128:
       case Iop_Recip32Fx4: case Iop_Recip32F0x4:
+      case Iop_Recip32x4:
       case Iop_Recip64Fx2: case Iop_Recip64F0x2:
       case Iop_RSqrt32Fx4: case Iop_RSqrt32F0x4:
       case Iop_RSqrt64Fx2: case Iop_RSqrt64F0x2:
@@ -1941,6 +2332,16 @@
       case Iop_Sqrt64Fx2:  case Iop_Sqrt64F0x2:
       case Iop_CmpNEZ8x16: case Iop_CmpNEZ16x8:
       case Iop_CmpNEZ32x4: case Iop_CmpNEZ64x2:
+      case Iop_Cnt8x16:
+      case Iop_Clz8Sx16: case Iop_Clz16Sx8: case Iop_Clz32Sx4:
+      case Iop_Cls8Sx16: case Iop_Cls16Sx8: case Iop_Cls32Sx4:
+      case Iop_PwAddL8Ux16: case Iop_PwAddL16Ux8: case Iop_PwAddL32Ux4:
+      case Iop_PwAddL8Sx16: case Iop_PwAddL16Sx8: case Iop_PwAddL32Sx4:
+      case Iop_Reverse64_8x16: case Iop_Reverse64_16x8: case Iop_Reverse64_32x4:
+      case Iop_Reverse32_8x16: case Iop_Reverse32_16x8:
+      case Iop_Reverse16_8x16:
+      case Iop_Neg32Fx4:
+      case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4:
          UNARY(Ity_V128, Ity_V128);
 
       case Iop_ShlV128: case Iop_ShrV128:
@@ -1948,9 +2349,57 @@
       case Iop_ShlN32x4: case Iop_ShlN64x2:
       case Iop_ShrN8x16: case Iop_ShrN16x8: 
       case Iop_ShrN32x4: case Iop_ShrN64x2:
-      case Iop_SarN8x16: case Iop_SarN16x8: case Iop_SarN32x4:
+      case Iop_SarN8x16: case Iop_SarN16x8:
+      case Iop_SarN32x4: case Iop_SarN64x2:
+      case Iop_QShlN8x16: case Iop_QShlN16x8:
+      case Iop_QShlN32x4: case Iop_QShlN64x2:
+      case Iop_QShlN8Sx16: case Iop_QShlN16Sx8:
+      case Iop_QShlN32Sx4: case Iop_QShlN64Sx2:
+      case Iop_QSalN8x16: case Iop_QSalN16x8:
+      case Iop_QSalN32x4: case Iop_QSalN64x2:
          BINARY(Ity_V128,Ity_I8, Ity_V128);
 
+      case Iop_F32ToFixed32Ux4_RZ:
+      case Iop_F32ToFixed32Sx4_RZ:
+      case Iop_Fixed32UToF32x4_RN:
+      case Iop_Fixed32SToF32x4_RN:
+         BINARY(Ity_V128, Ity_I8, Ity_V128);
+
+      case Iop_F32ToFixed32Ux2_RZ:
+      case Iop_F32ToFixed32Sx2_RZ:
+      case Iop_Fixed32UToF32x2_RN:
+      case Iop_Fixed32SToF32x2_RN:
+         BINARY(Ity_I64, Ity_I8, Ity_I64);
+
+      case Iop_GetElem8x16:
+         BINARY(Ity_V128, Ity_I8, Ity_I8);
+      case Iop_GetElem16x8:
+         BINARY(Ity_V128, Ity_I8, Ity_I16);
+      case Iop_GetElem32x4:
+         BINARY(Ity_V128, Ity_I8, Ity_I32);
+      case Iop_GetElem64x2:
+         BINARY(Ity_V128, Ity_I8, Ity_I64);
+      case Iop_GetElem8x8:
+         BINARY(Ity_I64, Ity_I8, Ity_I8);
+      case Iop_GetElem16x4:
+         BINARY(Ity_I64, Ity_I8, Ity_I16);
+      case Iop_GetElem32x2:
+         BINARY(Ity_I64, Ity_I8, Ity_I32);
+      case Iop_SetElem8x8:
+         TERNARY(Ity_I64, Ity_I8, Ity_I8, Ity_I64);
+      case Iop_SetElem16x4:
+         TERNARY(Ity_I64, Ity_I8, Ity_I16, Ity_I64);
+      case Iop_SetElem32x2:
+         TERNARY(Ity_I64, Ity_I8, Ity_I32, Ity_I64);
+
+      case Iop_Extract64:
+         TERNARY(Ity_I64, Ity_I64, Ity_I8, Ity_I64);
+      case Iop_ExtractV128:
+         TERNARY(Ity_V128, Ity_V128, Ity_I8, Ity_V128);
+
+      case Iop_QDMulLong16Sx4: case Iop_QDMulLong32Sx2:
+         BINARY(Ity_I64, Ity_I64, Ity_V128);
+
       default:
          ppIROp(op);
          vpanic("typeOfPrimop");
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index 0e291eb..ee4df18 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -675,6 +675,49 @@
       Iop_CalcFPRF, /* Calc 5 fpscr[FPRF] bits (Class, <, =, >, Unord)
                        from FP result */
 
+      /* ------------------ 64-bit SIMD FP ------------------------ */
+
+      /* Conversion to/from int */
+      Iop_I32UtoFx2,  Iop_I32StoFx2,          /* I32x2 -> F32x2 */
+      Iop_FtoI32Ux2_RZ,  Iop_FtoI32Sx2_RZ,    /* F32x2 -> I32x2 */
+      /* Fixed32 format is a floating-point number with a fixed number of
+         fraction bits. The number of fraction bits is passed as a second
+         argument of type I8. */
+      Iop_F32ToFixed32Ux2_RZ, Iop_F32ToFixed32Sx2_RZ, /* fp -> fixed-point */
+      Iop_Fixed32UToF32x2_RN, Iop_Fixed32SToF32x2_RN, /* fixed-point -> fp */
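
   As a per-lane scalar reference for the two fixed-point conversions just
   listed (a sketch only; clamping at the I32 range is an assumption, not
   something the comment above states):

   #include <stdint.h>
   #include <math.h>

   /* One lane of Iop_F32ToFixed32Sx2_RZ: multiply by 2^n, truncate toward
      zero.  Saturating to the I32 range is an assumed detail. */
   static int32_t f32_to_fixed32s_rz ( float x, uint8_t n )
   {
      double scaled = trunc( (double)x * ldexp(1.0, n) );
      if (scaled >  2147483647.0)  return INT32_MAX;
      if (scaled < -2147483648.0)  return INT32_MIN;
      return (int32_t)scaled;
   }

   /* One lane of Iop_Fixed32SToF32x2_RN: divide by 2^n. */
   static float fixed32s_to_f32_rn ( int32_t x, uint8_t n )
   {
      return (float)( (double)x / ldexp(1.0, n) );
   }
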
+
+      /* Binary operations */
+      Iop_Max32Fx2,      Iop_Min32Fx2,
+      /* Pairwise Min and Max. See integer pairwise operations for more
+         details. */
+      Iop_PwMax32Fx2,    Iop_PwMin32Fx2,
+      /* Note: For the following compares, the arm front-end assumes a
+         nan in a lane of either argument returns zero for that lane. */
+      Iop_CmpEQ32Fx2, Iop_CmpGT32Fx2, Iop_CmpGE32Fx2,
+
+      /* Vector Reciprocal Estimate finds an approximate reciprocal of each
+      element in the operand vector, and places the results in the destination
+      vector.  */
+      Iop_Recip32Fx2,
+
+      /* Vector Reciprocal Step computes (2.0 - arg1 * arg2).
+         Note that if one of the arguments is zero and the other is infinity
+         of arbitrary sign, the result of the operation is 2.0. */
+      Iop_Recps32Fx2,
+
+      /* Vector Reciprocal Square Root Estimate finds an approximate reciprocal
+         square root of each element in the operand vector. */
+      Iop_Rsqrte32Fx2,
+
+      /* Vector Reciprocal Square Root Step computes (3.0 - arg1 * arg2) / 2.0.
+         Note that if one of the arguments is zero and the other is infinity
+         of arbitrary sign, the result of the operation is 1.5. */
+      Iop_Rsqrts32Fx2,
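
   The estimate and step ops are meant to be combined for Newton-Raphson
   refinement of 1/x and 1/sqrt(x); a per-lane scalar sketch (the estimate
   functions here are stand-ins for the hardware table lookups):

   #include <math.h>

   /* Stand-ins for the per-lane estimates (Iop_Recip32Fx2 / Iop_Rsqrte32Fx2). */
   static float recip_est  ( float x ) { return 1.0f / x; }
   static float rsqrte_est ( float x ) { return 1.0f / sqrtf(x); }

   /* One lane of Iop_Recps32Fx2 and of Iop_Rsqrts32Fx2. */
   static float recps  ( float a, float b ) { return 2.0f - a * b; }
   static float rsqrts ( float a, float b ) { return (3.0f - a * b) / 2.0f; }

   /* One refinement step each:
      est * (2 - x*est)            converges toward 1/x
      est * (3 - x*est*est) / 2    converges toward 1/sqrt(x)  */
   static float refine_recip ( float x, float est ) { return est * recps(x, est); }
   static float refine_rsqrt ( float x, float est ) { return est * rsqrts(x, est * est); }
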
+
+      /* Unary */
+      Iop_Neg32Fx2, Iop_Abs32Fx2,
+
+
       /* ------------------ 64-bit SIMD Integer. ------------------ */
 
       /* MISC (vector integer cmp != 0) */
@@ -682,54 +725,142 @@
 
       /* ADDITION (normal / unsigned sat / signed sat) */
       Iop_Add8x8,   Iop_Add16x4,   Iop_Add32x2,
-      Iop_QAdd8Ux8, Iop_QAdd16Ux4,
-      Iop_QAdd8Sx8, Iop_QAdd16Sx4,
+      Iop_QAdd8Ux8, Iop_QAdd16Ux4, Iop_QAdd32Ux2, Iop_QAdd64Ux1,
+      Iop_QAdd8Sx8, Iop_QAdd16Sx4, Iop_QAdd32Sx2, Iop_QAdd64Sx1,
+
+      /* PAIRWISE operations */
+      /* Iop_PwFoo16x4( [a,b,c,d], [e,f,g,h] ) =
+            [Foo16(a,b), Foo16(c,d), Foo16(e,f), Foo16(g,h)] */
+      Iop_PwAdd8x8,  Iop_PwAdd16x4,  Iop_PwAdd32x2,
+      Iop_PwMax8Sx8, Iop_PwMax16Sx4, Iop_PwMax32Sx2,
+      Iop_PwMax8Ux8, Iop_PwMax16Ux4, Iop_PwMax32Ux2,
+      Iop_PwMin8Sx8, Iop_PwMin16Sx4, Iop_PwMin32Sx2,
+      Iop_PwMin8Ux8, Iop_PwMin16Ux4, Iop_PwMin32Ux2,
+      /* Longening variant is unary. The resulting vector contains half as
+         many elements as the operand, but each is twice as wide.
+         Example:
+            Iop_PwAddL16Ux4( [a,b,c,d] ) = [a+b,c+d]
+               where a+b and c+d are unsigned 32-bit values. */
+      Iop_PwAddL8Ux8, Iop_PwAddL16Ux4, Iop_PwAddL32Ux2,
+      Iop_PwAddL8Sx8, Iop_PwAddL16Sx4, Iop_PwAddL32Sx2,
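
   A scalar model of the two pairwise-add flavours on an I64 holding four
   16-bit lanes (this assumes the bracketed lane lists above run from lane 0
   upward and that lane 0 sits in the low-order bits):

   #include <stdint.h>

   static uint16_t lane16 ( uint64_t v, int i ) { return (uint16_t)(v >> (16*i)); }

   /* Iop_PwAdd16x4: adjacent pairs of arg1 fill one half of the result,
      adjacent pairs of arg2 the other half. */
   static uint64_t pwadd16x4 ( uint64_t a, uint64_t b )
   {
      uint16_t r0 = (uint16_t)(lane16(a,0) + lane16(a,1));
      uint16_t r1 = (uint16_t)(lane16(a,2) + lane16(a,3));
      uint16_t r2 = (uint16_t)(lane16(b,0) + lane16(b,1));
      uint16_t r3 = (uint16_t)(lane16(b,2) + lane16(b,3));
      return  (uint64_t)r0        | ((uint64_t)r1 << 16)
           | ((uint64_t)r2 << 32) | ((uint64_t)r3 << 48);
   }

   /* Iop_PwAddL16Ux4: unary; each adjacent pair widens to an unsigned
      32-bit sum. */
   static uint64_t pwaddl16ux4 ( uint64_t a )
   {
      uint32_t lo = (uint32_t)lane16(a,0) + (uint32_t)lane16(a,1);
      uint32_t hi = (uint32_t)lane16(a,2) + (uint32_t)lane16(a,3);
      return (uint64_t)lo | ((uint64_t)hi << 32);
   }
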
 
       /* SUBTRACTION (normal / unsigned sat / signed sat) */
       Iop_Sub8x8,   Iop_Sub16x4,   Iop_Sub32x2,
-      Iop_QSub8Ux8, Iop_QSub16Ux4,
-      Iop_QSub8Sx8, Iop_QSub16Sx4,
+      Iop_QSub8Ux8, Iop_QSub16Ux4, Iop_QSub32Ux2, Iop_QSub64Ux1,
+      Iop_QSub8Sx8, Iop_QSub16Sx4, Iop_QSub32Sx2, Iop_QSub64Sx1,
 
-      /* MULTIPLICATION (normal / high half of signed/unsigned) */
-      Iop_Mul16x4, Iop_Mul32x2,
+      /* ABSOLUTE VALUE */
+      Iop_Abs8x8, Iop_Abs16x4, Iop_Abs32x2,
+
+      /* MULTIPLICATION (normal / high half of signed/unsigned / polynomial) */
+      Iop_Mul8x8, Iop_Mul16x4, Iop_Mul32x2,
+      Iop_Mul32Fx2,
       Iop_MulHi16Ux4,
       Iop_MulHi16Sx4,
+      /* Polynomial multiplication treats its arguments as coefficients of
+         polynomials over {0, 1}. */
+      Iop_PolynomialMul8x8,
+
+      /* Vector Saturating Doubling Multiply Returning High Half and
+         Vector Saturating Rounding Doubling Multiply Returning High Half */
+      /* These IROps multiply corresponding elements in two vectors, double
+         the results, and place the most significant half of the final results
+         in the destination vector. The results are truncated or rounded. If
+         any of the results overflow, they are saturated. */
+      Iop_QDMulHi16Sx4, Iop_QDMulHi32Sx2,
+      Iop_QRDMulHi16Sx4, Iop_QRDMulHi32Sx2,
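
   A scalar sketch of one 16-bit lane of the truncating and rounding
   variants, following the description above:

   #include <stdint.h>

   /* One lane of Iop_QDMulHi16Sx4: (2*a*b) >> 16, saturated.  Only
      0x8000 * 0x8000 can overflow the doubled product. */
   static int16_t qdmulhi16 ( int16_t a, int16_t b )
   {
      if (a == INT16_MIN && b == INT16_MIN) return INT16_MAX;   /* saturate */
      int32_t prod2 = 2 * (int32_t)a * (int32_t)b;
      return (int16_t)(prod2 >> 16);                            /* truncating */
   }

   /* Rounding variant (Iop_QRDMulHi16Sx4): add 2^15 before taking the
      high half. */
   static int16_t qrdmulhi16 ( int16_t a, int16_t b )
   {
      if (a == INT16_MIN && b == INT16_MIN) return INT16_MAX;
      int32_t prod2 = 2 * (int32_t)a * (int32_t)b;
      return (int16_t)((prod2 + (1 << 15)) >> 16);
   }
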
 
       /* AVERAGING: note: (arg1 + arg2 + 1) >>u 1 */
       Iop_Avg8Ux8,
       Iop_Avg16Ux4,
 
       /* MIN/MAX */
-      Iop_Max16Sx4,
-      Iop_Max8Ux8,
-      Iop_Min16Sx4,
-      Iop_Min8Ux8,
+      Iop_Max8Sx8, Iop_Max16Sx4, Iop_Max32Sx2,
+      Iop_Max8Ux8, Iop_Max16Ux4, Iop_Max32Ux2,
+      Iop_Min8Sx8, Iop_Min16Sx4, Iop_Min32Sx2,
+      Iop_Min8Ux8, Iop_Min16Ux4, Iop_Min32Ux2,
 
       /* COMPARISON */
       Iop_CmpEQ8x8,  Iop_CmpEQ16x4,  Iop_CmpEQ32x2,
+      Iop_CmpGT8Ux8, Iop_CmpGT16Ux4, Iop_CmpGT32Ux2,
       Iop_CmpGT8Sx8, Iop_CmpGT16Sx4, Iop_CmpGT32Sx2,
 
+      /* COUNT ones / leading zeroes / leading sign bits (not including topmost
+         bit) */
+      Iop_Cnt8x8,
+      Iop_Clz8Sx8, Iop_Clz16Sx4, Iop_Clz32Sx2,
+      Iop_Cls8Sx8, Iop_Cls16Sx4, Iop_Cls32Sx2,
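
   A per-lane scalar reference for the 8-bit count ops (Clz counts leading
   zeroes; Cls counts bits below the sign bit that equal it, excluding the
   sign bit itself):

   #include <stdint.h>

   /* One lane of Iop_Clz8Sx8: leading zero bits. */
   static uint8_t clz8 ( uint8_t x )
   {
      uint8_t n = 0;
      for (int i = 7; i >= 0; i--) {
         if ((x >> i) & 1) break;
         n++;
      }
      return n;
   }

   /* One lane of Iop_Cls8Sx8: consecutive bits below the sign bit that
      match the sign bit (the sign bit is not counted). */
   static uint8_t cls8 ( int8_t x )
   {
      uint8_t u = (uint8_t)x, sign = (u >> 7) & 1, n = 0;
      for (int i = 6; i >= 0; i--) {
         if (((u >> i) & 1) != sign) break;
         n++;
      }
      return n;
   }
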
+
+      /* VECTOR x VECTOR SHIFT / ROTATE */
+      Iop_Shl8x8, Iop_Shl16x4, Iop_Shl32x2,
+      Iop_Shr8x8, Iop_Shr16x4, Iop_Shr32x2,
+      Iop_Sar8x8, Iop_Sar16x4, Iop_Sar32x2,
+      Iop_Sal8x8, Iop_Sal16x4, Iop_Sal32x2, Iop_Sal64x1,
+
       /* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */
       Iop_ShlN8x8, Iop_ShlN16x4, Iop_ShlN32x2,
-                   Iop_ShrN16x4, Iop_ShrN32x2,
+      Iop_ShrN8x8, Iop_ShrN16x4, Iop_ShrN32x2,
       Iop_SarN8x8, Iop_SarN16x4, Iop_SarN32x2,
 
+      /* VECTOR x VECTOR SATURATING SHIFT */
+      Iop_QShl8x8, Iop_QShl16x4, Iop_QShl32x2, Iop_QShl64x1,
+      Iop_QSal8x8, Iop_QSal16x4, Iop_QSal32x2, Iop_QSal64x1,
+      /* VECTOR x INTEGER SATURATING SHIFT */
+      Iop_QShlN8Sx8, Iop_QShlN16Sx4, Iop_QShlN32Sx2, Iop_QShlN64Sx1,
+      Iop_QShlN8x8, Iop_QShlN16x4, Iop_QShlN32x2, Iop_QShlN64x1,
+      Iop_QSalN8x8, Iop_QSalN16x4, Iop_QSalN32x2, Iop_QSalN64x1,
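
   What "saturating" means for these left shifts: bits that would be shifted
   out clamp the result instead of being lost.  A scalar sketch of one
   unsigned 16-bit lane (which of the variants above use signed vs. unsigned
   saturation is not spelled out here, so only the unsigned flavour is shown):

   #include <stdint.h>

   /* Unsigned saturating left shift of one 16-bit lane by n bits: if any set
      bit would be shifted out, the result clamps to 0xFFFF. */
   static uint16_t qshl16u ( uint16_t x, uint8_t n )
   {
      if (n >= 16) return x ? UINT16_MAX : 0;
      uint32_t wide = (uint32_t)x << n;
      return wide > UINT16_MAX ? UINT16_MAX : (uint16_t)wide;
   }
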
+
       /* NARROWING -- narrow 2xI64 into 1xI64, hi half from left arg */
       Iop_QNarrow16Ux4,
       Iop_QNarrow16Sx4,
       Iop_QNarrow32Sx2,
 
-      /* INTERLEAVING -- interleave lanes from low or high halves of
+      /* INTERLEAVING */
+      /* Interleave lanes from low or high halves of
          operands.  Most-significant result lane is from the left
          arg. */
       Iop_InterleaveHI8x8, Iop_InterleaveHI16x4, Iop_InterleaveHI32x2,
       Iop_InterleaveLO8x8, Iop_InterleaveLO16x4, Iop_InterleaveLO32x2,
+      /* Interleave odd/even lanes of operands.  Most-significant result lane
+         is from the left arg.  Note that Interleave{Odd,Even}Lanes32x2 are
+         identical to Interleave{HI,LO}32x2 and so are omitted. */
+      Iop_InterleaveOddLanes8x8, Iop_InterleaveEvenLanes8x8,
+      Iop_InterleaveOddLanes16x4, Iop_InterleaveEvenLanes16x4,
+
 
       /* CONCATENATION -- build a new value by concatenating either
          the even or odd lanes of both operands.  Note that
          Cat{Odd,Even}Lanes32x2 are identical to Interleave{HI,LO}32x2
          and so are omitted. */
-      Iop_CatOddLanes16x4, Iop_CatEvenLanes16x4,
+      Iop_CatOddLanes8x8, Iop_CatOddLanes16x4,
+      Iop_CatEvenLanes8x8, Iop_CatEvenLanes16x4,
+
+      /* GET / SET elements of VECTOR
+         GET is binop (I64, I8) -> I<elem_size>
+         SET is triop (I64, I8, I<elem_size>) -> I64 */
+      /* Note: the arm back-end handles only a constant second argument. */
+      Iop_GetElem8x8, Iop_GetElem16x4, Iop_GetElem32x2,
+      Iop_SetElem8x8, Iop_SetElem16x4, Iop_SetElem32x2,
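
   A scalar sketch of the 16x4 lane accessors, assuming lane 0 lives in the
   low-order bits of the I64 (a convention, not stated above):

   #include <stdint.h>

   /* Iop_GetElem16x4: read 16-bit lane 'ix' (0..3) of an I64 value. */
   static uint16_t getelem16x4 ( uint64_t v, uint8_t ix )
   {
      return (uint16_t)(v >> (16 * (ix & 3)));
   }

   /* Iop_SetElem16x4: replace 16-bit lane 'ix' of an I64 value. */
   static uint64_t setelem16x4 ( uint64_t v, uint8_t ix, uint16_t x )
   {
      int sh = 16 * (ix & 3);
      return (v & ~((uint64_t)0xFFFF << sh)) | ((uint64_t)x << sh);
   }
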
+
+      /* DUPLICATING -- copy value to all lanes */
+      Iop_Dup8x8,   Iop_Dup16x4,   Iop_Dup32x2,
+
+      /* EXTRACT -- copy the 8-arg3 highest bytes of arg1 to the 8-arg3 lowest
+         bytes of the result, and the arg3 lowest bytes of arg2 to the arg3
+         highest bytes of the result.
+         It is a triop: (I64, I64, I8) -> I64 */
+      /* Note: the arm back-end handles only a constant third argument. */
+      Iop_Extract64,
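
   In shift terms, the description above amounts to the following sketch
   (the guards just avoid undefined 64-bit shifts; whether n can reach 8 is
   an assumption):

   #include <stdint.h>

   /* Iop_Extract64(arg1, arg2, n): high 8-n bytes of arg1 become the low
      bytes of the result, low n bytes of arg2 become the high bytes. */
   static uint64_t extract64 ( uint64_t arg1, uint64_t arg2, uint8_t n )
   {
      if (n == 0) return arg1;
      if (n >= 8) return arg2;
      return (arg1 >> (8*n)) | (arg2 << (8*(8-n)));
   }
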
+
+      /* REVERSE the order of elements within each halfword, word or
+         doubleword of the operand */
+      /* Examples:
+            Reverse16_8x8([a,b,c,d,e,f,g,h]) = [b,a,d,c,f,e,h,g]
+            Reverse32_8x8([a,b,c,d,e,f,g,h]) = [d,c,b,a,h,g,f,e]
+            Reverse64_8x8([a,b,c,d,e,f,g,h]) = [h,g,f,e,d,c,b,a] */
+      Iop_Reverse16_8x8,
+      Iop_Reverse32_8x8, Iop_Reverse32_16x4,
+      Iop_Reverse64_8x8, Iop_Reverse64_16x4, Iop_Reverse64_32x2,
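
   In bit-level terms these are byte (or halfword) swaps within each group;
   for instance Reverse32_8x8 is a bswap32 applied to both 32-bit halves of
   the I64, as in this sketch:

   #include <stdint.h>

   /* Iop_Reverse32_8x8: reverse byte order within each 32-bit word. */
   static uint64_t reverse32_8x8 ( uint64_t v )
   {
      uint64_t r = 0;
      for (int w = 0; w < 2; w++) {              /* two 32-bit words */
         uint32_t word = (uint32_t)(v >> (32*w));
         uint32_t swapped =  (word >> 24)
                          | ((word >> 8)  & 0x0000FF00u)
                          | ((word << 8)  & 0x00FF0000u)
                          |  (word << 24);
         r |= (uint64_t)swapped << (32*w);
      }
      return r;
   }
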
 
       /* PERMUTING -- copy src bytes to dst,
          as indexed by control vector bytes:
@@ -738,6 +869,10 @@
          is undefined. */
       Iop_Perm8x8,
 
+      /* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate.
+         See floating-point equivalents for details. */
+      Iop_Recip32x2, Iop_Rsqrte32x2,
+
       /* ------------------ 128-bit SIMD FP. ------------------ */
 
       /* --- 32x4 vector FP --- */
@@ -745,22 +880,59 @@
       /* binary */
       Iop_Add32Fx4, Iop_Sub32Fx4, Iop_Mul32Fx4, Iop_Div32Fx4, 
       Iop_Max32Fx4, Iop_Min32Fx4,
-      /* Note: For the following compares, the ppc front-end assumes a
+      Iop_Add32Fx2, Iop_Sub32Fx2,
+      /* Note: For the following compares, the ppc and arm front-ends assume a
          nan in a lane of either argument returns zero for that lane. */
-      Iop_CmpEQ32Fx4, Iop_CmpLT32Fx4, Iop_CmpLE32Fx4, Iop_CmpUN32Fx4, 
+      Iop_CmpEQ32Fx4, Iop_CmpLT32Fx4, Iop_CmpLE32Fx4, Iop_CmpUN32Fx4,
       Iop_CmpGT32Fx4, Iop_CmpGE32Fx4,
 
+      /* Vector Absolute */
+      Iop_Abs32Fx4,
+
+      /* Pairwise Max and Min. See integer pairwise operations for details. */
+      Iop_PwMax32Fx4, Iop_PwMin32Fx4,
+
       /* unary */
-      Iop_Recip32Fx4, Iop_Sqrt32Fx4, Iop_RSqrt32Fx4,
+      Iop_Sqrt32Fx4, Iop_RSqrt32Fx4,
+      Iop_Neg32Fx4,
+
+      /* Vector Reciprocal Estimate finds an approximate reciprocal of each
+      element in the operand vector, and places the results in the destination
+      vector.  */
+      Iop_Recip32Fx4,
+
+      /* Vector Reciprocal Step computes (2.0 - arg1 * arg2).
+         Note that if one of the arguments is zero and the other is infinity
+         of arbitrary sign, the result of the operation is 2.0. */
+      Iop_Recps32Fx4,
+
+      /* Vector Reciprocal Square Root Estimate finds an approximate reciprocal
+         square root of each element in the operand vector. */
+      Iop_Rsqrte32Fx4,
+
+      /* Vector Reciprocal Square Root Step computes (3.0 - arg1 * arg2) / 2.0.
+         Note that if one of the arguments is zero and the other is infinity
+         of arbitrary sign, the result of the operation is 1.5. */
+      Iop_Rsqrts32Fx4,
+
 
       /* --- Int to/from FP conversion --- */
       /* Unlike the standard fp conversions, these irops take no
          rounding mode argument. Instead the irop trailers _R{M,P,N,Z}
          indicate the mode: {-inf, +inf, nearest, zero} respectively. */
-      Iop_I32UtoFx4,     Iop_I32StoFx4,       /* I32x4 -> F32x4       */
-      Iop_QFtoI32Ux4_RZ, Iop_QFtoI32Sx4_RZ,   /* F32x4 -> I32x4       */
+      Iop_I32UtoFx4,  Iop_I32StoFx4,       /* I32x4 -> F32x4       */
+      Iop_FtoI32Ux4_RZ,  Iop_FtoI32Sx4_RZ,    /* F32x4 -> I32x4       */
+      Iop_QFtoI32Ux4_RZ, Iop_QFtoI32Sx4_RZ,   /* F32x4 -> I32x4 (with saturation) */
       Iop_RoundF32x4_RM, Iop_RoundF32x4_RP,   /* round to fp integer  */
       Iop_RoundF32x4_RN, Iop_RoundF32x4_RZ,   /* round to fp integer  */
+      /* Fixed32 format is a floating-point number with a fixed number of
+         fraction bits. The number of fraction bits is passed as a second
+         argument of type I8. */
+      Iop_F32ToFixed32Ux4_RZ, Iop_F32ToFixed32Sx4_RZ, /* fp -> fixed-point */
+      Iop_Fixed32UToF32x4_RN, Iop_Fixed32SToF32x4_RN, /* fixed-point -> fp */
+
+      /* --- Single to/from half conversion --- */
+      Iop_F32toF16x4, Iop_F16toF32x4,         /* F32x4 <-> F16x4      */
 
       /* --- 32x4 lowest-lane-only scalar FP --- */
 
@@ -826,22 +998,56 @@
       Iop_CmpNEZ8x16, Iop_CmpNEZ16x8, Iop_CmpNEZ32x4, Iop_CmpNEZ64x2,
 
       /* ADDITION (normal / unsigned sat / signed sat) */
-      Iop_Add8x16,   Iop_Add16x8,   Iop_Add32x4,  Iop_Add64x2,
-      Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4,
-      Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4,
+      Iop_Add8x16,   Iop_Add16x8,   Iop_Add32x4,   Iop_Add64x2,
+      Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2,
+      Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2,
 
       /* SUBTRACTION (normal / unsigned sat / signed sat) */
-      Iop_Sub8x16,   Iop_Sub16x8,   Iop_Sub32x4,  Iop_Sub64x2,
-      Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4,
-      Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4,
+      Iop_Sub8x16,   Iop_Sub16x8,   Iop_Sub32x4,   Iop_Sub64x2,
+      Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2,
+      Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2,
 
       /* MULTIPLICATION (normal / high half of signed/unsigned) */
-      Iop_Mul16x8,    Iop_Mul32x4,
-      Iop_MulHi16Ux8, Iop_MulHi32Ux4,
-      Iop_MulHi16Sx8, Iop_MulHi32Sx4,
+      Iop_Mul8x16,  Iop_Mul16x8,    Iop_Mul32x4,
+                    Iop_MulHi16Ux8, Iop_MulHi32Ux4,
+                    Iop_MulHi16Sx8, Iop_MulHi32Sx4,
       /* (widening signed/unsigned of even lanes, with lowest lane=zero) */
       Iop_MullEven8Ux16, Iop_MullEven16Ux8,
       Iop_MullEven8Sx16, Iop_MullEven16Sx8,
+      /* FIXME: document these */
+      Iop_Mull8Ux8, Iop_Mull8Sx8,
+      Iop_Mull16Ux4, Iop_Mull16Sx4,
+      Iop_Mull32Ux2, Iop_Mull32Sx2,
+      /* Vector Saturating Doubling Multiply Returning High Half and
+         Vector Saturating Rounding Doubling Multiply Returning High Half */
+      /* These IROps multiply corresponding elements in two vectors, double
+         the results, and place the most significant half of the final results
+         in the destination vector. The results are truncated or rounded. If
+         any of the results overflow, they are saturated. */
+      Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4,
+      Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4,
+      /* Doubling saturating multiplication (long) (I64, I64) -> V128 */
+      Iop_QDMulLong16Sx4, Iop_QDMulLong32Sx2,
+      /* Polynomial multiplication treats its arguments as coefficients of
+         polynomials over {0, 1}. */
+      Iop_PolynomialMul8x16, /* (V128, V128) -> V128 */
+      Iop_PolynomialMull8x8, /*   (I64, I64) -> V128 */
+
+      /* PAIRWISE operations */
+      /* Iop_PwFoo16x4( [a,b,c,d], [e,f,g,h] ) =
+            [Foo16(a,b), Foo16(c,d), Foo16(e,f), Foo16(g,h)] */
+      Iop_PwAdd8x16, Iop_PwAdd16x8, Iop_PwAdd32x4,
+      Iop_PwAdd32Fx2,
+      /* Longening variant is unary. The resulting vector contains half as
+         many elements as the operand, but each is twice as wide.
+         Example:
+            Iop_PwAddL16Ux4( [a,b,c,d] ) = [a+b,c+d]
+               where a+b and c+d are unsigned 32-bit values. */
+      Iop_PwAddL8Ux16, Iop_PwAddL16Ux8, Iop_PwAddL32Ux4,
+      Iop_PwAddL8Sx16, Iop_PwAddL16Sx8, Iop_PwAddL32Sx4,
+
+      /* ABSOLUTE VALUE */
+      Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4,
 
       /* AVERAGING: note: (arg1 + arg2 + 1) >>u 1 */
       Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4,
@@ -858,40 +1064,110 @@
       Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2,
       Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4,
 
+      /* COUNT ones / leading zeroes / leading sign bits (not including topmost
+         bit) */
+      Iop_Cnt8x16,
+      Iop_Clz8Sx16, Iop_Clz16Sx8, Iop_Clz32Sx4,
+      Iop_Cls8Sx16, Iop_Cls16Sx8, Iop_Cls32Sx4,
+
       /* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */
       Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2,
       Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2,
-      Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4,
+      Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2,
 
       /* VECTOR x VECTOR SHIFT / ROTATE */
-      Iop_Shl8x16, Iop_Shl16x8, Iop_Shl32x4,
-      Iop_Shr8x16, Iop_Shr16x8, Iop_Shr32x4,
-      Iop_Sar8x16, Iop_Sar16x8, Iop_Sar32x4,
+      Iop_Shl8x16, Iop_Shl16x8, Iop_Shl32x4, Iop_Shl64x2,
+      Iop_Shr8x16, Iop_Shr16x8, Iop_Shr32x4, Iop_Shr64x2,
+      Iop_Sar8x16, Iop_Sar16x8, Iop_Sar32x4, Iop_Sar64x2,
+      Iop_Sal8x16, Iop_Sal16x8, Iop_Sal32x4, Iop_Sal64x2,
       Iop_Rol8x16, Iop_Rol16x8, Iop_Rol32x4,
 
+      /* VECTOR x VECTOR SATURATING SHIFT */
+      Iop_QShl8x16, Iop_QShl16x8, Iop_QShl32x4, Iop_QShl64x2,
+      Iop_QSal8x16, Iop_QSal16x8, Iop_QSal32x4, Iop_QSal64x2,
+      /* VECTOR x INTEGER SATURATING SHIFT */
+      Iop_QShlN8Sx16, Iop_QShlN16Sx8, Iop_QShlN32Sx4, Iop_QShlN64Sx2,
+      Iop_QShlN8x16, Iop_QShlN16x8, Iop_QShlN32x4, Iop_QShlN64x2,
+      Iop_QSalN8x16, Iop_QSalN16x8, Iop_QSalN32x4, Iop_QSalN64x2,
+
       /* NARROWING -- narrow 2xV128 into 1xV128, hi half from left arg */
       /* Note: the 16{U,S} and 32{U,S} are the pre-narrow lane widths. */
       Iop_QNarrow16Ux8, Iop_QNarrow32Ux4,
       Iop_QNarrow16Sx8, Iop_QNarrow32Sx4,
       Iop_Narrow16x8, Iop_Narrow32x4,
+      /* Shortening V128->I64, lo half from each element */
+      Iop_Shorten16x8, Iop_Shorten32x4, Iop_Shorten64x2,
+      /* Saturating shortening from signed source to signed/unsigned destination */
+      Iop_QShortenS16Sx8, Iop_QShortenS32Sx4, Iop_QShortenS64Sx2,
+      Iop_QShortenU16Sx8, Iop_QShortenU32Sx4, Iop_QShortenU64Sx2,
+      /* Saturating shortening from unsigned source to unsigned destination */
+      Iop_QShortenU16Ux8, Iop_QShortenU32Ux4, Iop_QShortenU64Ux2,
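
   A scalar reference for one lane of the plain and the signed-to-unsigned
   saturating shortenings (the lane widths follow the pre-narrow naming
   convention noted above):

   #include <stdint.h>

   /* One lane of Iop_Shorten32x4: keep the low half. */
   static uint16_t shorten_from_32 ( uint32_t x ) { return (uint16_t)x; }

   /* One lane of Iop_QShortenU32Sx4: signed 32-bit source narrowed to an
      unsigned 16-bit result, saturating at both ends. */
   static uint16_t qshorten_u16_from_s32 ( int32_t x )
   {
      if (x < 0)      return 0;
      if (x > 0xFFFF) return 0xFFFF;
      return (uint16_t)x;
   }
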
 
-      /* INTERLEAVING -- interleave lanes from low or high halves of
+      /* WIDENING */
+      /* Longening --- sign- or zero-extends each element of the argument
+         vector to twice its original size. The resulting vector has the
+         same number of elements, but each element (and the vector itself)
+         is twice as wide.
+         All operations are I64->V128.
+         Example
+            Iop_Longen32Sx2( [a, b] ) = [c, d]
+               where c = Iop_32Sto64(a) and d = Iop_32Sto64(b) */
+      Iop_Longen8Ux8, Iop_Longen16Ux4, Iop_Longen32Ux2,
+      Iop_Longen8Sx8, Iop_Longen16Sx4, Iop_Longen32Sx2,
+
+      /* INTERLEAVING */
+      /* Interleave lanes from low or high halves of
          operands.  Most-significant result lane is from the left
          arg. */
       Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
       Iop_InterleaveHI32x4, Iop_InterleaveHI64x2,
-      Iop_InterleaveLO8x16, Iop_InterleaveLO16x8, 
+      Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
       Iop_InterleaveLO32x4, Iop_InterleaveLO64x2,
+      /* Interleave odd/even lanes of operands.  Most-significant result lane
+         is from the left arg. */
+      Iop_InterleaveOddLanes8x16, Iop_InterleaveEvenLanes8x16,
+      Iop_InterleaveOddLanes16x8, Iop_InterleaveEvenLanes16x8,
+      Iop_InterleaveOddLanes32x4, Iop_InterleaveEvenLanes32x4,
+
+      /* CONCATENATION -- build a new value by concatenating either
+         the even or odd lanes of both operands. */
+      Iop_CatOddLanes8x16, Iop_CatOddLanes16x8, Iop_CatOddLanes32x4,
+      Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8, Iop_CatEvenLanes32x4,
+
+      /* GET elements of VECTOR
+         GET is binop (V128, I8) -> I<elem_size> */
+      /* Note: the arm back-end handles only a constant second argument. */
+      Iop_GetElem8x16, Iop_GetElem16x8, Iop_GetElem32x4, Iop_GetElem64x2,
 
       /* DUPLICATING -- copy value to all lanes */
-      Iop_Dup8x16, Iop_Dup16x8, Iop_Dup32x4,
+      Iop_Dup8x16,   Iop_Dup16x8,   Iop_Dup32x4,
+
+      /* EXTRACT -- copy the 16-arg3 highest bytes of arg1 to the 16-arg3
+         lowest bytes of the result, and the arg3 lowest bytes of arg2 to the
+         arg3 highest bytes of the result.
+         It is a triop: (V128, V128, I8) -> V128 */
+      /* Note: the ARM back end handles only constant arg3 in this operation. */
+      Iop_ExtractV128,
+
+      /* REVERSE the order of elements within each halfword, word or
+         doubleword of the operand */
+      /* Examples:
+            Reverse32_16x8([a,b,c,d,e,f,g,h]) = [b,a,d,c,f,e,h,g]
+            Reverse64_16x8([a,b,c,d,e,f,g,h]) = [d,c,b,a,h,g,f,e] */
+      Iop_Reverse16_8x16,
+      Iop_Reverse32_8x16, Iop_Reverse32_16x8,
+      Iop_Reverse64_8x16, Iop_Reverse64_16x8, Iop_Reverse64_32x4,
 
       /* PERMUTING -- copy src bytes to dst,
          as indexed by control vector bytes:
             for i in 0 .. 15 . result[i] = argL[ argR[i] ] 
          argR[i] values may only be in the range 0 .. 15, else behaviour
          is undefined. */
-      Iop_Perm8x16
+      Iop_Perm8x16,
+
+      /* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate.
+         See floating-point equivalents for details. */
+      Iop_Recip32x4, Iop_Rsqrte32x4
    }
    IROp;
 
@@ -1178,6 +1454,8 @@
                                 IRExpr*, IRExpr* );
 extern IRExpr** mkIRExprVec_7 ( IRExpr*, IRExpr*, IRExpr*, IRExpr*,
                                 IRExpr*, IRExpr*, IRExpr* );
+extern IRExpr** mkIRExprVec_8 ( IRExpr*, IRExpr*, IRExpr*, IRExpr*,
+                                IRExpr*, IRExpr*, IRExpr*, IRExpr*);
 
 /* IRExpr copiers:
    - shallowCopy: shallow-copy (ie. create a new vector that shares the
diff --git a/test_main.c b/test_main.c
index dad1270..2fc41a6 100644
--- a/test_main.c
+++ b/test_main.c
@@ -1647,13 +1647,21 @@
          return binary16Ix8(mce, vatom1, vatom2);
 
       case Iop_Sub32x4:
+      case Iop_QSub32Sx4:
+      case Iop_QSub32Ux4:
       case Iop_CmpGT32Sx4:
       case Iop_CmpEQ32x4:
       case Iop_Add32x4:
+      case Iop_QAdd32Ux4:
+      case Iop_QAdd32Sx4:
          return binary32Ix4(mce, vatom1, vatom2);
 
       case Iop_Sub64x2:
+      case Iop_QSub64Ux2:
+      case Iop_QSub64Sx2:
       case Iop_Add64x2:
+      case Iop_QAdd64Ux2:
+      case Iop_QAdd64Sx2:
          return binary64Ix2(mce, vatom1, vatom2);
 
       case Iop_QNarrow32Sx4: