Add memcheck instrumentation for the 64-bit SIMD primops.

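The new helpers (mkPCast8x8/16x4/32x2, binary8Ix8/16Ix4/32Ix2, vectorNarrow64)
mirror the existing 128-bit scheme: for a lane-wise binary op, UifU the two
shadow operands and then pessimise the result per lane with the matching
CmpNEZ op, so any undefined input bit marks its whole output lane undefined.
Below is a minimal standalone model of that rule, using plain uint64_t V-bit
words instead of VEX IR (not part of the patch; the names are illustrative).
V-bit convention: 1 = undefined, 0 = defined.

#include <stdint.h>
#include <stdio.h>

/* Models Iop_CmpNEZ16x4 on V bits: if any bit in a 16-bit lane is
   undefined, the whole corresponding result lane becomes undefined. */
static uint64_t pcast16x4 ( uint64_t vbits )
{
   uint64_t out = 0;
   for (int lane = 0; lane < 4; lane++) {
      uint64_t mask = 0xFFFFULL << (16 * lane);
      if (vbits & mask)
         out |= mask;
   }
   return out;
}

/* Models binary16Ix4: UifU64 (bitwise OR) of the operand shadows,
   followed by the per-lane pessimising cast. */
static uint64_t shadow_binary16x4 ( uint64_t va, uint64_t vb )
{
   return pcast16x4(va | vb);
}

int main ( void )
{
   uint64_t va = 0x0000000100000000ULL;   /* one undefined bit, lane 2 */
   uint64_t vb = 0x0000000000000000ULL;   /* fully defined */
   printf("%016llx\n", (unsigned long long)shadow_binary16x4(va, vb));
   /* prints 0000ffff00000000: lane 2 wholly undefined, rest defined */
   return 0;
}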

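The saturating narrows (vectorNarrow64) need one extra step: each operand's
shadow is pessimised at the wide lane size first, and the real narrowing op
is then applied to the two pessimised shadows.  A pessimised wide lane is
all-zeroes or all-ones, and saturation maps those to an all-zero or all-one
narrow lane, so every result lane comes out wholly defined or wholly
undefined.  Continuing the model above (the operand-to-result-lane ordering
assumed here is illustrative only and does not affect the definedness
argument):

/* Model of a signed saturating narrow, Iop_QNarrow16Sx4-style: each
   16-bit lane is clamped to [-128,127] and packed into an 8-bit lane. */
static uint64_t qnarrow16Sx4_model ( uint64_t a, uint64_t b )
{
   uint64_t out = 0;
   for (int lane = 0; lane < 8; lane++) {
      uint64_t src = (lane < 4) ? a : b;   /* assumed lane ordering */
      int16_t  w   = (int16_t)(src >> (16 * (lane & 3)));
      int8_t   n   = w > 127 ? 127 : (w < -128 ? -128 : (int8_t)w);
      out |= ((uint64_t)(uint8_t)n) << (8 * lane);
   }
   return out;
}

/* Models the vectorNarrow64 shadow rule: pessimise both operand shadows
   at the wide lane size, then narrow the pessimised shadows.  0x0000
   lanes saturate to 0x00 and 0xFFFF lanes to 0xFF, giving wholly
   defined or wholly undefined result lanes. */
static uint64_t shadow_qnarrow16Sx4 ( uint64_t va, uint64_t vb )
{
   return qnarrow16Sx4_model(pcast16x4(va), pcast16x4(vb));
}
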
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@3223 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 5162324..e9486e2 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1013,6 +1013,21 @@
    return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
 }
 
+static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
+{
+   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
+}
+
+static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
+{
+   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
+}
+
+static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
+{
+   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
+}
+
 
 /* Here's a simple scheme capable of handling ops derived from SSE1
    code and while only generating ops that can be efficiently
@@ -1188,10 +1203,33 @@
    return at3;
 }
 
+static
+IRAtom* vectorNarrow64 ( MCEnv* mce, IROp narrow_op, 
+                         IRAtom* vatom1, IRAtom* vatom2)
+{
+   IRAtom *at1, *at2, *at3;
+   IRAtom* (*pcast)( MCEnv*, IRAtom* );
+   switch (narrow_op) {
+      case Iop_QNarrow32Sx2: pcast = mkPCast32x2; break;
+      case Iop_QNarrow16Sx4: pcast = mkPCast16x4; break;
+      case Iop_QNarrow16Ux4: pcast = mkPCast16x4; break;
+      default: VG_(tool_panic)("vectorNarrow64");
+   }
+   tl_assert(isShadowAtom(mce,vatom1));
+   tl_assert(isShadowAtom(mce,vatom2));
+   at1 = assignNew(mce, Ity_I64, pcast(mce, vatom1));
+   at2 = assignNew(mce, Ity_I64, pcast(mce, vatom2));
+   at3 = assignNew(mce, Ity_I64, binop(narrow_op, at1, at2));
+   return at3;
+}
+
 
 /* --- --- Vector integer arithmetic --- --- */
 
 /* Simple ... UifU the args and per-lane pessimise the results. */
+
+/* --- 128-bit versions --- */
+
 static
 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
 {
@@ -1228,6 +1266,35 @@
    return at;   
 }
 
+/* --- 64-bit versions --- */
+
+static
+IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
+{
+   IRAtom* at;
+   at = mkUifU64(mce, vatom1, vatom2);
+   at = mkPCast8x8(mce, at);
+   return at;   
+}
+
+static
+IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
+{
+   IRAtom* at;
+   at = mkUifU64(mce, vatom1, vatom2);
+   at = mkPCast16x4(mce, at);
+   return at;   
+}
+
+static
+IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
+{
+   IRAtom* at;
+   at = mkUifU64(mce, vatom1, vatom2);
+   at = mkPCast32x2(mce, at);
+   return at;   
+}
+
 
 /*------------------------------------------------------------*/
 /*--- Generate shadow values from all kinds of IRExprs.    ---*/
@@ -1254,7 +1321,68 @@
    tl_assert(sameKindedAtoms(atom2,vatom2));
    switch (op) {
 
-      /* 128-bit SIMD (SSE2-esque) */
+      /* 64-bit SIMD */
+
+      case Iop_ShrN16x4:
+      case Iop_ShrN32x2:
+      case Iop_SarN16x4:
+      case Iop_SarN32x2:
+      case Iop_ShlN16x4:
+      case Iop_ShlN32x2:
+         /* Same scheme as with all other shifts. */
+         complainIfUndefined(mce, atom2);
+         return assignNew(mce, Ity_I64, binop(op, vatom1, atom2));
+
+      case Iop_QNarrow32Sx2:
+      case Iop_QNarrow16Sx4:
+      case Iop_QNarrow16Ux4:
+         return vectorNarrow64(mce, op, vatom1, vatom2);
+
+      case Iop_Min8Ux8:
+      case Iop_Max8Ux8:
+      case Iop_Avg8Ux8:
+      case Iop_QSub8Sx8:
+      case Iop_QSub8Ux8:
+      case Iop_Sub8x8:
+      case Iop_CmpGT8Sx8:
+      case Iop_CmpEQ8x8:
+      case Iop_QAdd8Sx8:
+      case Iop_QAdd8Ux8:
+      case Iop_Add8x8:
+         return binary8Ix8(mce, vatom1, vatom2);
+
+      case Iop_Min16Sx4:
+      case Iop_Max16Sx4:
+      case Iop_Avg16Ux4:
+      case Iop_QSub16Ux4:
+      case Iop_QSub16Sx4:
+      case Iop_Sub16x4:
+      case Iop_Mul16x4:
+      case Iop_MulHi16Sx4:
+      case Iop_MulHi16Ux4:
+      case Iop_CmpGT16Sx4:
+      case Iop_CmpEQ16x4:
+      case Iop_QAdd16Sx4:
+      case Iop_QAdd16Ux4:
+      case Iop_Add16x4:
+         return binary16Ix4(mce, vatom1, vatom2);
+
+      case Iop_Sub32x2:
+      case Iop_CmpGT32Sx2:
+      case Iop_CmpEQ32x2:
+      case Iop_Add32x2:
+         return binary32Ix2(mce, vatom1, vatom2);
+
+      /* 64-bit data-steering */
+      case Iop_InterleaveLO32x2:
+      case Iop_InterleaveLO16x4:
+      case Iop_InterleaveLO8x8:
+      case Iop_InterleaveHI32x2:
+      case Iop_InterleaveHI16x4:
+      case Iop_InterleaveHI8x8:
+         return assignNew(mce, Ity_I64, binop(op, vatom1, vatom2));
+
+      /* 128-bit SIMD */
 
       case Iop_ShrN16x8:
       case Iop_ShrN32x4:
@@ -1334,8 +1462,6 @@
       case Iop_Add64F0x2:
          return binary64F0x2(mce, vatom1, vatom2);      
 
-      /* 128-bit SIMD (SSE1-esque) */
-
       case Iop_Sub32Fx4:
       case Iop_Mul32Fx4:
       case Iop_Min32Fx4: