Handle IR primops arising from running Altivec code.  Altivec appears
to be stronger than SSE in the vector integer area, but weaker in the
vector FP area.
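
For the new per-lane variable shifts and rotates (Iop_Shl8x16, Iop_Shr8x16,
Iop_Sar8x16, Iop_Rotl8x16 and the 16x8/32x4 variants), the result's shadow is
the original shift applied to the data operand's shadow, UifU'd with a
pessimistic cast of the shift-amount operand's shadow.  As a rough standalone
model of that scheme for a single 16-bit lane (plain C; the helper names
pcast16 / uifu16 / shadow_Shl16 are invented here for illustration and are not
part of Memcheck):

   #include <stdint.h>
   #include <stdio.h>

   /* Pessimistic cast of a 16-bit lane's V bits: if any bit of the
      lane is undefined (1), the whole lane becomes undefined. */
   static uint16_t pcast16 ( uint16_t vbits ) {
      return vbits == 0 ? 0x0000 : 0xFFFF;
   }

   /* UifU: undefined-if-either-undefined, which for V bits is OR. */
   static uint16_t uifu16 ( uint16_t va, uint16_t vb ) {
      return va | vb;
   }

   /* Shadow of one lane of Iop_Shl16x8 under the scheme used in the
      patch: shift the data shadow by the actual per-lane amount, then
      fold in a pessimistic cast of the amount's shadow, so any
      undefined bit in the amount poisons the whole result lane. */
   static uint16_t shadow_Shl16 ( uint16_t vdata, uint8_t amt, uint16_t vamt ) {
      return uifu16( (uint16_t)(vdata << amt), pcast16(vamt) );
   }

   int main ( void ) {
      /* Undefined bit 0 in the data, defined amount: only the shifted
         copy of that bit is undefined in the result. */
      printf("%04x\n", shadow_Shl16(0x0001, 4, 0x0000));   /* 0010 */
      /* Defined data, one undefined bit in the amount: the whole
         result lane is flagged undefined. */
      printf("%04x\n", shadow_Shl16(0x0000, 4, 0x0001));   /* ffff */
      return 0;
   }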



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5073 a5019735-40e9-0310-863c-91ae7b9d1cf9
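
Likewise, the Iop_MullEven16{S,U}x8 cases compute the shadow per 16-bit lane
(binary16Ix8) and then shift each 32-bit lane left and arithmetically right
by 16, smearing the lower half's all-0s/all-1s shadow across the whole result
lane.  A rough standalone model of that smearing for one result lane (plain C;
shadow_MullEven16 is an invented name used only for this sketch):

   #include <stdint.h>
   #include <stdio.h>

   /* Shadow of one 32-bit result lane of Iop_MullEven16Ux8/Sx8 under
      the scheme in the patch: pessimistically cast the UifU of the
      two 16-bit operand shadows (as binary16Ix8 does per lane), then
      copy that all-0s/all-1s value into the upper half of the lane
      via shift-left then arithmetic-shift-right by 16 (ShlN32x4 and
      SarN32x4 in the IR). */
   static uint32_t shadow_MullEven16 ( uint16_t va, uint16_t vb ) {
      uint32_t lane16  = ((va | vb) == 0) ? 0x0000u : 0xFFFFu;
      int32_t  smeared = (int32_t)(lane16 << 16) >> 16;
      return (uint32_t)smeared;
   }

   int main ( void ) {
      /* Both operand halves fully defined: result lane fully defined. */
      printf("%08x\n", shadow_MullEven16(0x0000, 0x0000));  /* 00000000 */
      /* Any undefined operand bit poisons the whole 32-bit lane. */
      printf("%08x\n", shadow_MullEven16(0x0004, 0x0000));  /* ffffffff */
      return 0;
   }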
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 0b10048..04e52db 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1449,6 +1449,7 @@
    IRAtom* (*pcast)( MCEnv*, IRAtom* );
    switch (narrow_op) {
       case Iop_QNarrow32Sx4: pcast = mkPCast32x4; break;
+      case Iop_QNarrow32Ux4: pcast = mkPCast32x4; break;
       case Iop_QNarrow16Sx8: pcast = mkPCast16x8; break;
       case Iop_QNarrow16Ux8: pcast = mkPCast16x8; break;
       default: VG_(tool_panic)("vectorNarrowV128");
@@ -1658,14 +1659,45 @@
          complainIfUndefined(mce, atom2);
          return assignNew(mce, Ity_V128, binop(op, vatom1, atom2));
 
+      case Iop_Shl8x16:
+      case Iop_Shr8x16:
+      case Iop_Sar8x16:
+      case Iop_Rotl8x16:
+         return mkUifUV128(mce,
+                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   mkPCast8x16(mce,vatom2)
+                );
+
+      case Iop_Shl16x8:
+      case Iop_Shr16x8:
+      case Iop_Sar16x8:
+      case Iop_Rotl16x8:
+         return mkUifUV128(mce,
+                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   mkPCast16x8(mce,vatom2)
+                );
+
+      case Iop_Shl32x4:
+      case Iop_Shr32x4:
+      case Iop_Sar32x4:
+      case Iop_Rotl32x4:
+         return mkUifUV128(mce,
+                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   mkPCast32x4(mce,vatom2)
+                );
+
       case Iop_QSub8Ux16:
       case Iop_QSub8Sx16:
       case Iop_Sub8x16:
       case Iop_Min8Ux16:
+      case Iop_Min8Sx16:
       case Iop_Max8Ux16:
+      case Iop_Max8Sx16:
       case Iop_CmpGT8Sx16:
+      case Iop_CmpGT8Ux16:
       case Iop_CmpEQ8x16:
       case Iop_Avg8Ux16:
+      case Iop_Avg8Sx16:
       case Iop_QAdd8Ux16:
       case Iop_QAdd8Sx16:
       case Iop_Add8x16:
@@ -1678,10 +1710,14 @@
       case Iop_MulHi16Sx8:
       case Iop_MulHi16Ux8:
       case Iop_Min16Sx8:
+      case Iop_Min16Ux8:
       case Iop_Max16Sx8:
+      case Iop_Max16Ux8:
       case Iop_CmpGT16Sx8:
+      case Iop_CmpGT16Ux8:
       case Iop_CmpEQ16x8:
       case Iop_Avg16Ux8:
+      case Iop_Avg16Sx8:
       case Iop_QAdd16Ux8:
       case Iop_QAdd16Sx8:
       case Iop_Add16x8:
@@ -1689,8 +1725,19 @@
 
       case Iop_Sub32x4:
       case Iop_CmpGT32Sx4:
+      case Iop_CmpGT32Ux4:
       case Iop_CmpEQ32x4:
+      case Iop_QAdd32Sx4:
+      case Iop_QAdd32Ux4:
+      case Iop_QSub32Sx4:
+      case Iop_QSub32Ux4:
+      case Iop_Avg32Ux4:
+      case Iop_Avg32Sx4:
       case Iop_Add32x4:
+      case Iop_Max32Ux4:
+      case Iop_Max32Sx4:
+      case Iop_Min32Ux4:
+      case Iop_Min32Sx4:
          return binary32Ix4(mce, vatom1, vatom2);
 
       case Iop_Sub64x2:
@@ -1698,6 +1745,7 @@
          return binary64Ix2(mce, vatom1, vatom2);
 
       case Iop_QNarrow32Sx4:
+      case Iop_QNarrow32Ux4:
       case Iop_QNarrow16Sx8:
       case Iop_QNarrow16Ux8:
          return vectorNarrowV128(mce, op, vatom1, vatom2);
@@ -1774,6 +1822,50 @@
                    mkPCast8x16(mce, vatom2)
                 );
 
+      /* These two take the lower 16-bit half of each 32-bit lane,
+         sign/zero extend it to 32, and multiply the halves together,
+         producing a 32x4 result (and implicitly ignoring half the
+         operand bits).  So treat it as a bunch of independent 16x8
+         operations, but then do 32-bit shifts left-right to copy the
+         lower half results (all 0s or all 1s due to PCasting in
+         binary16Ix8) into the upper half of each result lane. */
+      case Iop_MullEven16Ux8:
+      case Iop_MullEven16Sx8: {
+         IRAtom* at;
+         at = binary16Ix8(mce,vatom1,vatom2);
+         at = assignNew(mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
+         at = assignNew(mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
+         return at;
+      }
+
+      /* Same deal as Iop_MullEven16{S,U}x8 */
+      case Iop_MullEven8Ux16:
+      case Iop_MullEven8Sx16: {
+         IRAtom* at;
+         at = binary8Ix16(mce,vatom1,vatom2);
+         at = assignNew(mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
+         at = assignNew(mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
+         return at;
+      }
+
+      /* Narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
+         (32x4 -> 16x8) or 2 x (16x8 -> 8x16) laneage, discarding the
+         upper half of each source lane.  Simply apply the same op to
+         the V bits, since this is really no more than a data-steering
+         operation. */
+      case Iop_Narrow32Ux4:
+      case Iop_Narrow16Ux8:
+         return assignNew(mce, Ity_V128, binop(op, vatom1, vatom2));
+
+      case Iop_ShrV128:
+      case Iop_ShlV128:
+         /* Same scheme as for all other shifts.  Note (10 Nov 05):
+            this is now wrong; scalar shifts are handled properly
+            (lazily), and vector shifts should be fixed likewise. */
+         complainIfUndefined(mce, atom2);
+         return assignNew(mce, Ity_V128, binop(op, vatom1, atom2));
+
+
       /* I128-bit data-steering */
       case Iop_64HLto128:
          return assignNew(mce, Ity_I128, binop(op, vatom1, vatom2));