Change and simplify the way that Memcheck instruments saturating
narrowing operations.  The previous scheme was simply wrong: it could
cause false negatives by allowing some narrowing operations to produce
a defined output even when the inputs are undefined.  This is what bug
#279698 reported, and this patch fixes it.
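
To illustrate the problem, here is a minimal stand-alone sketch (not
taken from the bug report; the helper names are made up) modelling a
single 32-bit lane whose shadow value ("V bits") is all 1s, i.e. fully
undefined.  The old instrumentation applied the saturating narrowing
op to the pessimised shadow; the new one applies the equivalent
non-saturating op instead:

   /* Illustrative model only; these are not Memcheck functions. */
   #include <stdint.h>
   #include <stdio.h>

   /* Saturating narrow, signed 32 -> unsigned 16, as done per lane
      by Iop_QNarrowBin32Sto16Ux8. */
   static uint16_t qnarrow_32S_to_16U ( int32_t x )
   {
      if (x < 0)      return 0;
      if (x > 0xFFFF) return 0xFFFF;
      return (uint16_t)x;
   }

   /* Non-saturating ("vanilla") narrow: just dump the top 16 bits,
      as done per lane by Iop_NarrowBin32to16x8. */
   static uint16_t narrow_32_to_16 ( uint32_t x )
   {
      return (uint16_t)x;
   }

   int main ( void )
   {
      /* Pessimised shadow of an undefined lane: all bits 1. */
      uint32_t vbits = 0xFFFFFFFFu;

      /* Old scheme: all 1s read as the signed value -1, which
         saturates to 0, so the result shadow is all 0s and claims the
         lane is fully defined -- a false negative. */
      printf("old scheme: 0x%04x\n", qnarrow_32S_to_16U(-1));

      /* New scheme: the vanilla narrow keeps the low 16 bits, so the
         result shadow stays all 1s, i.e. still undefined. */
      printf("new scheme: 0x%04x\n", narrow_32_to_16(vbits));
      return 0;
   }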


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@12190 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 04b23f2..96996d3 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1957,37 +1957,92 @@
 
 /* --- --- Vector saturated narrowing --- --- */
 
-/* This is quite subtle.  What to do is simple:
+/* We used to do something very clever here, but on closer inspection
+   (2011-Jun-15), and in particular bug #279698, it turns out to be
+   wrong.  Part of the problem came from the fact that for a long
+   time, the IR primops to do with saturated narrowing were
+   underspecified and managed to confuse multiple cases which needed
+   to be separate: the op names had a signedness qualifier, but in
+   fact the source and destination signednesses needed to be specified
+   independently, so the op names really need two independent
+   signedness specifiers.
 
-   Let the original narrowing op be QNarrowW{S,U}xN.  Produce:
+   As of 2011-Jun-15 (ish) the underspecification was sorted out
+   properly.  The incorrect instrumentation remained, though.  That
+   has now (2011-Oct-22) been fixed.
 
-      the-narrowing-op( PCastWxN(vatom1), PCastWxN(vatom2))
+   What we now do is simple:
 
-   Why this is right is not so simple.  Consider a lane in the args,
-   vatom1 or 2, doesn't matter.
+   Let the original narrowing op be QNarrowBinXtoYxZ, where Z is the
+   number of lanes, X is the source lane width and signedness, and Y
+   is the destination lane width and signedness.  In all cases the
+   destination lane width is half the source lane width, so the names
+   have a bit of redundancy, but are at least easy to read.
 
-   After the PCast, that lane is all 0s (defined) or all
-   1s(undefined).
+   For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
+   to unsigned 16s.
 
-   Both signed and unsigned saturating narrowing of all 0s produces
-   all 0s, which is what we want.
+   Let Vanilla(OP) be a function that takes OP, one of these
+   saturating narrowing ops, and produces the same-shaped narrowing
+   op which is not saturating, but merely dumps the most significant
+   bits.  "Same shape" means that the lane count and lane widths are
+   the same as with OP.
 
-   The all-1s case is more complex.  Unsigned narrowing interprets an
-   all-1s input as the largest unsigned integer, and so produces all
-   1s as a result since that is the largest unsigned value at the
-   smaller width.
+   For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
+                  = Iop_NarrowBin32to16x8,
+   that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
+   dumping the top half of each lane.
 
-   Signed narrowing interprets all 1s as -1.  Fortunately, -1 narrows
-   to -1, so we still wind up with all 1s at the smaller width.
+   So, with that in place, the scheme is simple: pessimise each lane
+   of the args individually, then apply Vanilla(OP) so as to get the
+   result in the right "shape".  If the original OP is
+   QNarrowBinXtoYxZ then we produce
 
-   So: In short, pessimise the args, then apply the original narrowing
-   op.
+   Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
 
-   FIXME JRS 2011-Jun-15: figure out if this is still correct
-   following today's rationalisation/cleanup of vector narrowing
-   primops.
+   or for the case when OP is unary (Iop_QNarrowUn*)
+
+   Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
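+
+   For example, if OP is Iop_QNarrowBin32Sto16Ux8 (so each 128-bit
+   arg holds 4 lanes of 32 bits), then, roughly speaking, we produce
+
+      Iop_NarrowBin32to16x8( mkPCast32x4(mce,vatom1),
+                             mkPCast32x4(mce,vatom2) )
+
+   so that a lane which is anything other than fully defined in either
+   arg gives an all-1s (undefined) lane in the result.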
 */
 static
+IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
+{
+   switch (qnarrowOp) {
+      /* Binary: (128, 128) -> 128 */
+      case Iop_QNarrowBin16Sto8Ux16:
+      case Iop_QNarrowBin16Sto8Sx16:
+      case Iop_QNarrowBin16Uto8Ux16:
+         return Iop_NarrowBin16to8x16;
+      case Iop_QNarrowBin32Sto16Ux8:
+      case Iop_QNarrowBin32Sto16Sx8:
+      case Iop_QNarrowBin32Uto16Ux8:
+         return Iop_NarrowBin32to16x8;
+      /* Binary: (64, 64) -> 64 */
+      case Iop_QNarrowBin32Sto16Sx4:
+         return Iop_NarrowBin32to16x4;
+      case Iop_QNarrowBin16Sto8Ux8:
+      case Iop_QNarrowBin16Sto8Sx8:
+         return Iop_NarrowBin16to8x8;
+      /* Unary: 128 -> 64 */
+      case Iop_QNarrowUn64Uto32Ux2:
+      case Iop_QNarrowUn64Sto32Sx2:
+      case Iop_QNarrowUn64Sto32Ux2:
+         return Iop_NarrowUn64to32x2;
+      case Iop_QNarrowUn32Uto16Ux4:
+      case Iop_QNarrowUn32Sto16Sx4:
+      case Iop_QNarrowUn32Sto16Ux4:
+         return Iop_NarrowUn32to16x4;
+      case Iop_QNarrowUn16Uto8Ux8:
+      case Iop_QNarrowUn16Sto8Sx8:
+      case Iop_QNarrowUn16Sto8Ux8:
+         return Iop_NarrowUn16to8x8;
+      default:
+         ppIROp(qnarrowOp);
+         VG_(tool_panic)("vanillaNarrowingOpOfShape");
+   }
+}
+
+static
 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op, 
                               IRAtom* vatom1, IRAtom* vatom2)
 {
@@ -2002,11 +2057,12 @@
       case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
       default: VG_(tool_panic)("vectorNarrowBinV128");
    }
+   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
    tl_assert(isShadowAtom(mce,vatom1));
    tl_assert(isShadowAtom(mce,vatom2));
    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
    at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
-   at3 = assignNew('V', mce, Ity_V128, binop(narrow_op, at1, at2));
+   at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
    return at3;
 }
 
@@ -2022,26 +2078,36 @@
       case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
       default: VG_(tool_panic)("vectorNarrowBin64");
    }
+   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
    tl_assert(isShadowAtom(mce,vatom1));
    tl_assert(isShadowAtom(mce,vatom2));
    at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
    at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
-   at3 = assignNew('V', mce, Ity_I64, binop(narrow_op, at1, at2));
+   at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
    return at3;
 }
 
 static
-IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp shorten_op,
+IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
                              IRAtom* vatom1)
 {
    IRAtom *at1, *at2;
    IRAtom* (*pcast)( MCEnv*, IRAtom* );
-   switch (shorten_op) {
-      /* FIXME: first 3 are too pessimistic; we can just
-         apply them directly to the V bits. */
-      case Iop_NarrowUn16to8x8:     pcast = mkPCast16x8; break;
-      case Iop_NarrowUn32to16x4:    pcast = mkPCast32x4; break;
-      case Iop_NarrowUn64to32x2:    pcast = mkPCast64x2; break;
+   tl_assert(isShadowAtom(mce,vatom1));
+   /* For vanilla narrowing (non-saturating), we can just apply
+      the op directly to the V bits. */
+   switch (narrow_op) {
+      case Iop_NarrowUn16to8x8:
+      case Iop_NarrowUn32to16x4:
+      case Iop_NarrowUn64to32x2:
+         at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
+         return at1;
+      default:
+         break; /* Do Plan B */
+   }
+   /* Plan B: for ops that involve a saturation operation on the args,
+      we must PCast before the vanilla narrow. */
+   switch (narrow_op) {
       case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
       case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
       case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
@@ -2053,9 +2119,9 @@
       case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
       default: VG_(tool_panic)("vectorNarrowUnV128");
    }
-   tl_assert(isShadowAtom(mce,vatom1));
+   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
-   at2 = assignNew('V', mce, Ity_I64, unop(shorten_op, at1));
+   at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
    return at2;
 }