Merge branches/OTRACK_BY_INSTRUMENTATION into the trunk.  This adds
support to Memcheck for tracking the origins of uninitialised values,
enabled by the --track-origins=yes flag.
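
For example (an illustrative test case, not part of this commit), for
a program that branches on an uninitialised variable:

   int main ( void )
   {
      int x;                  /* never initialised */
      if (x == 42) return 1;  /* conditional jump depends on x */
      return 0;
   }

running

   valgrind --track-origins=yes ./prog

should report not only the undefined-value use but also where the
uninitialised value was created (here, the stack allocation of 'x').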

This currently causes some Memcheck regression tests to fail, because
Memcheck now prints an extra line of advisory text in their output.
This will be fixed.

The core-tool interface is slightly changed, so the version number
for the interface needs to be incremented.
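
As a rough sketch of the new machinery below (all names taken from
the diff): instrumentation statements are now emitted via a common
stmt() helper tagged with a category character ('C' for copied
original statements, 'V' for V-bit definedness shadows, 'B' for B-bit
origin shadows), which also lets the new MCEnv.trace flag print each
statement as it is added.  The value-check failure helpers are split
into _no_o and _w_o ("with origin") variants; the latter take an
origin tag computed by schemeE() and widened to the host word size,
with a signature along the lines of

   /* assumed from the call sites in this diff, not verified
      against mc_include.h */
   void MC_(helperc_value_check4_fail_w_o) ( UWord origin );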



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@7982 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 81644cc..67dab52 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -35,12 +35,13 @@
 #include "pub_tool_libcprint.h"
 #include "pub_tool_tooliface.h"
 #include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
-#include "mc_include.h"
-
 #include "pub_tool_xarray.h"
 #include "pub_tool_mallocfree.h"
 #include "pub_tool_libcbase.h"
 
+#include "mc_include.h"
+
+
 /* This file implements the Memcheck instrumentation, and in
    particular contains the core of its undefined value detection
    machinery.  For a comprehensive background of the terminology,
@@ -116,7 +117,7 @@
 
 struct _MCEnv;
 
-static IRType  shadowType ( IRType ty );
+static IRType  shadowTypeV ( IRType ty );
 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
 
 
@@ -130,6 +131,7 @@
       /* MODIFIED: the superblock being constructed.  IRStmts are
          added. */
       IRSB* bb;
+      Bool  trace;
 
       /* MODIFIED: a table [0 .. #temps_in_original_bb-1] which maps
           original temps to their current shadow temp.
@@ -139,7 +141,8 @@
          point original tmps are shadowed by integer tmps of the same
          size, and Bit-typed original tmps are shadowed by the type
          Ity_I8.  See comment below. */
-      IRTemp* tmpMap;
+      IRTemp* tmpMapV;        /* V-bit tmp shadows */
+      IRTemp* tmpMapB;        /* origin tracking tmp shadows */
       Int     n_originalTmps; /* for range checking */
 
       /* MODIFIED: indicates whether "bogus" literals have so far been
@@ -183,15 +186,15 @@
 
 /* Find the tmp currently shadowing the given original tmp.  If none
    so far exists, allocate one.  */
-static IRTemp findShadowTmp ( MCEnv* mce, IRTemp orig )
+static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
 {
    tl_assert(orig < mce->n_originalTmps);
-   if (mce->tmpMap[orig] == IRTemp_INVALID) {
-      mce->tmpMap[orig] 
+   if (mce->tmpMapV[orig] == IRTemp_INVALID) {
+      mce->tmpMapV[orig] 
          = newIRTemp(mce->bb->tyenv, 
-                     shadowType(mce->bb->tyenv->types[orig]));
+                     shadowTypeV(mce->bb->tyenv->types[orig]));
    }
-   return mce->tmpMap[orig];
+   return mce->tmpMapV[orig];
 }
 
 /* Allocate a new shadow for the given original tmp.  This means any
@@ -200,12 +203,12 @@
    for undefinedness, but unfortunately IR's SSA property disallows
    this.  Instead we must abandon the old shadow, allocate a new one
    and use that instead. */
-static void newShadowTmp ( MCEnv* mce, IRTemp orig )
+static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
 {
    tl_assert(orig < mce->n_originalTmps);
-   mce->tmpMap[orig] 
+   mce->tmpMapV[orig] 
       = newIRTemp(mce->bb->tyenv, 
-                  shadowType(mce->bb->tyenv->types[orig]));
+                  shadowTypeV(mce->bb->tyenv->types[orig]));
 }
 
 
@@ -265,7 +268,7 @@
    given type.  The only valid shadow types are Bit, I8, I16, I32,
    I64, V128. */
 
-static IRType shadowType ( IRType ty )
+static IRType shadowTypeV ( IRType ty )
 {
    switch (ty) {
       case Ity_I1:
@@ -278,7 +281,7 @@
       case Ity_F64:  return Ity_I64;
       case Ity_V128: return Ity_V128;
       default: ppIRType(ty); 
-               VG_(tool_panic)("memcheck:shadowType");
+               VG_(tool_panic)("memcheck:shadowTypeV");
    }
 }
 
@@ -301,13 +304,21 @@
 /*--- Constructing IR fragments                            ---*/
 /*------------------------------------------------------------*/
 
-/* assign value to tmp */
-#define assign(_bb,_tmp,_expr)   \
-   addStmtToIRSB((_bb), IRStmt_WrTmp((_tmp),(_expr)))
-
 /* add stmt to a bb */
-#define stmt(_bb,_stmt)    \
-   addStmtToIRSB((_bb), (_stmt))
+static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
+   if (mce->trace) {
+      VG_(printf)("  %c: ", cat);
+      ppIRStmt(st);
+      VG_(printf)("\n");
+   }
+   addStmtToIRSB(mce->bb, st);
+}
+
+/* assign value to tmp */
+static inline 
+void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
+  stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
+}
 
 /* build various kinds of expressions */
 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
@@ -319,12 +330,20 @@
 #define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
 #define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
 
-/* bind the given expression to a new temporary, and return the
+/* Bind the given expression to a new temporary, and return the
    temporary.  This effectively converts an arbitrary expression into
-   an atom. */
-static IRAtom* assignNew ( MCEnv* mce, IRType ty, IRExpr* e ) {
-   IRTemp t = newIRTemp(mce->bb->tyenv, ty);
-   assign(mce->bb, t, e);
+   an atom.
+
+   'ty' is the type of 'e' and hence the type that the new temporary
+   needs to be.  But passing it in is redundant, since we can deduce
+   the type merely by inspecting 'e'.  So at least use that fact to
+   assert that the two types agree. */
+static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e ) {
+   IRTemp t;
+   IRType tyE = typeOfIRExpr(mce->bb->tyenv, e);
+   tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
+   t = newIRTemp(mce->bb->tyenv, ty);
+   assign(cat, mce, t, e);
    return mkexpr(t);
 }
 
@@ -338,31 +357,31 @@
 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I8, binop(Iop_And8, a1, a2));
+   return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
 }
 
 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I16, binop(Iop_And16, a1, a2));
+   return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
 }
 
 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I32, binop(Iop_And32, a1, a2));
+   return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
 }
 
 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I64, binop(Iop_And64, a1, a2));
+   return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
 }
 
 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_V128, binop(Iop_AndV128, a1, a2));
+   return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
 }
 
 /* --------- Undefined-if-either-undefined --------- */
@@ -370,31 +389,31 @@
 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I8, binop(Iop_Or8, a1, a2));
+   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
 }
 
 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I16, binop(Iop_Or16, a1, a2));
+   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
 }
 
 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I32, binop(Iop_Or32, a1, a2));
+   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
 }
 
 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I64, binop(Iop_Or64, a1, a2));
+   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
 }
 
 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_V128, binop(Iop_OrV128, a1, a2));
+   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
 }
 
 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
@@ -414,22 +433,22 @@
 
 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
    tl_assert(isShadowAtom(mce,a1));
-   return assignNew(mce, Ity_I8, unop(Iop_Left8, a1));
+   return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
 }
 
 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
    tl_assert(isShadowAtom(mce,a1));
-   return assignNew(mce, Ity_I16, unop(Iop_Left16, a1));
+   return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
 }
 
 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
    tl_assert(isShadowAtom(mce,a1));
-   return assignNew(mce, Ity_I32, unop(Iop_Left32, a1));
+   return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
 }
 
 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
    tl_assert(isShadowAtom(mce,a1));
-   return assignNew(mce, Ity_I64, unop(Iop_Left64, a1));
+   return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
 }
 
 /* --------- 'Improvement' functions for AND/OR. --------- */
@@ -442,7 +461,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_I8, binop(Iop_Or8, data, vbits));
+   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
 }
 
 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
@@ -450,7 +469,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_I16, binop(Iop_Or16, data, vbits));
+   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
 }
 
 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
@@ -458,7 +477,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_I32, binop(Iop_Or32, data, vbits));
+   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
 }
 
 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
@@ -466,7 +485,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_I64, binop(Iop_Or64, data, vbits));
+   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
 }
 
 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
@@ -474,7 +493,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_V128, binop(Iop_OrV128, data, vbits));
+   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
 }
 
 /* ImproveOR(data, vbits) = ~data OR vbits.  Defined (0) data 1s give
@@ -486,9 +505,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_I8, 
+             'V', mce, Ity_I8, 
              binop(Iop_Or8, 
-                   assignNew(mce, Ity_I8, unop(Iop_Not8, data)), 
+                   assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)), 
                    vbits) );
 }
 
@@ -498,9 +517,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_I16, 
+             'V', mce, Ity_I16, 
              binop(Iop_Or16, 
-                   assignNew(mce, Ity_I16, unop(Iop_Not16, data)), 
+                   assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)), 
                    vbits) );
 }
 
@@ -510,9 +529,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_I32, 
+             'V', mce, Ity_I32, 
              binop(Iop_Or32, 
-                   assignNew(mce, Ity_I32, unop(Iop_Not32, data)), 
+                   assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)), 
                    vbits) );
 }
 
@@ -522,9 +541,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_I64, 
+             'V', mce, Ity_I64, 
              binop(Iop_Or64, 
-                   assignNew(mce, Ity_I64, unop(Iop_Not64, data)), 
+                   assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)), 
                    vbits) );
 }
 
@@ -534,9 +553,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_V128, 
+             'V', mce, Ity_V128, 
              binop(Iop_OrV128, 
-                   assignNew(mce, Ity_V128, unop(Iop_NotV128, data)), 
+                   assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)), 
                    vbits) );
 }
 
@@ -553,14 +572,14 @@
 
    /* Fast-track some common cases */
    if (src_ty == Ity_I32 && dst_ty == Ity_I32)
-      return assignNew(mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
+      return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
 
    if (src_ty == Ity_I64 && dst_ty == Ity_I64)
-      return assignNew(mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
+      return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
 
    if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
-      IRAtom* tmp = assignNew(mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
-      return assignNew(mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
+      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
+      return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    }
 
    /* Else do it the slow way .. */
@@ -570,24 +589,24 @@
          tmp1 = vbits;
          break;
       case Ity_I8: 
-         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
+         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
          break;
       case Ity_I16: 
-         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
+         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
          break;
       case Ity_I32: 
-         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
+         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
          break;
       case Ity_I64: 
-         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
+         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
          break;
       case Ity_I128: {
          /* Gah.  Chop it in half, OR the halves together, and compare
             that with zero. */
-         IRAtom* tmp2 = assignNew(mce, Ity_I64, unop(Iop_128HIto64, vbits));
-         IRAtom* tmp3 = assignNew(mce, Ity_I64, unop(Iop_128to64, vbits));
-         IRAtom* tmp4 = assignNew(mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
-         tmp1         = assignNew(mce, Ity_I1, 
+         IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
+         IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
+         IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
+         tmp1         = assignNew('V', mce, Ity_I1, 
                                        unop(Iop_CmpNEZ64, tmp4));
          break;
       }
@@ -601,20 +620,20 @@
       case Ity_I1:
          return tmp1;
       case Ity_I8: 
-         return assignNew(mce, Ity_I8, unop(Iop_1Sto8, tmp1));
+         return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
       case Ity_I16: 
-         return assignNew(mce, Ity_I16, unop(Iop_1Sto16, tmp1));
+         return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
       case Ity_I32: 
-         return assignNew(mce, Ity_I32, unop(Iop_1Sto32, tmp1));
+         return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
       case Ity_I64: 
-         return assignNew(mce, Ity_I64, unop(Iop_1Sto64, tmp1));
+         return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
       case Ity_V128:
-         tmp1 = assignNew(mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
-         tmp1 = assignNew(mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
+         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
+         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
          return tmp1;
       case Ity_I128:
-         tmp1 = assignNew(mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
-         tmp1 = assignNew(mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
+         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
+         tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
          return tmp1;
       default: 
          ppIRType(dst_ty);
@@ -705,23 +724,25 @@
    }
 
    naive 
-      = mkPCastTo(mce,ty, assignNew(mce, ty, binop(opUIFU, vxx, vyy)));
+      = mkPCastTo(mce,ty,
+                  assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));
 
    vec 
       = assignNew(
-           mce,ty, 
+           'V', mce,ty, 
            binop( opOR,
-                  assignNew(mce,ty, binop(opOR, vxx, vyy)),
+                  assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
                   assignNew(
-                     mce,ty, 
+                     'V', mce,ty, 
                      unop( opNOT,
-                           assignNew(mce,ty, binop(opXOR, xx, yy))))));
+                           assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
 
    improvement_term
-      = mkPCastTo( mce,ty, assignNew(mce,Ity_I1, binop(opCMP, vec, top)));
+      = mkPCastTo( mce,ty,
+                   assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));
 
    improved
-      = assignNew( mce,ty, binop(opDIFD, naive, improvement_term) );
+      = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );
 
    final_cast
       = mkPCastTo( mce, Ity_I1, improved );
@@ -817,18 +838,18 @@
          binop(
             opOR,
             assignNew(
-               mce,ty,
+               'V', mce,ty,
                binop(
                   opAND,
                   mkPCastTo(mce,ty, xxhash), 
                   threeLeft1
                )),
             assignNew(
-               mce,ty,
+               'V', mce,ty,
                binop(
                   opSHL,
                   assignNew(
-                     mce,ty,
+                     'V', mce,ty,
                      binop(opSHR, xxhash, mkU8(width-1))),
                   mkU8(3)
                ))
@@ -851,6 +872,9 @@
 /*--- Emit a test and complaint if something is undefined. ---*/
 /*------------------------------------------------------------*/
 
+static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
+
+
 /* Set the annotations on a dirty helper to indicate that the stack
    pointer and instruction pointers might be read.  This is the
    behaviour of all 'emit-a-complaint' style functions we might
@@ -884,9 +908,14 @@
    Int      sz;
    IRDirty* di;
    IRAtom*  cond;
+   IRAtom*  origin;
+   void*    fn;
+   HChar*   nm;
+   IRExpr** args;
+   Int      nargs;
 
    // Don't do V bit tests if we're not reporting undefined value errors.
-   if (!MC_(clo_undef_value_errors))
+   if (MC_(clo_mc_level) == 1)
       return;
 
    /* Since the original expression is atomic, there's no duplicated
@@ -906,51 +935,103 @@
    cond = mkPCastTo( mce, Ity_I1, vatom );
    /* cond will be 0 if all defined, and 1 if any not defined. */
 
+   /* Get the origin info for the value we are about to check, if we
+      are doing origin tracking.  If not, set it to NULL, meaning
+      that no origin is available. */
+   if (MC_(clo_mc_level) == 3) {
+      origin = schemeE( mce, atom );
+      if (mce->hWordTy == Ity_I64) {
+         origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
+      }
+   } else {
+      origin = NULL;
+   }
+
+   fn    = NULL;
+   nm    = NULL;
+   args  = NULL;
+   nargs = -1;
+
    switch (sz) {
       case 0:
-         di = unsafeIRDirty_0_N( 
-                 0/*regparms*/, 
-                 "MC_(helperc_value_check0_fail)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_value_check0_fail) ),
-                 mkIRExprVec_0() 
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_check0_fail_w_o);
+            nm    = "MC_(helperc_value_check0_fail_w_o)";
+            args  = mkIRExprVec_1(origin);
+            nargs = 1;
+         } else {
+            fn    = &MC_(helperc_value_check0_fail_no_o);
+            nm    = "MC_(helperc_value_check0_fail_no_o)";
+            args  = mkIRExprVec_0();
+            nargs = 0;
+         }
          break;
       case 1:
-         di = unsafeIRDirty_0_N( 
-                 0/*regparms*/, 
-                 "MC_(helperc_value_check1_fail)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_value_check1_fail) ),
-                 mkIRExprVec_0() 
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_check1_fail_w_o);
+            nm    = "MC_(helperc_value_check1_fail_w_o)";
+            args  = mkIRExprVec_1(origin);
+            nargs = 1;
+         } else {
+            fn    = &MC_(helperc_value_check1_fail_no_o);
+            nm    = "MC_(helperc_value_check1_fail_no_o)";
+            args  = mkIRExprVec_0();
+            nargs = 0;
+         }
          break;
       case 4:
-         di = unsafeIRDirty_0_N( 
-                 0/*regparms*/, 
-                 "MC_(helperc_value_check4_fail)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_value_check4_fail) ),
-                 mkIRExprVec_0() 
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_check4_fail_w_o);
+            nm    = "MC_(helperc_value_check4_fail_w_o)";
+            args  = mkIRExprVec_1(origin);
+            nargs = 1;
+         } else {
+            fn    = &MC_(helperc_value_check4_fail_no_o);
+            nm    = "MC_(helperc_value_check4_fail_no_o)";
+            args  = mkIRExprVec_0();
+            nargs = 0;
+         }
          break;
       case 8:
-         di = unsafeIRDirty_0_N( 
-                 0/*regparms*/, 
-                 "MC_(helperc_value_check8_fail)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_value_check8_fail) ),
-                 mkIRExprVec_0() 
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_check8_fail_w_o);
+            nm    = "MC_(helperc_value_check8_fail_w_o)";
+            args  = mkIRExprVec_1(origin);
+            nargs = 1;
+         } else {
+            fn    = &MC_(helperc_value_check8_fail_no_o);
+            nm    = "MC_(helperc_value_check8_fail_no_o)";
+            args  = mkIRExprVec_0();
+            nargs = 0;
+         }
          break;
       default:
-         di = unsafeIRDirty_0_N( 
-                 1/*regparms*/, 
-                 "MC_(helperc_complain_undef)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_complain_undef) ),
-                 mkIRExprVec_1( mkIRExpr_HWord( sz ))
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_checkN_fail_w_o);
+            nm    = "MC_(helperc_value_checkN_fail_w_o)";
+            args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
+            nargs = 2;
+         } else {
+            fn    = &MC_(helperc_value_checkN_fail_no_o);
+            nm    = "MC_(helperc_value_checkN_fail_no_o)";
+            args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
+            nargs = 1;
+         }
          break;
    }
+
+   tl_assert(fn);
+   tl_assert(nm);
+   tl_assert(args);
+   tl_assert(nargs >= 0 && nargs <= 2);
+   tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
+              || (MC_(clo_mc_level) == 2 && origin == NULL) );
+
+   di = unsafeIRDirty_0_N( nargs/*regparms*/, nm, 
+                           VG_(fnptr_to_fnentry)( fn ), args );
    di->guard = cond;
    setHelperAnns( mce, di );
-   stmt( mce->bb, IRStmt_Dirty(di));
+   stmt( 'V', mce, IRStmt_Dirty(di));
 
    /* Set the shadow tmp to be defined.  First, update the
       orig->shadow tmp mapping to reflect the fact that this shadow is
@@ -959,9 +1040,9 @@
    /* sameKindedAtoms ... */
    if (vatom->tag == Iex_RdTmp) {
       tl_assert(atom->tag == Iex_RdTmp);
-      newShadowTmp(mce, atom->Iex.RdTmp.tmp);
-      assign(mce->bb, findShadowTmp(mce, atom->Iex.RdTmp.tmp), 
-                      definedOfType(ty));
+      newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
+      assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), 
+                       definedOfType(ty));
    }
 }
 
@@ -1015,7 +1096,7 @@
    // Don't do shadow PUTs if we're not doing undefined value checking.
    // Their absence lets Vex's optimiser remove all the shadow computation
    // that they depend on, which includes GETs of the shadow registers.
-   if (!MC_(clo_undef_value_errors))
+   if (MC_(clo_mc_level) == 1)
       return;
    
    if (atom) {
@@ -1035,7 +1116,7 @@
       /* complainIfUndefined(mce, atom); */
    } else {
       /* Do a plain shadow Put. */
-      stmt( mce->bb, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ) );
+      stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ) );
    }
 }
 
@@ -1055,14 +1136,14 @@
    // Don't do shadow PUTIs if we're not doing undefined value checking.
    // Their absence lets Vex's optimiser remove all the shadow computation
    // that they depend on, which includes GETIs of the shadow registers.
-   if (!MC_(clo_undef_value_errors))
+   if (MC_(clo_mc_level) == 1)
       return;
    
    tl_assert(isOriginalAtom(mce,atom));
    vatom = expr2vbits( mce, atom );
    tl_assert(sameKindedAtoms(atom, vatom));
    ty   = descr->elemTy;
-   tyS  = shadowType(ty);
+   tyS  = shadowTypeV(ty);
    arrSize = descr->nElems * sizeofIRType(ty);
    tl_assert(ty != Ity_I1);
    tl_assert(isOriginalAtom(mce,ix));
@@ -1077,7 +1158,7 @@
       IRRegArray* new_descr 
          = mkIRRegArray( descr->base + mce->layout->total_sizeB, 
                          tyS, descr->nElems);
-      stmt( mce->bb, IRStmt_PutI( new_descr, ix, bias, vatom ));
+      stmt( 'V', mce, IRStmt_PutI( new_descr, ix, bias, vatom ));
    }
 }
 
@@ -1088,7 +1169,7 @@
 static 
 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
 {
-   IRType tyS = shadowType(ty);
+   IRType tyS = shadowTypeV(ty);
    tl_assert(ty != Ity_I1);
    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
       /* Always defined, return all zeroes of the relevant type */
@@ -1096,6 +1177,7 @@
    } else {
       /* return a cloned version of the Get that refers to the shadow
          area. */
+      /* FIXME: this isn't an atom! */
       return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
    }
 }
@@ -1109,7 +1191,7 @@
                       IRRegArray* descr, IRAtom* ix, Int bias )
 {
    IRType ty   = descr->elemTy;
-   IRType tyS  = shadowType(ty);
+   IRType tyS  = shadowTypeV(ty);
    Int arrSize = descr->nElems * sizeofIRType(ty);
    tl_assert(ty != Ity_I1);
    tl_assert(isOriginalAtom(mce,ix));
@@ -1403,31 +1485,31 @@
    }
 
    // a_min = aa & ~qaa
-   a_min = assignNew(mce,ty, 
+   a_min = assignNew('V', mce,ty, 
                      binop(opAND, aa,
-                                  assignNew(mce,ty, unop(opNOT, qaa))));
+                                  assignNew('V', mce,ty, unop(opNOT, qaa))));
 
    // b_min = bb & ~qbb
-   b_min = assignNew(mce,ty, 
+   b_min = assignNew('V', mce,ty, 
                      binop(opAND, bb,
-                                  assignNew(mce,ty, unop(opNOT, qbb))));
+                                  assignNew('V', mce,ty, unop(opNOT, qbb))));
 
    // a_max = aa | qaa
-   a_max = assignNew(mce,ty, binop(opOR, aa, qaa));
+   a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
 
    // b_max = bb | qbb
-   b_max = assignNew(mce,ty, binop(opOR, bb, qbb));
+   b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
 
    if (add) {
       // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
       return
-      assignNew(mce,ty,
+      assignNew('V', mce,ty,
          binop( opOR,
-                assignNew(mce,ty, binop(opOR, qaa, qbb)),
-                assignNew(mce,ty, 
+                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
+                assignNew('V', mce,ty, 
                    binop( opXOR, 
-                          assignNew(mce,ty, binop(opADD, a_min, b_min)),
-                          assignNew(mce,ty, binop(opADD, a_max, b_max))
+                          assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
+                          assignNew('V', mce,ty, binop(opADD, a_max, b_max))
                    )
                 )
          )
@@ -1435,13 +1517,13 @@
    } else {
      // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
       return
-      assignNew(mce,ty,
+      assignNew('V', mce,ty,
          binop( opOR,
-                assignNew(mce,ty, binop(opOR, qaa, qbb)),
-                assignNew(mce,ty, 
+                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
+                assignNew('V', mce,ty, 
                    binop( opXOR, 
-                          assignNew(mce,ty, binop(opSUB, a_min, b_max)),
-                          assignNew(mce,ty, binop(opSUB, a_max, b_min))
+                          assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
+                          assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
                    )
                 )
          )
@@ -1488,9 +1570,9 @@
    tl_assert(sameKindedAtoms(qbb,bb));
    return 
       assignNew(
-         mce, ty,
+         'V', mce, ty,
          mkUifU( mce, ty,
-                 assignNew(mce, ty, binop(original_op, qaa, bb)),
+                 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
                  mkPCastTo(mce, ty, qbb)
          )
    );
@@ -1505,37 +1587,37 @@
 
 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
+   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
 }
 
 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
+   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
 }
 
 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
+   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
 }
 
 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
+   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
 }
 
 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
+   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
 }
 
 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
+   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
 }
 
 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
+   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
 }
 
 
@@ -1582,7 +1664,7 @@
    tl_assert(isShadowAtom(mce, vatomX));
    tl_assert(isShadowAtom(mce, vatomY));
    at = mkUifUV128(mce, vatomX, vatomY);
-   at = assignNew(mce, Ity_V128, mkPCast32x4(mce, at));
+   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
    return at;
 }
 
@@ -1591,7 +1673,7 @@
 {
    IRAtom* at;
    tl_assert(isShadowAtom(mce, vatomX));
-   at = assignNew(mce, Ity_V128, mkPCast32x4(mce, vatomX));
+   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
    return at;
 }
 
@@ -1602,9 +1684,9 @@
    tl_assert(isShadowAtom(mce, vatomX));
    tl_assert(isShadowAtom(mce, vatomY));
    at = mkUifUV128(mce, vatomX, vatomY);
-   at = assignNew(mce, Ity_I32, unop(Iop_V128to32, at));
+   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
    at = mkPCastTo(mce, Ity_I32, at);
-   at = assignNew(mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
+   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
    return at;
 }
 
@@ -1613,9 +1695,9 @@
 {
    IRAtom* at;
    tl_assert(isShadowAtom(mce, vatomX));
-   at = assignNew(mce, Ity_I32, unop(Iop_V128to32, vatomX));
+   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
    at = mkPCastTo(mce, Ity_I32, at);
-   at = assignNew(mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
+   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
    return at;
 }
 
@@ -1628,7 +1710,7 @@
    tl_assert(isShadowAtom(mce, vatomX));
    tl_assert(isShadowAtom(mce, vatomY));
    at = mkUifUV128(mce, vatomX, vatomY);
-   at = assignNew(mce, Ity_V128, mkPCast64x2(mce, at));
+   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
    return at;
 }
 
@@ -1637,7 +1719,7 @@
 {
    IRAtom* at;
    tl_assert(isShadowAtom(mce, vatomX));
-   at = assignNew(mce, Ity_V128, mkPCast64x2(mce, vatomX));
+   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
    return at;
 }
 
@@ -1648,9 +1730,9 @@
    tl_assert(isShadowAtom(mce, vatomX));
    tl_assert(isShadowAtom(mce, vatomY));
    at = mkUifUV128(mce, vatomX, vatomY);
-   at = assignNew(mce, Ity_I64, unop(Iop_V128to64, at));
+   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
    at = mkPCastTo(mce, Ity_I64, at);
-   at = assignNew(mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
+   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
    return at;
 }
 
@@ -1659,9 +1741,9 @@
 {
    IRAtom* at;
    tl_assert(isShadowAtom(mce, vatomX));
-   at = assignNew(mce, Ity_I64, unop(Iop_V128to64, vatomX));
+   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
    at = mkPCastTo(mce, Ity_I64, at);
-   at = assignNew(mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
+   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
    return at;
 }
 
@@ -1708,9 +1790,9 @@
    }
    tl_assert(isShadowAtom(mce,vatom1));
    tl_assert(isShadowAtom(mce,vatom2));
-   at1 = assignNew(mce, Ity_V128, pcast(mce, vatom1));
-   at2 = assignNew(mce, Ity_V128, pcast(mce, vatom2));
-   at3 = assignNew(mce, Ity_V128, binop(narrow_op, at1, at2));
+   at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
+   at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
+   at3 = assignNew('V', mce, Ity_V128, binop(narrow_op, at1, at2));
    return at3;
 }
 
@@ -1728,9 +1810,9 @@
    }
    tl_assert(isShadowAtom(mce,vatom1));
    tl_assert(isShadowAtom(mce,vatom2));
-   at1 = assignNew(mce, Ity_I64, pcast(mce, vatom1));
-   at2 = assignNew(mce, Ity_I64, pcast(mce, vatom2));
-   at3 = assignNew(mce, Ity_I64, binop(narrow_op, at1, at2));
+   at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
+   at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
+   at3 = assignNew('V', mce, Ity_I64, binop(narrow_op, at1, at2));
    return at3;
 }
 
@@ -1927,7 +2009,7 @@
       case Iop_ShlN8x8:
          /* Same scheme as with all other shifts. */
          complainIfUndefined(mce, atom2);
-         return assignNew(mce, Ity_I64, binop(op, vatom1, atom2));
+         return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
 
       case Iop_QNarrow32Sx2:
       case Iop_QNarrow16Sx4:
@@ -1979,7 +2061,7 @@
       case Iop_InterleaveHI8x8:
       case Iop_CatOddLanes16x4:
       case Iop_CatEvenLanes16x4:
-         return assignNew(mce, Ity_I64, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
 
       /* Perm8x8: rearrange values in left arg using steering values
         from right arg.  So rearrange the vbits in the same way but
@@ -1987,7 +2069,7 @@
       case Iop_Perm8x8:
          return mkUifU64(
                    mce,
-                   assignNew(mce, Ity_I64, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
                    mkPCast8x8(mce, vatom2)
                 );
 
@@ -2007,7 +2089,7 @@
             this is wrong now, scalar shifts are done properly lazily.
             Vector shifts should be fixed too. */
          complainIfUndefined(mce, atom2);
-         return assignNew(mce, Ity_V128, binop(op, vatom1, atom2));
+         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
 
       /* V x V shifts/rotates are done using the standard lazy scheme. */
       case Iop_Shl8x16:
@@ -2015,7 +2097,7 @@
       case Iop_Sar8x16:
       case Iop_Rol8x16:
          return mkUifUV128(mce,
-                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast8x16(mce,vatom2)
                 );
 
@@ -2024,7 +2106,7 @@
       case Iop_Sar16x8:
       case Iop_Rol16x8:
          return mkUifUV128(mce,
-                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast16x8(mce,vatom2)
                 );
 
@@ -2033,7 +2115,7 @@
       case Iop_Sar32x4:
       case Iop_Rol32x4:
          return mkUifUV128(mce,
-                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast32x4(mce,vatom2)
                 );
 
@@ -2163,7 +2245,7 @@
       case Iop_InterleaveHI32x4:
       case Iop_InterleaveHI16x8:
       case Iop_InterleaveHI8x16:
-         return assignNew(mce, Ity_V128, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
  
      /* Perm8x16: rearrange values in left arg using steering values
         from right arg.  So rearrange the vbits in the same way but
@@ -2171,7 +2253,7 @@
       case Iop_Perm8x16:
          return mkUifUV128(
                    mce,
-                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast8x16(mce, vatom2)
                 );
 
@@ -2186,8 +2268,8 @@
       case Iop_MullEven16Sx8: {
          IRAtom* at;
          at = binary16Ix8(mce,vatom1,vatom2);
-         at = assignNew(mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
-         at = assignNew(mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
+         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
+         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
 	 return at;
       }
 
@@ -2196,8 +2278,8 @@
       case Iop_MullEven8Sx16: {
          IRAtom* at;
          at = binary8Ix16(mce,vatom1,vatom2);
-         at = assignNew(mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
-         at = assignNew(mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
+         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
+         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
 	 return at;
       }
 
@@ -2207,8 +2289,8 @@
          than a data steering operation. */
       case Iop_Narrow32x4: 
       case Iop_Narrow16x8: 
-         return assignNew(mce, Ity_V128, 
-                               binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_V128, 
+                                    binop(op, vatom1, vatom2));
 
       case Iop_ShrV128:
       case Iop_ShlV128:
@@ -2216,12 +2298,12 @@
             this is wrong now, scalar shifts are done properly lazily.
             Vector shifts should be fixed too. */
          complainIfUndefined(mce, atom2);
-         return assignNew(mce, Ity_V128, binop(op, vatom1, atom2));
+         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
 
 
       /* I128-bit data-steering */
       case Iop_64HLto128:
-         return assignNew(mce, Ity_I128, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
 
       /* Scalar floating point */
 
@@ -2260,36 +2342,36 @@
          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
 
       case Iop_16HLto32:
-         return assignNew(mce, Ity_I32, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
       case Iop_32HLto64:
-         return assignNew(mce, Ity_I64, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
 
       case Iop_MullS64:
       case Iop_MullU64: {
          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
-         return assignNew(mce, Ity_I128, binop(Iop_64HLto128, vHi64, vLo64));
+         return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, vHi64, vLo64));
       }
 
       case Iop_MullS32:
       case Iop_MullU32: {
          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
-         return assignNew(mce, Ity_I64, binop(Iop_32HLto64, vHi32, vLo32));
+         return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, vHi32, vLo32));
       }
 
       case Iop_MullS16:
       case Iop_MullU16: {
          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
-         return assignNew(mce, Ity_I32, binop(Iop_16HLto32, vHi16, vLo16));
+         return assignNew('V', mce, Ity_I32, binop(Iop_16HLto32, vHi16, vLo16));
       }
 
       case Iop_MullS8:
       case Iop_MullU8: {
          IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
          IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
-         return assignNew(mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
+         return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
       }
 
       case Iop_DivS32:
@@ -2424,7 +2506,7 @@
       do_And_Or:
          return
          assignNew(
-            mce, 
+            'V', mce, 
             and_or_ty,
             difd(mce, uifu(mce, vatom1, vatom2),
                       difd(mce, improve(mce, atom1, vatom1),
@@ -2484,7 +2566,7 @@
       case Iop_Dup8x16:
       case Iop_Dup16x8:
       case Iop_Dup32x4:
-         return assignNew(mce, Ity_V128, unop(op, vatom));
+         return assignNew('V', mce, Ity_V128, unop(op, vatom));
 
       case Iop_F32toF64: 
       case Iop_I32toF64:
@@ -2511,7 +2593,7 @@
       case Iop_V128HIto64:
       case Iop_128HIto64:
       case Iop_128to64:
-         return assignNew(mce, Ity_I64, unop(op, vatom));
+         return assignNew('V', mce, Ity_I64, unop(op, vatom));
 
       case Iop_64to32:
       case Iop_64HIto32:
@@ -2522,27 +2604,27 @@
       case Iop_16Sto32:
       case Iop_8Sto32:
       case Iop_V128to32:
-         return assignNew(mce, Ity_I32, unop(op, vatom));
+         return assignNew('V', mce, Ity_I32, unop(op, vatom));
 
       case Iop_8Sto16:
       case Iop_8Uto16:
       case Iop_32to16:
       case Iop_32HIto16:
       case Iop_64to16:
-         return assignNew(mce, Ity_I16, unop(op, vatom));
+         return assignNew('V', mce, Ity_I16, unop(op, vatom));
 
       case Iop_1Uto8:
       case Iop_16to8:
       case Iop_16HIto8:
       case Iop_32to8:
       case Iop_64to8:
-         return assignNew(mce, Ity_I8, unop(op, vatom));
+         return assignNew('V', mce, Ity_I8, unop(op, vatom));
 
       case Iop_32to1:
-         return assignNew(mce, Ity_I1, unop(Iop_32to1, vatom));
+         return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
 
       case Iop_64to1:
-         return assignNew(mce, Ity_I1, unop(Iop_64to1, vatom));
+         return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
 
       case Iop_ReinterpF64asI64:
       case Iop_ReinterpI64asF64:
@@ -2583,7 +2665,7 @@
 
    /* Now cook up a call to the relevant helper function, to read the
       data V bits from shadow memory. */
-   ty = shadowType(ty);
+   ty = shadowTypeV(ty);
 
    if (end == Iend_LE) {   
       switch (ty) {
@@ -2631,7 +2713,7 @@
       tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
       mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
       eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
-      addrAct = assignNew(mce, tyAddr, binop(mkAdd, addr, eBias) );
+      addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
    }
 
    /* We need to have a place to park the V bits we're just about to
@@ -2642,7 +2724,7 @@
                            hname, VG_(fnptr_to_fnentry)( helper ), 
                            mkIRExprVec_1( addrAct ));
    setHelperAnns( mce, di );
-   stmt( mce->bb, IRStmt_Dirty(di) );
+   stmt( 'V', mce, IRStmt_Dirty(di) );
 
    return mkexpr(datavbits);
 }
@@ -2655,7 +2737,7 @@
 {
    IRAtom *v64hi, *v64lo;
    tl_assert(end == Iend_LE || end == Iend_BE);
-   switch (shadowType(ty)) {
+   switch (shadowTypeV(ty)) {
       case Ity_I8: 
       case Ity_I16: 
       case Ity_I32: 
@@ -2669,7 +2751,7 @@
             v64hi = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias);
             v64lo = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+8);
          }
-         return assignNew( mce, 
+         return assignNew( 'V', mce, 
                            Ity_V128, 
                            binop(Iop_64HLtoV128, v64hi, v64lo));
       default:
@@ -2699,7 +2781,8 @@
    ty = typeOfIRExpr(mce->bb->tyenv, vbits0);
 
    return
-      mkUifU(mce, ty, assignNew(mce, ty, IRExpr_Mux0X(cond, vbits0, vbitsX)),
+      mkUifU(mce, ty, assignNew('V', mce, ty, 
+                                     IRExpr_Mux0X(cond, vbits0, vbitsX)),
                       mkPCastTo(mce, ty, vbitsC) );
 }      
 
@@ -2718,10 +2801,10 @@
                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
 
       case Iex_RdTmp:
-         return IRExpr_RdTmp( findShadowTmp(mce, e->Iex.RdTmp.tmp) );
+         return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
 
       case Iex_Const:
-         return definedOfType(shadowType(typeOfIRExpr(mce->bb->tyenv, e)));
+         return definedOfType(shadowTypeV(typeOfIRExpr(mce->bb->tyenv, e)));
 
       case Iex_Qop:
          return expr2vbits_Qop(
@@ -2789,20 +2872,28 @@
 
    if (tyH == Ity_I32) {
       switch (ty) {
-         case Ity_I32: return vatom;
-         case Ity_I16: return assignNew(mce, tyH, unop(Iop_16Uto32, vatom));
-         case Ity_I8:  return assignNew(mce, tyH, unop(Iop_8Uto32, vatom));
-         default:      goto unhandled;
+         case Ity_I32:
+            return vatom;
+         case Ity_I16:
+            return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
+         case Ity_I8:
+            return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
+         default:
+            goto unhandled;
       }
    } else
    if (tyH == Ity_I64) {
       switch (ty) {
-         case Ity_I32: return assignNew(mce, tyH, unop(Iop_32Uto64, vatom));
-         case Ity_I16: return assignNew(mce, tyH, unop(Iop_32Uto64, 
-                              assignNew(mce, Ity_I32, unop(Iop_16Uto32, vatom))));
-         case Ity_I8:  return assignNew(mce, tyH, unop(Iop_32Uto64, 
-                              assignNew(mce, Ity_I32, unop(Iop_8Uto32, vatom))));
-         default:      goto unhandled;
+         case Ity_I32:
+            return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
+         case Ity_I16:
+            return assignNew('V', mce, tyH, unop(Iop_32Uto64, 
+                   assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
+         case Ity_I8:
+            return assignNew('V', mce, tyH, unop(Iop_32Uto64, 
+                   assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
+         default:
+            goto unhandled;
       }
    } else {
       goto unhandled;
@@ -2860,7 +2951,7 @@
    // If we're not doing undefined value checking, pretend that this value
    // is "all valid".  That lets Vex's optimiser remove some of the V bit
    // shadow computation ops that precede it.
-   if (!MC_(clo_undef_value_errors)) {
+   if (MC_(clo_mc_level) == 1) {
       switch (ty) {
          case Ity_V128: c = IRConst_V128(V_BITS16_DEFINED); break; // V128 weirdness
          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
@@ -2930,16 +3021,16 @@
       }
 
       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
-      addrLo64  = assignNew(mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
-      vdataLo64 = assignNew(mce, Ity_I64, unop(Iop_V128to64, vdata));
+      addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
+      vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
       diLo64    = unsafeIRDirty_0_N( 
                      1/*regparms*/, 
                      hname, VG_(fnptr_to_fnentry)( helper ), 
                      mkIRExprVec_2( addrLo64, vdataLo64 )
                   );
       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
-      addrHi64  = assignNew(mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
-      vdataHi64 = assignNew(mce, Ity_I64, unop(Iop_V128HIto64, vdata));
+      addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
+      vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
       diHi64    = unsafeIRDirty_0_N( 
                      1/*regparms*/, 
                      hname, VG_(fnptr_to_fnentry)( helper ), 
@@ -2947,8 +3038,8 @@
                   );
       setHelperAnns( mce, diLo64 );
       setHelperAnns( mce, diHi64 );
-      stmt( mce->bb, IRStmt_Dirty(diLo64) );
-      stmt( mce->bb, IRStmt_Dirty(diHi64) );
+      stmt( 'V', mce, IRStmt_Dirty(diLo64) );
+      stmt( 'V', mce, IRStmt_Dirty(diHi64) );
 
    } else {
 
@@ -2958,7 +3049,7 @@
          addrAct = addr;
       } else {
          eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
-         addrAct = assignNew(mce, tyAddr, binop(mkAdd, addr, eBias) );
+         addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
       }
 
       if (ty == Ity_I64) {
@@ -2979,7 +3070,7 @@
               );
       }
       setHelperAnns( mce, di );
-      stmt( mce->bb, IRStmt_Dirty(di) );
+      stmt( 'V', mce, IRStmt_Dirty(di) );
    }
 
 }
@@ -3060,8 +3151,8 @@
          /* update 'curr' with UifU of the state slice 
             gOff .. gOff+n-1 */
          tySrc = szToITy( n );
-         src   = assignNew( mce, tySrc, 
-                            shadow_GET(mce, gOff, tySrc ) );
+         src   = assignNew( 'V', mce, tySrc, 
+                                 shadow_GET(mce, gOff, tySrc ) );
          here = mkPCastTo( mce, Ity_I32, src );
          curr = mkUifU32(mce, here, curr);
          gSz -= n;
@@ -3123,9 +3214,9 @@
 
    /* Outputs: the destination temporary, if there is one. */
    if (d->tmp != IRTemp_INVALID) {
-      dst   = findShadowTmp(mce, d->tmp);
+      dst   = findShadowTmpV(mce, d->tmp);
       tyDst = typeOfIRTemp(mce->bb->tyenv, d->tmp);
-      assign( mce->bb, dst, mkPCastTo( mce, tyDst, curr) );
+      assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
    }
 
    /* Outputs: guest state that we write or modify. */
@@ -3185,19 +3276,28 @@
    notify the A/V bit machinery of this fact.
 
    We call 
-   void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len );
+   void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
+                                                    Addr nia );
 */
 static
-void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len )
+void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
 {
    IRDirty* di;
+   /* Minor optimisation: if not doing origin tracking, ignore the
+      supplied nia and pass zero instead.  This is on the basis that
+      MC_(helperc_MAKE_STACK_UNINIT) will ignore it anyway, and we can
+      almost always generate a shorter instruction to put zero into a
+      register than any other value. */
+   if (MC_(clo_mc_level) < 3)
+      nia = mkIRExpr_HWord(0);
+
    di = unsafeIRDirty_0_N(
            0/*regparms*/,
            "MC_(helperc_MAKE_STACK_UNINIT)",
            VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT) ),
-           mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
+           mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
         );
-   stmt( mce->bb, IRStmt_Dirty(di) );
+   stmt( 'V', mce, IRStmt_Dirty(di) );
 }
 
 
@@ -3205,6 +3305,8 @@
 /*--- Memcheck main                                        ---*/
 /*------------------------------------------------------------*/
 
+static void schemeS ( MCEnv* mce, IRStmt* st );
+
 static Bool isBogusAtom ( IRAtom* at )
 {
    ULong n = 0;
@@ -3301,7 +3403,8 @@
       case Ist_Exit:
          return isBogusAtom(st->Ist.Exit.guard);
       case Ist_AbiHint:
-         return isBogusAtom(st->Ist.AbiHint.base);
+         return isBogusAtom(st->Ist.AbiHint.base)
+                || isBogusAtom(st->Ist.AbiHint.nia);
       case Ist_NoOp:
       case Ist_IMark:
       case Ist_MBE:
@@ -3320,7 +3423,7 @@
                         VexGuestExtents* vge,
                         IRType gWordTy, IRType hWordTy )
 {
-   Bool    verboze = False; //True; 
+   Bool    verboze = 0||False;
    Bool    bogus;
    Int     i, j, first_stmt;
    IRStmt* st;
@@ -3333,12 +3436,16 @@
    }
 
    /* Check we're not completely nuts */
-   tl_assert(sizeof(UWord) == sizeof(void*));
-   tl_assert(sizeof(Word)  == sizeof(void*));
-   tl_assert(sizeof(ULong) == 8);
-   tl_assert(sizeof(Long)  == 8);
-   tl_assert(sizeof(UInt)  == 4);
-   tl_assert(sizeof(Int)   == 4);
+   tl_assert(sizeof(UWord)  == sizeof(void*));
+   tl_assert(sizeof(Word)   == sizeof(void*));
+   tl_assert(sizeof(Addr)   == sizeof(void*));
+   tl_assert(sizeof(ULong)  == 8);
+   tl_assert(sizeof(Long)   == 8);
+   tl_assert(sizeof(Addr64) == 8);
+   tl_assert(sizeof(UInt)   == 4);
+   tl_assert(sizeof(Int)    == 4);
+
+   tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
 
    /* Set up SB */
    bb = deepCopyIRSBExceptStmts(bb_in);
@@ -3346,13 +3453,17 @@
    /* Set up the running environment.  Only .bb is modified as we go
       along. */
    mce.bb             = bb;
+   mce.trace          = verboze;
    mce.layout         = layout;
    mce.n_originalTmps = bb->tyenv->types_used;
    mce.hWordTy        = hWordTy;
    mce.bogusLiterals  = False;
-   mce.tmpMap         = LibVEX_Alloc(mce.n_originalTmps * sizeof(IRTemp));
-   for (i = 0; i < mce.n_originalTmps; i++)
-      mce.tmpMap[i] = IRTemp_INVALID;
+   mce.tmpMapV        = LibVEX_Alloc(mce.n_originalTmps * sizeof(IRTemp));
+   mce.tmpMapB        = LibVEX_Alloc(mce.n_originalTmps * sizeof(IRTemp));
+   for (i = 0; i < mce.n_originalTmps; i++) {
+      mce.tmpMapV[i] = IRTemp_INVALID;
+      mce.tmpMapB[i] = IRTemp_INVALID;
+   }
 
    /* Make a preliminary inspection of the statements, to see if there
       are any dodgy-looking literals.  If there are, we generate
@@ -3392,7 +3503,7 @@
       tl_assert(st);
       tl_assert(isFlatIRStmt(st));
 
-      addStmtToIRSB( bb, bb_in->stmts[i] );
+      stmt( 'C', &mce, bb_in->stmts[i] );
       i++;
    }
 
@@ -3416,12 +3527,12 @@
    */
    for (j = 0; j < i; j++) {
       if (bb_in->stmts[j]->tag == Ist_WrTmp) {
-         /* findShadowTmp checks its arg is an original tmp;
+         /* findShadowTmpV checks its arg is an original tmp;
             no need to assert that here. */
          IRTemp tmp_o = bb_in->stmts[j]->Ist.WrTmp.tmp;
-         IRTemp tmp_s = findShadowTmp(&mce, tmp_o);
+         IRTemp tmp_s = findShadowTmpV(&mce, tmp_o);
          IRType ty_s  = typeOfIRTemp(bb->tyenv, tmp_s);
-         assign( bb, tmp_s, definedOfType( ty_s ) );
+         assign( 'V', &mce, tmp_s, definedOfType( ty_s ) );
          if (0) {
             VG_(printf)("create shadow tmp for preamble tmp [%d] ty ", j);
             ppIRType( ty_s );
@@ -3443,17 +3554,21 @@
       first_stmt = bb->stmts_used;
 
       if (verboze) {
+         VG_(printf)("\n");
          ppIRStmt(st);
-         VG_(printf)("\n\n");
+         VG_(printf)("\n");
       }
 
+      if (MC_(clo_mc_level) == 3)
+         schemeS( &mce, st );
+
       /* Generate instrumentation code for each stmt ... */
 
       switch (st->tag) {
 
          case Ist_WrTmp:
-            assign( bb, findShadowTmp(&mce, st->Ist.WrTmp.tmp), 
-                        expr2vbits( &mce, st->Ist.WrTmp.data) );
+            assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp), 
+                               expr2vbits( &mce, st->Ist.WrTmp.data) );
             break;
 
          case Ist_Put:
@@ -3482,8 +3597,10 @@
             complainIfUndefined( &mce, st->Ist.Exit.guard );
             break;
 
-         case Ist_NoOp:
          case Ist_IMark:
+            break;
+
+         case Ist_NoOp:
          case Ist_MBE:
             break;
 
@@ -3492,7 +3609,9 @@
             break;
 
          case Ist_AbiHint:
-            do_AbiHint( &mce, st->Ist.AbiHint.base, st->Ist.AbiHint.len );
+            do_AbiHint( &mce, st->Ist.AbiHint.base,
+                              st->Ist.AbiHint.len,
+                              st->Ist.AbiHint.nia );
             break;
 
          default:
@@ -3503,7 +3622,7 @@
 
       } /* switch (st->tag) */
 
-      if (verboze) {
+      if (0 && verboze) {
          for (j = first_stmt; j < bb->stmts_used; j++) {
             VG_(printf)("   ");
             ppIRStmt(bb->stmts[j]);
@@ -3513,7 +3632,7 @@
       }
 
       /* ... and finally copy the stmt itself to the output. */
-      addStmtToIRSB(bb, st);
+      stmt('C', &mce, st);
 
    }
 
@@ -3528,7 +3647,7 @@
 
    complainIfUndefined( &mce, bb->next );
 
-   if (verboze) {
+   if (0 && verboze) {
       for (j = first_stmt; j < bb->stmts_used; j++) {
          VG_(printf)("   ");
          ppIRStmt(bb->stmts[j]);
@@ -3547,7 +3666,7 @@
 /* This exploits the observation that Memcheck often produces
    repeated conditional calls of the form
 
-   Dirty G MC_(helperc_value_check0/1/4/8_fail)()
+   Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
 
    with the same guard expression G guarding the same helper call.
    The second and subsequent calls are redundant.  This usually
@@ -3636,10 +3755,14 @@
 static Bool is_helperc_value_checkN_fail ( HChar* name )
 {
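+   /* Two flavours per size: "_no_o" (no origin) and "_w_o" (with an
+      origin: these take a trailing UInt otag argument; see above). */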
    return
-      0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail)")
-      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail)")
-      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail)")
-      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail)");
+      0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_no_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_no_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_no_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_no_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_w_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_w_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_w_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_w_o)");
 }
 
 IRSB* MC_(final_tidy) ( IRSB* sb_in )
@@ -3683,6 +3806,509 @@
 }
 
 
+/*------------------------------------------------------------*/
+/*--- Origin tracking stuff                                ---*/
+/*------------------------------------------------------------*/
+
+static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
+{
+   tl_assert(orig < mce->n_originalTmps);
+   if (mce->tmpMapB[orig] == IRTemp_INVALID) {
+      mce->tmpMapB[orig] 
+         = newIRTemp(mce->bb->tyenv, Ity_I32);
+   }
+   return mce->tmpMapB[orig];
+}
+
+static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
+{
+   return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
+}
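+
+/* An otag of zero means "no origin known", so joining two otags with
+   unsigned max lets any real origin win over "unknown": e.g.
+   maxU32(0,t) == t.  When both are nonzero, the numerically larger
+   (presumably more recently allocated) tag wins. */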
+
+static IRAtom* gen_load_b ( MCEnv* mce, Int szB, 
+                            IRAtom* baseaddr, Int offset )
+{
+   void*    hFun;
+   HChar*   hName;
+   IRTemp   bTmp;
+   IRDirty* di;
+   IRType   aTy   = typeOfIRExpr( mce->bb->tyenv, baseaddr );
+   IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
+   IRAtom*  ea    = baseaddr;
+   if (offset != 0) {
+      IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
+                                   : mkU64( (Long)(Int)offset );
+      ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
+   }
+   bTmp = newIRTemp(mce->bb->tyenv, mce->hWordTy);
+
+   switch (szB) {
+      case 1: hFun  = (void*)&MC_(helperc_b_load1);
+              hName = "MC_(helperc_b_load1)";
+              break;
+      case 2: hFun  = (void*)&MC_(helperc_b_load2);
+              hName = "MC_(helperc_b_load2)";
+              break;
+      case 4: hFun  = (void*)&MC_(helperc_b_load4);
+              hName = "MC_(helperc_b_load4)";
+              break;
+      case 8: hFun  = (void*)&MC_(helperc_b_load8);
+              hName = "MC_(helperc_b_load8)";
+              break;
+      case 16: hFun  = (void*)&MC_(helperc_b_load16);
+               hName = "MC_(helperc_b_load16)";
+               break;
+      default:
+         VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
+         tl_assert(0);
+   }
+   di = unsafeIRDirty_1_N(
+           bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
+           mkIRExprVec_1( ea )
+        );
+   /* no need to mess with any annotations.  This call accesses
+      neither guest state nor guest memory. */
+   stmt( 'B', mce, IRStmt_Dirty(di) );
+   if (mce->hWordTy == Ity_I64) {
+      /* 64-bit host */
+      IRTemp bTmp32 = newIRTemp(mce->bb->tyenv, Ity_I32);
+      assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
+      return mkexpr(bTmp32);
+   } else {
+      /* 32-bit host */
+      return mkexpr(bTmp);
+   }
+}
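+
+/* For example (a sketch, with illustrative tmp names, assuming a
+   64-bit host): a 4-byte load at baseaddr+8 becomes roughly
+
+      tEA  = Add64(baseaddr, 8)
+      tB   = DIRTY ::: MC_(helperc_b_load4)(tEA)   (tB :: Ity_I64)
+      tB32 = 64to32(tB)
+
+   and tB32 is returned as the 32-bit origin tag for the loaded
+   data. */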
+static void gen_store_b ( MCEnv* mce, Int szB,
+                          IRAtom* baseaddr, Int offset, IRAtom* dataB )
+{
+   void*    hFun;
+   HChar*   hName;
+   IRDirty* di;
+   IRType   aTy   = typeOfIRExpr( mce->bb->tyenv, baseaddr );
+   IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
+   IRAtom*  ea    = baseaddr;
+   if (offset != 0) {
+      IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
+                                   : mkU64( (Long)(Int)offset );
+      ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
+   }
+   if (mce->hWordTy == Ity_I64)
+      dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
+
+   switch (szB) {
+      case 1: hFun  = (void*)&MC_(helperc_b_store1);
+              hName = "MC_(helperc_b_store1)";
+              break;
+      case 2: hFun  = (void*)&MC_(helperc_b_store2);
+              hName = "MC_(helperc_b_store2)";
+              break;
+      case 4: hFun  = (void*)&MC_(helperc_b_store4);
+              hName = "MC_(helperc_b_store4)";
+              break;
+      case 8: hFun  = (void*)&MC_(helperc_b_store8);
+              hName = "MC_(helperc_b_store8)";
+              break;
+      case 16: hFun  = (void*)&MC_(helperc_b_store16);
+               hName = "MC_(helperc_b_store16)";
+               break;
+      default:
+         VG_(printf)("mc_translate.c: gen_store_b: unhandled szB == %d\n",
+                     szB);
+         tl_assert(0);
+   }
+   di = unsafeIRDirty_0_N( 2/*regparms*/,
+           hName, VG_(fnptr_to_fnentry)( hFun ),
+           mkIRExprVec_2( ea, dataB )
+        );
+   /* no need to mess with any annotations.  This call accesses
+      neither guest state nor guest memory. */
+   stmt( 'B', mce, IRStmt_Dirty(di) );
+}
+
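+/* Helpers to adapt between the 32-bit otag domain and the width of
+   the shadow location being accessed; widening is zero-extension,
+   since otags are unsigned 32-bit values. */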
+static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
+   IRType eTy = typeOfIRExpr(mce->bb->tyenv, e);
+   if (eTy == Ity_I64)
+      return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
+   if (eTy == Ity_I32)
+      return e;
+   tl_assert(0);
+}
+
+static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
+   IRType eTy = typeOfIRExpr(mce->bb->tyenv, e);
+   tl_assert(eTy == Ity_I32);
+   if (dstTy == Ity_I64)
+      return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
+   tl_assert(0);
+}
+
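+/* Compute a 32-bit origin tag (B-value) for the expression e, by
+   pessimistically joining (gen_maxU32) the tags of all of e's
+   inputs.  Constants get tag 0 ("no origin"); loads consult shadow
+   memory via gen_load_b; Get/GetI consult the B shadow of the guest
+   state, which lives at offset 2 * layout->total_sizeB (the real
+   state and its V-bit shadow come first). */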
+static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
+{
+   tl_assert(MC_(clo_mc_level) == 3);
+
+   switch (e->tag) {
+
+      case Iex_GetI: {
+         IRRegArray* descr_b;
+         IRAtom      *t1, *t2, *t3, *t4;
+         IRRegArray* descr      = e->Iex.GetI.descr;
+         IRType equivIntTy 
+            = MC_(get_otrack_reg_array_equiv_int_type)(descr);
+         /* If this array is unshadowable for whatever reason, use the
+            usual approximation. */
+         if (equivIntTy == Ity_INVALID)
+            return mkU32(0);
+         tl_assert(sizeofIRType(equivIntTy) >= 4);
+         tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
+         descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
+                                 equivIntTy, descr->nElems );
+         /* Do a shadow indexed get of the same size, giving t1.  Take
+            the bottom 32 bits of it, giving t2.  Compute into t3 the
+            origin for the index (almost certainly zero, but there's
+            no harm in being completely general here, since iropt will
+            remove any useless code), and fold it in, giving a final
+            value t4. */
+         t1 = assignNew( 'B', mce, equivIntTy, 
+                          IRExpr_GetI( descr_b, e->Iex.GetI.ix, 
+                                                e->Iex.GetI.bias ));
+         t2 = narrowTo32( mce, t1 );
+         t3 = schemeE( mce, e->Iex.GetI.ix );
+         t4 = gen_maxU32( mce, t2, t3 );
+         return t4;
+      }
+      case Iex_CCall: {
+         Int i;
+         IRAtom*  here;
+         IRExpr** args = e->Iex.CCall.args;
+         IRAtom*  curr = mkU32(0);
+         for (i = 0; args[i]; i++) {
+            tl_assert(i < 32);
+            tl_assert(isOriginalAtom(mce, args[i]));
+            /* Only take notice of this arg if the callee's
+               mc-exclusion mask does not say it is to be excluded. */
+            if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
+               /* the arg is to be excluded from origin tracking.
+                  Do nothing. */
+               if (0) VG_(printf)("excluding %s(%d)\n",
+                                  e->Iex.CCall.cee->name, i);
+            } else {
+               /* calculate the arg's origin, and pessimistically
+                  merge it in. */
+               here = schemeE( mce, args[i] );
+               curr = gen_maxU32( mce, curr, here );
+            }
+         }
+         return curr;
+      }
+      case Iex_Load: {
+         Int dszB;
+         dszB = sizeofIRType(e->Iex.Load.ty);
+         /* assert that the B value for the address is already
+            available (somewhere) */
+         tl_assert(isIRAtom(e->Iex.Load.addr));
+         tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
+         return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
+      }
+      case Iex_Mux0X: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Mux0X.cond );
+         IRAtom* b2 = schemeE( mce, e->Iex.Mux0X.expr0 );
+         IRAtom* b3 = schemeE( mce, e->Iex.Mux0X.exprX );
+         return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
+      }
+      case Iex_Qop: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Qop.arg1 );
+         IRAtom* b2 = schemeE( mce, e->Iex.Qop.arg2 );
+         IRAtom* b3 = schemeE( mce, e->Iex.Qop.arg3 );
+         IRAtom* b4 = schemeE( mce, e->Iex.Qop.arg4 );
+         return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
+                                 gen_maxU32( mce, b3, b4 ) );
+      }
+      case Iex_Triop: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Triop.arg1 );
+         IRAtom* b2 = schemeE( mce, e->Iex.Triop.arg2 );
+         IRAtom* b3 = schemeE( mce, e->Iex.Triop.arg3 );
+         return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
+      }
+      case Iex_Binop: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
+         IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
+         return gen_maxU32( mce, b1, b2 );
+      }
+      case Iex_Unop: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
+         return b1;
+      }
+      case Iex_Const:
+         return mkU32(0);
+      case Iex_RdTmp:
+         return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
+      case Iex_Get: {
+         Int b_offset = MC_(get_otrack_shadow_offset)( 
+                           e->Iex.Get.offset,
+                           sizeofIRType(e->Iex.Get.ty) 
+                        );
+         tl_assert(b_offset >= -1
+                   && b_offset <= mce->layout->total_sizeB - 4);
+         if (b_offset >= 0) {
+            /* FIXME: this isn't an atom! */
+            return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
+                               Ity_I32 );
+         }
+         return mkU32(0);
+      }
+      default:
+         VG_(printf)("mc_translate.c: schemeE: unhandled: ");
+         ppIRExpr(e); 
+         VG_(tool_panic)("memcheck:schemeE");
+   }
+}
+
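+/* Instrument a dirty helper call for origin tracking: join the
+   origin tags of the guard, the unmasked args, any guest state read
+   and any memory read, then broadcast the resulting single tag to
+   the destination tmp, any guest state written and any memory
+   written. */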
+static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
+{
+   /* Adapted from do_shadow_Dirty. */
+   Int       i, n, offset, toDo, gSz, gOff;
+   IRAtom    *here, *curr;
+   IRTemp    dst;
+
+   /* First, the origin of the guard. */
+   curr = schemeE( mce, d->guard );
+
+   /* Now round up all inputs and maxU32 over them. */
+
+   /* Inputs: unmasked args */
+   for (i = 0; d->args[i]; i++) {
+      if (d->cee->mcx_mask & (1<<i)) {
+         /* ignore this arg */
+      } else {
+         here = schemeE( mce, d->args[i] );
+         curr = gen_maxU32( mce, curr, here );
+      }
+   }
+
+   /* Inputs: guest state that we read. */
+   for (i = 0; i < d->nFxState; i++) {
+      tl_assert(d->fxState[i].fx != Ifx_None);
+      if (d->fxState[i].fx == Ifx_Write)
+         continue;
+
+      /* Ignore any sections marked as 'always defined'. */
+      if (isAlwaysDefd(mce, d->fxState[i].offset, d->fxState[i].size )) {
+         if (0)
+            VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
+                        d->fxState[i].offset, d->fxState[i].size );
+         continue;
+      }
+
+      /* This state element is read or modified.  So we need to
+         consider it.  If larger than 4 bytes, deal with it in 4-byte
+         chunks. */
+      gSz  = d->fxState[i].size;
+      gOff = d->fxState[i].offset;
+      tl_assert(gSz > 0);
+      while (True) {
+         Int b_offset;
+         if (gSz == 0) break;
+         n = gSz <= 4 ? gSz : 4;
+         /* update 'curr' with maxU32 of the state slice 
+            gOff .. gOff+n-1 */
+         b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
+         if (b_offset != -1) {
+            here = assignNew( 'B', mce,
+                              Ity_I32,
+                              IRExpr_Get(b_offset + 2*mce->layout->total_sizeB,
+                                         Ity_I32));
+            curr = gen_maxU32( mce, curr, here );
+         }
+         gSz -= n;
+         gOff += n;
+      }
+
+   }
+
+   /* Inputs: memory */
+
+   if (d->mFx != Ifx_None) {
+      /* Because we may do multiple shadow loads/stores from the same
+         base address, it's best to compute its origin tag just once,
+         right now.  Post-instrumentation optimisation should remove
+         all but this computation. */
+      tl_assert(d->mAddr);
+      here = schemeE( mce, d->mAddr );
+      curr = gen_maxU32( mce, curr, here );
+   }
+
+   /* Deal with memory inputs (reads or modifies) */
+   if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
+      offset = 0;
+      toDo   = d->mSize;
+      /* chew off 32-bit chunks, merging each chunk's origin tag
+         into curr.  Endianness is irrelevant here, since the tags
+         are simply maxU32'd together. */
+      while (toDo >= 4) {
+         here = gen_load_b( mce, 4, d->mAddr, d->mSize - toDo );
+         curr = gen_maxU32( mce, curr, here );
+         toDo -= 4;
+      }
+      if (toDo != 0)
+         VG_(printf)("Approx: do_origins_Dirty(R): missed %d bytes\n",
+                     (Int)toDo );
+      //tl_assert(toDo == 0); /* also need to handle 1,2-byte excess */
+   }
+
+   /* Whew!  So curr is a 32-bit B-value which should give an origin
+      of some use if any of the inputs to the helper are undefined.
+      Now we need to re-distribute the results to all destinations. */
+
+   /* Outputs: the destination temporary, if there is one. */
+   if (d->tmp != IRTemp_INVALID) {
+      dst   = findShadowTmpB(mce, d->tmp);
+      assign( 'B', mce, dst, curr );
+   }
+
+   /* Outputs: guest state that we write or modify. */
+   for (i = 0; i < d->nFxState; i++) {
+      tl_assert(d->fxState[i].fx != Ifx_None);
+      if (d->fxState[i].fx == Ifx_Read)
+         continue;
+
+      /* Ignore any sections marked as 'always defined'. */
+      if (isAlwaysDefd(mce, d->fxState[i].offset, d->fxState[i].size ))
+         continue;
+
+      /* This state element is written or modified.  So we need to
+         consider it.  If larger than 4 bytes, deal with it in 4-byte
+         chunks. */
+      gSz  = d->fxState[i].size;
+      gOff = d->fxState[i].offset;
+      tl_assert(gSz > 0);
+      while (True) {
+         Int b_offset;
+         if (gSz == 0) break;
+         n = gSz <= 4 ? gSz : 4;
+         /* Write 'curr' to the state slice gOff .. gOff+n-1 */
+         b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
+         if (b_offset != -1) {
+            stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
+                                       curr ));
+         }
+         gSz -= n;
+         gOff += n;
+      }
+   }
+
+   /* Outputs: memory that we write or modify. */
+   if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
+      offset = 0;
+      toDo   = d->mSize;
+      /* chew off 32-bit chunks */
+      while (toDo >= 4) {
+         gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr );
+         toDo -= 4;
+      }
+      if (toDo != 0)
+         VG_(printf)("Approx: do_origins_Dirty(W): missed %d bytes\n",
+                     (Int)toDo );
+      //tl_assert(toDo == 0); /* also need to handle 1,2-byte excess */
+   }
+
+}
+
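+/* Generate the origin-tracking (B shadow) instrumentation for one
+   statement.  Only called when MC_(clo_mc_level) == 3, i.e. when
+   --track-origins=yes is in force. */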
+static void schemeS ( MCEnv* mce, IRStmt* st )
+{
+   tl_assert(MC_(clo_mc_level) == 3);
+
+   switch (st->tag) {
+
+      case Ist_AbiHint:
+         /* The value-check instrumenter handles this - by arranging
+            to pass the address of the next instruction to
+            MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
+            happen for origin tracking w.r.t. AbiHints.  So there is
+            nothing to do here. */
+         break;
+
+      case Ist_PutI: {
+         IRRegArray* descr_b;
+         IRAtom      *t1, *t2, *t3, *t4;
+         IRRegArray* descr = st->Ist.PutI.descr;
+         IRType equivIntTy
+            = MC_(get_otrack_reg_array_equiv_int_type)(descr);
+         /* If this array is unshadowable for whatever reason,
+            generate no code. */
+         if (equivIntTy == Ity_INVALID)
+            break;
+         tl_assert(sizeofIRType(equivIntTy) >= 4);
+         tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
+         descr_b
+            = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
+                            equivIntTy, descr->nElems );
+         /* Compute a value to Put - the conjoinment of the origin for
+            the data to be Put-ted (obviously) and of the index value
+            (not so obviously). */
+         t1 = schemeE( mce, st->Ist.PutI.data );
+         t2 = schemeE( mce, st->Ist.PutI.ix );
+         t3 = gen_maxU32( mce, t1, t2 );
+         t4 = zWidenFrom32( mce, equivIntTy, t3 );
+         stmt( 'B', mce, IRStmt_PutI( descr_b, st->Ist.PutI.ix,
+                                      st->Ist.PutI.bias, t4 ));
+         break;
+      }
+      case Ist_Dirty:
+         do_origins_Dirty( mce, st->Ist.Dirty.details );
+         break;
+      case Ist_Store: {
+         Int     dszB;
+         IRAtom* dataB;
+         /* assert that the B value for the address is already
+            available (somewhere) */
+         tl_assert(isIRAtom(st->Ist.Store.addr));
+         dszB = sizeofIRType(
+                   typeOfIRExpr(mce->bb->tyenv, st->Ist.Store.data ));
+         dataB = schemeE( mce, st->Ist.Store.data );
+         gen_store_b( mce, dszB, st->Ist.Store.addr, 0/*offset*/, dataB );
+         break;
+      }
+      case Ist_Put: {
+         Int b_offset
+            = MC_(get_otrack_shadow_offset)(
+                 st->Ist.Put.offset,
+                 sizeofIRType(typeOfIRExpr(mce->bb->tyenv, st->Ist.Put.data))
+              );
+         if (b_offset >= 0) {
+            /* FIXME: this isn't an atom! */
+            stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB, 
+                                       schemeE( mce, st->Ist.Put.data )) );
+         }
+         break;
+      }
+      case Ist_WrTmp:
+         assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
+                           schemeE(mce, st->Ist.WrTmp.data) );
+         break;
+      case Ist_MBE:
+      case Ist_NoOp:
+      case Ist_Exit:
+      case Ist_IMark:
+         break;
+      default:
+         VG_(printf)("mc_translate.c: schemeS: unhandled: ");
+         ppIRStmt(st); 
+         VG_(tool_panic)("memcheck:schemeS");
+   }
+}
+
+
 /*--------------------------------------------------------------------*/
 /*--- end                                           mc_translate.c ---*/
 /*--------------------------------------------------------------------*/