| |
| /*---------------------------------------------------------------*/ |
| /*--- ---*/ |
| /*--- This file (ir/iropt.c) is ---*/ |
| /*--- Copyright (C) OpenWorks LLP. All rights reserved. ---*/ |
| /*--- ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* |
| This file is part of LibVEX, a library for dynamic binary |
| instrumentation and translation. |
| |
| Copyright (C) 2004-2005 OpenWorks LLP. All rights reserved. |
| |
| This library is made available under a dual licensing scheme. |
| |
| If you link LibVEX against other code all of which is itself |
| licensed under the GNU General Public License, version 2 dated June |
| 1991 ("GPL v2"), then you may use LibVEX under the terms of the GPL |
| v2, as appearing in the file LICENSE.GPL. If the file LICENSE.GPL |
| is missing, you can obtain a copy of the GPL v2 from the Free |
| Software Foundation Inc., 51 Franklin St, Fifth Floor, Boston, MA |
| 02110-1301, USA. |
| |
| For any other uses of LibVEX, you must first obtain a commercial |
| license from OpenWorks LLP. Please contact info@open-works.co.uk |
| for information about commercial licensing. |
| |
| This software is provided by OpenWorks LLP "as is" and any express |
| or implied warranties, including, but not limited to, the implied |
| warranties of merchantability and fitness for a particular purpose |
| are disclaimed. In no event shall OpenWorks LLP be liable for any |
| direct, indirect, incidental, special, exemplary, or consequential |
| damages (including, but not limited to, procurement of substitute |
| goods or services; loss of use, data, or profits; or business |
| interruption) however caused and on any theory of liability, |
| whether in contract, strict liability, or tort (including |
| negligence or otherwise) arising in any way out of the use of this |
| software, even if advised of the possibility of such damage. |
| |
| Neither the names of the U.S. Department of Energy nor the |
| University of California nor the names of its contributors may be |
| used to endorse or promote products derived from this software |
| without prior written permission. |
| */ |
| |
| #include "libvex_basictypes.h" |
| #include "libvex_ir.h" |
| #include "libvex.h" |
| |
| #include "main/vex_util.h" |
| #include "main/vex_globals.h" |
| #include "ir/iropt.h" |
| |
| |
| /* Set to 1 for lots of debugging output. */ |
| #define DEBUG_IROPT 0 |
| |
| |
| /* What iropt does, 29 Dec 04. |
| |
| It takes an IRBB and produces a new one with the same meaning, |
| defined thus: |
| |
| After execution of the new BB, all guest state and guest memory is |
| the same as after execution of the original. This is true |
| regardless of how the block was exited (at the end vs side exit). |
| |
| In addition, parts of the guest state will be identical to that |
| created by execution of the original at the following observation |
| points: |
| |
| * In a dirty helper call, any parts of the guest state that the |
| helper states that it reads or modifies will be up to date. |
| Also, guest memory will be up to date. Parts of the guest state |
| not marked as being read or modified by the helper cannot be |
| assumed to be up-to-date at the point where the helper is called. |
| |
| * Immediately prior to any load or store, those parts of the guest |
| state marked as requiring precise exceptions will be up to date. |
| Also, guest memory will be up to date. Parts of the guest state |
| not marked as requiring precise exceptions cannot be assumed to |
| be up-to-date at the point of the load/store. |
| |
| The relative order of loads and stores (including loads/stores of |
| guest memory done by dirty helpers annotated as such) is not |
| changed. However, the relative order of loads with no intervening |
| stores/modifies may be changed. |
| |
| Transformation order |
| ~~~~~~~~~~~~~~~~~~~~ |
| |
| There are three levels of optimisation, controlled by |
| vex_control.iropt_level. Define first: |
| |
| "Cheap transformations" are the following sequence: |
| * Redundant-Get removal |
| * Redundant-Put removal |
| * Constant propagation/folding |
| * Dead code removal |
| * Specialisation of clean helper functions |
| * Dead code removal |
| |
| "Expensive transformations" are the following sequence: |
| * CSE |
| * Folding of add/sub chains |
| * Redundant-GetI removal |
| * Redundant-PutI removal |
| * Dead code removal |
| |
| Then the transformations are as follows, as defined by |
| vex_control.iropt_level: |
| |
| Level 0: |
| * Flatten into atomic form. |
| |
| Level 1: the following sequence: |
| * Flatten into atomic form. |
| * Cheap transformations. |
| |
| Level 2: the following sequence |
| * Flatten into atomic form. |
| * Cheap transformations. |
| * If block contains GetI or PutI, Expensive transformations. |
| * Try unrolling loops. Three possible outcomes: |
| - No effect: do nothing more. |
| - Unrolled a loop, and block does not contain GetI or PutI: |
| Do: * CSE |
| * Dead code removal |
| - Unrolled a loop, and block contains GetI or PutI: |
| Do: * Expensive transformations |
| * Cheap transformations |
| */ |
| |
| /* Implementation notes, 29 Dec 04. |
| |
| TODO (important): I think rPutI removal ignores precise exceptions |
| and is therefore, in a sense, wrong: it assumes that PutIs never |
| write parts of the guest state that we need to have up-to-date at |
| loads/stores. So far on the x86 guest that has not mattered, since |
| indeed only the x87 FP registers and tags are accessed using |
| GetI/PutI, and there is no need so far for them to be up to date |
| at mem exception points. The rPutI pass should be fixed. |
| |
| TODO: improve pessimistic handling of precise exceptions |
| in the tree builder. |
| |
| TODO: check interaction of rGetI and dirty helpers. |
| |
| F64i constants are treated differently from other constants. |
| They are not regarded as atoms, and instead lifted off and |
| bound to temps. This allows them to participate in CSE, which |
| is important for getting good performance for x86 guest code. |
| |
| TODO: CSE up F64 literals (already doing F64is). |
| |
| TODO: CSE: consider carefully the requirement for precise exns |
| prior to making CSE any more aggressive. */ |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- Finite mappery, of a sort ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* General map from HWord-sized thing to HWord-sized thing. Could be |
| implemented by hashing, but it's not clear whether or not this would |
| really be any faster. */ |
| |
| typedef |
| struct { |
| Bool* inuse; |
| HWord* key; |
| HWord* val; |
| Int size; |
| Int used; |
| } |
| HashHW; |
| |
| static HashHW* newHHW ( void ) |
| { |
| HashHW* h = LibVEX_Alloc(sizeof(HashHW)); |
| h->size = 8; |
| h->used = 0; |
| h->inuse = LibVEX_Alloc(h->size * sizeof(Bool)); |
| h->key = LibVEX_Alloc(h->size * sizeof(HWord)); |
| h->val = LibVEX_Alloc(h->size * sizeof(HWord)); |
| return h; |
| } |
| |
| |
| /* Look up key in the map. */ |
| |
| static Bool lookupHHW ( HashHW* h, /*OUT*/HWord* val, HWord key ) |
| { |
| Int i; |
| /* vex_printf("lookupHHW(%llx)\n", key ); */ |
| for (i = 0; i < h->used; i++) { |
| if (h->inuse[i] && h->key[i] == key) { |
| if (val) |
| *val = h->val[i]; |
| return True; |
| } |
| } |
| return False; |
| } |
| |
| |
| /* Add key->val to the map. Replaces any existing binding for key. */ |
| |
| static void addToHHW ( HashHW* h, HWord key, HWord val ) |
| { |
| Int i, j; |
| /* vex_printf("addToHHW(%llx, %llx)\n", key, val); */ |
| |
| /* Find and replace existing binding, if any. */ |
| for (i = 0; i < h->used; i++) { |
| if (h->inuse[i] && h->key[i] == key) { |
| h->val[i] = val; |
| return; |
| } |
| } |
| |
| /* Ensure a space is available. */ |
| if (h->used == h->size) { |
| /* Copy into arrays twice the size. */ |
| Bool* inuse2 = LibVEX_Alloc(2 * h->size * sizeof(Bool)); |
| HWord* key2 = LibVEX_Alloc(2 * h->size * sizeof(HWord)); |
| HWord* val2 = LibVEX_Alloc(2 * h->size * sizeof(HWord)); |
| for (i = j = 0; i < h->size; i++) { |
| if (!h->inuse[i]) continue; |
| inuse2[j] = True; |
| key2[j] = h->key[i]; |
| val2[j] = h->val[i]; |
| j++; |
| } |
| h->used = j; |
| h->size *= 2; |
| h->inuse = inuse2; |
| h->key = key2; |
| h->val = val2; |
| } |
| |
| /* Finally, add it. */ |
| vassert(h->used < h->size); |
| h->inuse[h->used] = True; |
| h->key[h->used] = key; |
| h->val[h->used] = val; |
| h->used++; |
| } |
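| /* Illustrative use (a sketch only, with made-up key/val; not called |
| anywhere in this file): |
| HashHW* h = newHHW(); |
| HWord v; |
| addToHHW( h, (HWord)key, (HWord)val ); |
| if (lookupHHW( h, &v, (HWord)key )) ... -- v now equals val |
| Bindings are never freed; the passes below "delete" entries simply |
| by clearing the corresponding inuse[] flags. */ |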
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- Flattening out a BB into atomic SSA form ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* Non-critical helper, heuristic for reducing the number of tmp-tmp |
| copies made by flattening. If in doubt return False. */ |
| |
| static Bool isFlat ( IRExpr* e ) |
| { |
| if (e->tag == Iex_Get) |
| return True; |
| if (e->tag == Iex_Binop) |
| return toBool( isIRAtom(e->Iex.Binop.arg1) |
| && isIRAtom(e->Iex.Binop.arg2) ); |
| if (e->tag == Iex_Load) |
| return isIRAtom(e->Iex.Load.addr); |
| return False; |
| } |
| |
| /* Flatten out 'ex' so it is atomic, returning a new expression with |
| the same value, after having appended extra IRTemp assignments to |
| the end of 'bb'. */ |
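| /* For example (an illustrative sketch, temp numbers made up): |
| flattening the expression Add32(Get(8), Load(t9)) appends |
| t10 = Get(8) |
| t11 = Load(t9) |
| t12 = Add32(t10,t11) |
| to 'bb' and returns the atom t12. */ |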
| |
| static IRExpr* flatten_Expr ( IRBB* bb, IRExpr* ex ) |
| { |
| Int i; |
| IRExpr** newargs; |
| IRType ty = typeOfIRExpr(bb->tyenv, ex); |
| IRTemp t1; |
| |
| switch (ex->tag) { |
| |
| case Iex_GetI: |
| t1 = newIRTemp(bb->tyenv, ty); |
| addStmtToIRBB(bb, IRStmt_Tmp(t1, |
| IRExpr_GetI(ex->Iex.GetI.descr, |
| flatten_Expr(bb, ex->Iex.GetI.ix), |
| ex->Iex.GetI.bias))); |
| return IRExpr_Tmp(t1); |
| |
| case Iex_Get: |
| t1 = newIRTemp(bb->tyenv, ty); |
| addStmtToIRBB(bb, |
| IRStmt_Tmp(t1, ex)); |
| return IRExpr_Tmp(t1); |
| |
| case Iex_Binop: |
| t1 = newIRTemp(bb->tyenv, ty); |
| addStmtToIRBB(bb, IRStmt_Tmp(t1, |
| IRExpr_Binop(ex->Iex.Binop.op, |
| flatten_Expr(bb, ex->Iex.Binop.arg1), |
| flatten_Expr(bb, ex->Iex.Binop.arg2)))); |
| return IRExpr_Tmp(t1); |
| |
| case Iex_Unop: |
| t1 = newIRTemp(bb->tyenv, ty); |
| addStmtToIRBB(bb, IRStmt_Tmp(t1, |
| IRExpr_Unop(ex->Iex.Unop.op, |
| flatten_Expr(bb, ex->Iex.Unop.arg)))); |
| return IRExpr_Tmp(t1); |
| |
| case Iex_Load: |
| t1 = newIRTemp(bb->tyenv, ty); |
| addStmtToIRBB(bb, IRStmt_Tmp(t1, |
| IRExpr_Load(ex->Iex.Load.end, |
| ex->Iex.Load.ty, |
| flatten_Expr(bb, ex->Iex.Load.addr)))); |
| return IRExpr_Tmp(t1); |
| |
| case Iex_CCall: |
| newargs = sopyIRExprVec(ex->Iex.CCall.args); |
| for (i = 0; newargs[i]; i++) |
| newargs[i] = flatten_Expr(bb, newargs[i]); |
| t1 = newIRTemp(bb->tyenv, ty); |
| addStmtToIRBB(bb, IRStmt_Tmp(t1, |
| IRExpr_CCall(ex->Iex.CCall.cee, |
| ex->Iex.CCall.retty, |
| newargs))); |
| return IRExpr_Tmp(t1); |
| |
| case Iex_Mux0X: |
| t1 = newIRTemp(bb->tyenv, ty); |
| addStmtToIRBB(bb, IRStmt_Tmp(t1, |
| IRExpr_Mux0X(flatten_Expr(bb, ex->Iex.Mux0X.cond), |
| flatten_Expr(bb, ex->Iex.Mux0X.expr0), |
| flatten_Expr(bb, ex->Iex.Mux0X.exprX)))); |
| return IRExpr_Tmp(t1); |
| |
| case Iex_Const: |
| /* Lift F64i constants out onto temps so they can be CSEd |
| later. */ |
| if (ex->Iex.Const.con->tag == Ico_F64i) { |
| t1 = newIRTemp(bb->tyenv, ty); |
| addStmtToIRBB(bb, IRStmt_Tmp(t1, |
| IRExpr_Const(ex->Iex.Const.con))); |
| return IRExpr_Tmp(t1); |
| } else { |
| /* Leave all other constants alone. */ |
| return ex; |
| } |
| |
| case Iex_Tmp: |
| return ex; |
| |
| default: |
| vex_printf("\n"); |
| ppIRExpr(ex); |
| vex_printf("\n"); |
| vpanic("flatten_Expr"); |
| } |
| } |
| |
| |
| /* Append a completely flattened form of 'st' to the end of 'bb'. */ |
| |
| static void flatten_Stmt ( IRBB* bb, IRStmt* st ) |
| { |
| Int i; |
| IRExpr *e1, *e2; |
| IRDirty *d, *d2; |
| switch (st->tag) { |
| case Ist_Put: |
| if (isIRAtom(st->Ist.Put.data)) { |
| /* optimisation to reduce the amount of heap wasted |
| by the flattener */ |
| addStmtToIRBB(bb, st); |
| } else { |
| /* general case, always correct */ |
| e1 = flatten_Expr(bb, st->Ist.Put.data); |
| addStmtToIRBB(bb, IRStmt_Put(st->Ist.Put.offset, e1)); |
| } |
| break; |
| case Ist_PutI: |
| e1 = flatten_Expr(bb, st->Ist.PutI.ix); |
| e2 = flatten_Expr(bb, st->Ist.PutI.data); |
| addStmtToIRBB(bb, IRStmt_PutI(st->Ist.PutI.descr, |
| e1, |
| st->Ist.PutI.bias, |
| e2)); |
| break; |
| case Ist_Tmp: |
| if (isFlat(st->Ist.Tmp.data)) { |
| /* optimisation, to reduce the number of tmp-tmp |
| copies generated */ |
| addStmtToIRBB(bb, st); |
| } else { |
| /* general case, always correct */ |
| e1 = flatten_Expr(bb, st->Ist.Tmp.data); |
| addStmtToIRBB(bb, IRStmt_Tmp(st->Ist.Tmp.tmp, e1)); |
| } |
| break; |
| case Ist_Store: |
| e1 = flatten_Expr(bb, st->Ist.Store.addr); |
| e2 = flatten_Expr(bb, st->Ist.Store.data); |
| addStmtToIRBB(bb, IRStmt_Store(st->Ist.Store.end, e1,e2)); |
| break; |
| case Ist_Dirty: |
| d = st->Ist.Dirty.details; |
| d2 = emptyIRDirty(); |
| *d2 = *d; |
| d2->args = sopyIRExprVec(d2->args); |
| if (d2->mFx != Ifx_None) { |
| d2->mAddr = flatten_Expr(bb, d2->mAddr); |
| } else { |
| vassert(d2->mAddr == NULL); |
| } |
| d2->guard = flatten_Expr(bb, d2->guard); |
| for (i = 0; d2->args[i]; i++) |
| d2->args[i] = flatten_Expr(bb, d2->args[i]); |
| addStmtToIRBB(bb, IRStmt_Dirty(d2)); |
| break; |
| case Ist_NoOp: |
| case Ist_MFence: |
| case Ist_IMark: |
| addStmtToIRBB(bb, st); |
| break; |
| case Ist_AbiHint: |
| e1 = flatten_Expr(bb, st->Ist.AbiHint.base); |
| addStmtToIRBB(bb, IRStmt_AbiHint(e1, st->Ist.AbiHint.len)); |
| break; |
| case Ist_Exit: |
| e1 = flatten_Expr(bb, st->Ist.Exit.guard); |
| addStmtToIRBB(bb, IRStmt_Exit(e1, st->Ist.Exit.jk, |
| st->Ist.Exit.dst)); |
| break; |
| default: |
| vex_printf("\n"); |
| ppIRStmt(st); |
| vex_printf("\n"); |
| vpanic("flatten_Stmt"); |
| } |
| } |
| |
| |
| static IRBB* flatten_BB ( IRBB* in ) |
| { |
| Int i; |
| IRBB* out; |
| out = emptyIRBB(); |
| out->tyenv = dopyIRTypeEnv( in->tyenv ); |
| for (i = 0; i < in->stmts_used; i++) |
| if (in->stmts[i]) |
| flatten_Stmt( out, in->stmts[i] ); |
| out->next = flatten_Expr( out, in->next ); |
| out->jumpkind = in->jumpkind; |
| return out; |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- In-place removal of redundant GETs ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* Scan forwards, building up an environment binding (min offset, max |
| offset) pairs to values, which will either be temps or constants. |
| |
| On seeing 't = Get(minoff,maxoff)', look up (minoff,maxoff) in the |
| env and if it matches, replace the Get with the stored value. If |
| there is no match, add a (minoff,maxoff) :-> t binding. |
| |
| On seeing 'Put (minoff,maxoff) = t or c', first remove in the env |
| any binding which fully or partially overlaps with (minoff,maxoff). |
| Then add a new (minoff,maxoff) :-> t or c binding. */ |
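| /* For example (an illustrative sketch): |
| t1 = Get(0,3) -- adds (0,3) :-> t1 |
| t2 = Get(0,3) -- rewritten to t2 = t1 |
| Put(0,3) = t3 -- invalidates (0,3), then adds (0,3) :-> t3 |
| t4 = Get(0,3) -- rewritten to t4 = t3 |
| */ |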
| |
| /* Extract the min/max offsets from a guest state array descriptor. */ |
| |
| inline |
| static void getArrayBounds ( IRArray* descr, UInt* minoff, UInt* maxoff ) |
| { |
| *minoff = descr->base; |
| *maxoff = *minoff + descr->nElems*sizeofIRType(descr->elemTy) - 1; |
| vassert((*minoff & ~0xFFFF) == 0); |
| vassert((*maxoff & ~0xFFFF) == 0); |
| vassert(*minoff <= *maxoff); |
| } |
| |
| /* Create keys, of the form ((minoffset << 16) | maxoffset). */ |
| |
| static UInt mk_key_GetPut ( Int offset, IRType ty ) |
| { |
| /* offset should fit in 16 bits. */ |
| UInt minoff = offset; |
| UInt maxoff = minoff + sizeofIRType(ty) - 1; |
| vassert((minoff & ~0xFFFF) == 0); |
| vassert((maxoff & ~0xFFFF) == 0); |
| return (minoff << 16) | maxoff; |
| } |
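| /* For example (illustrative), a 32-bit access at offset 8 covers |
| bytes (8 .. 11), giving the key (8 << 16) | 11 = 0x8000B. */ |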
| |
| static UInt mk_key_GetIPutI ( IRArray* descr ) |
| { |
| UInt minoff, maxoff; |
| getArrayBounds( descr, &minoff, &maxoff ); |
| vassert((minoff & ~0xFFFF) == 0); |
| vassert((maxoff & ~0xFFFF) == 0); |
| return (minoff << 16) | maxoff; |
| } |
| |
| /* Supposing h has keys of the form generated by mk_key_GetPut and |
| mk_key_GetIPutI, invalidate any key which overlaps (k_lo |
| .. k_hi). |
| */ |
| static void invalidateOverlaps ( HashHW* h, UInt k_lo, UInt k_hi ) |
| { |
| Int j; |
| UInt e_lo, e_hi; |
| vassert(k_lo <= k_hi); |
| /* invalidate any env entries which in any way overlap (k_lo |
| .. k_hi) */ |
| /* vex_printf("invalidate %d .. %d\n", k_lo, k_hi ); */ |
| |
| for (j = 0; j < h->used; j++) { |
| if (!h->inuse[j]) |
| continue; |
| e_lo = (((UInt)h->key[j]) >> 16) & 0xFFFF; |
| e_hi = ((UInt)h->key[j]) & 0xFFFF; |
| vassert(e_lo <= e_hi); |
| if (e_hi < k_lo || k_hi < e_lo) |
| continue; /* no overlap possible */ |
| else |
| /* overlap; invalidate */ |
| h->inuse[j] = False; |
| } |
| } |
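| /* For example (illustrative), an entry covering (0 .. 3) is |
| invalidated by the range (2 .. 5), since they overlap, but not by |
| (4 .. 7). */ |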
| |
| |
| static void redundant_get_removal_BB ( IRBB* bb ) |
| { |
| HashHW* env = newHHW(); |
| UInt key = 0; /* keep gcc -O happy */ |
| Int i, j; |
| HWord val; |
| |
| for (i = 0; i < bb->stmts_used; i++) { |
| IRStmt* st = bb->stmts[i]; |
| |
| if (st->tag == Ist_NoOp) |
| continue; |
| |
| /* Deal with Gets */ |
| if (st->tag == Ist_Tmp |
| && st->Ist.Tmp.data->tag == Iex_Get) { |
| /* st is 't = Get(...)'. Look up in the environment and see |
| if the Get can be replaced. */ |
| IRExpr* get = st->Ist.Tmp.data; |
| key = (HWord)mk_key_GetPut( get->Iex.Get.offset, |
| get->Iex.Get.ty ); |
| if (lookupHHW(env, &val, (HWord)key)) { |
| /* found it */ |
| /* Note, we could do better here. If the types are |
| different we don't do the substitution, since doing so |
| could lead to invalidly-typed IR. An improvement would |
| be to stick in a reinterpret-style cast, although that |
| would make maintaining flatness more difficult. */ |
| IRExpr* valE = (IRExpr*)val; |
| Bool typesOK = toBool( typeOfIRExpr(bb->tyenv,valE) |
| == st->Ist.Tmp.data->Iex.Get.ty ); |
| if (typesOK && DEBUG_IROPT) { |
| vex_printf("rGET: "); ppIRExpr(get); |
| vex_printf(" -> "); ppIRExpr(valE); |
| vex_printf("\n"); |
| } |
| if (typesOK) |
| bb->stmts[i] = IRStmt_Tmp(st->Ist.Tmp.tmp, valE); |
| } else { |
| /* Not found, but at least we know that t and the Get(...) |
| are now associated. So add a binding to reflect that |
| fact. */ |
| addToHHW( env, (HWord)key, |
| (HWord)(void*)(IRExpr_Tmp(st->Ist.Tmp.tmp)) ); |
| } |
| } |
| |
| /* Deal with Puts: invalidate any env entries overlapped by this |
| Put */ |
| if (st->tag == Ist_Put || st->tag == Ist_PutI) { |
| UInt k_lo, k_hi; |
| if (st->tag == Ist_Put) { |
| key = mk_key_GetPut( st->Ist.Put.offset, |
| typeOfIRExpr(bb->tyenv,st->Ist.Put.data) ); |
| } else { |
| vassert(st->tag == Ist_PutI); |
| key = mk_key_GetIPutI( st->Ist.PutI.descr ); |
| } |
| |
| k_lo = (key >> 16) & 0xFFFF; |
| k_hi = key & 0xFFFF; |
| invalidateOverlaps(env, k_lo, k_hi); |
| } |
| else |
| if (st->tag == Ist_Dirty) { |
| /* Deal with dirty helpers which write or modify guest state. |
| Invalidate the entire env. We could do a lot better |
| here. */ |
| IRDirty* d = st->Ist.Dirty.details; |
| Bool writes = False; |
| for (j = 0; j < d->nFxState; j++) { |
| if (d->fxState[j].fx == Ifx_Modify |
| || d->fxState[j].fx == Ifx_Write) |
| writes = True; |
| } |
| if (writes) { |
| /* dump the entire env (not clever, but correct ...) */ |
| for (j = 0; j < env->used; j++) |
| env->inuse[j] = False; |
| if (0) vex_printf("rGET: trash env due to dirty helper\n"); |
| } |
| } |
| |
| /* add this one to the env, if appropriate */ |
| if (st->tag == Ist_Put) { |
| vassert(isIRAtom(st->Ist.Put.data)); |
| addToHHW( env, (HWord)key, (HWord)(st->Ist.Put.data)); |
| } |
| |
| } /* for (i = 0; i < bb->stmts_used; i++) */ |
| |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- In-place removal of redundant PUTs ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* Find any Get uses in st and invalidate any partially or fully |
| overlapping ranges listed in env. Due to the flattening phase, the |
| only stmt kind we expect to find a Get on is IRStmt_Tmp. */ |
| |
| static void handle_gets_Stmt ( |
| HashHW* env, |
| IRStmt* st, |
| Bool (*preciseMemExnsFn)(Int,Int) |
| ) |
| { |
| Int j; |
| UInt key = 0; /* keep gcc -O happy */ |
| Bool isGet; |
| Bool memRW = False; |
| IRExpr* e; |
| |
| switch (st->tag) { |
| |
| /* This is the only interesting case. Deal with Gets in the RHS |
| expression. */ |
| case Ist_Tmp: |
| e = st->Ist.Tmp.data; |
| switch (e->tag) { |
| case Iex_Get: |
| isGet = True; |
| key = mk_key_GetPut ( e->Iex.Get.offset, e->Iex.Get.ty ); |
| break; |
| case Iex_GetI: |
| isGet = True; |
| key = mk_key_GetIPutI ( e->Iex.GetI.descr ); |
| break; |
| case Iex_Load: |
| isGet = False; |
| memRW = True; |
| break; |
| default: |
| isGet = False; |
| } |
| if (isGet) { |
| UInt k_lo, k_hi; |
| k_lo = (key >> 16) & 0xFFFF; |
| k_hi = key & 0xFFFF; |
| invalidateOverlaps(env, k_lo, k_hi); |
| } |
| break; |
| |
| /* Be very conservative for dirty helper calls; dump the entire |
| environment. The helper might read guest state, in which |
| case it needs to be flushed first. Also, the helper might |
| access guest memory, in which case all parts of the guest |
| state requiring precise exceptions need to be flushed. The |
| crude solution is just to flush everything; we could easily |
| enough do a lot better if needed. */ |
| /* Probably also overly conservative, but also dump everything |
| if we hit a memory fence. Ditto AbiHints. */ |
| case Ist_AbiHint: |
| vassert(isIRAtom(st->Ist.AbiHint.base)); |
| /* fall through */ |
| case Ist_MFence: |
| case Ist_Dirty: |
| for (j = 0; j < env->used; j++) |
| env->inuse[j] = False; |
| break; |
| |
| /* all other cases are boring. */ |
| case Ist_Store: |
| vassert(isIRAtom(st->Ist.Store.addr)); |
| vassert(isIRAtom(st->Ist.Store.data)); |
| memRW = True; |
| break; |
| |
| case Ist_Exit: |
| vassert(isIRAtom(st->Ist.Exit.guard)); |
| break; |
| |
| case Ist_PutI: |
| vassert(isIRAtom(st->Ist.PutI.ix)); |
| vassert(isIRAtom(st->Ist.PutI.data)); |
| break; |
| |
| case Ist_NoOp: |
| case Ist_IMark: |
| break; |
| |
| default: |
| vex_printf("\n"); |
| ppIRStmt(st); |
| vex_printf("\n"); |
| vpanic("handle_gets_Stmt"); |
| } |
| |
| if (memRW) { |
| /* This statement accesses memory. So we need to dump all parts |
| of the environment corresponding to guest state that may not |
| be reordered with respect to memory references. That means |
| at least the stack pointer. */ |
| for (j = 0; j < env->used; j++) { |
| if (!env->inuse[j]) |
| continue; |
| if (vex_control.iropt_precise_memory_exns) { |
| /* Precise exceptions required. Flush all guest state. */ |
| env->inuse[j] = False; |
| } else { |
| /* Just flush the minimal amount required, as computed by |
| preciseMemExnsFn. */ |
| HWord k_lo = (env->key[j] >> 16) & 0xFFFF; |
| HWord k_hi = env->key[j] & 0xFFFF; |
| if (preciseMemExnsFn( k_lo, k_hi )) |
| env->inuse[j] = False; |
| } |
| } |
| } /* if (memRW) */ |
| |
| } |
| |
| |
| /* Scan backwards, building up a set of (min offset, max |
| offset) pairs, indicating those parts of the guest state |
| for which the next event is a write. |
| |
| On seeing a conditional exit, empty the set. |
| |
| On seeing 'Put (minoff,maxoff) = t or c', if (minoff,maxoff) is |
| completely within the set, remove the Put. Otherwise, add |
| (minoff,maxoff) to the set. |
| |
| On seeing 'Get (minoff,maxoff)', remove any part of the set |
| overlapping (minoff,maxoff). The same has to happen for any events |
| which implicitly read parts of the guest state: dirty helper calls |
| and loads/stores. |
| */ |
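| /* For example (an illustrative sketch), given the statement sequence |
| Put(0,3) = t3 |
| t2 = Get(4,7) |
| Put(0,3) = t1 |
| the backwards scan sees 'Put(0,3) = t1' first and adds (0,3) to the |
| set; the Get at (4,7) does not overlap it; the earlier 'Put(0,3) = t3' |
| then finds (0,3) already in the set and is turned into a no-op. */ |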
| |
| static void redundant_put_removal_BB ( |
| IRBB* bb, |
| Bool (*preciseMemExnsFn)(Int,Int) |
| ) |
| { |
| Int i, j; |
| Bool isPut; |
| IRStmt* st; |
| UInt key = 0; /* keep gcc -O happy */ |
| |
| HashHW* env = newHHW(); |
| for (i = bb->stmts_used-1; i >= 0; i--) { |
| st = bb->stmts[i]; |
| |
| if (st->tag == Ist_NoOp) |
| continue; |
| |
| /* Deal with conditional exits. */ |
| if (st->tag == Ist_Exit) { |
| /* Since control may not get beyond this point, we must empty |
| out the set, since we can no longer claim that the next |
| event for any part of the guest state is definitely a |
| write. */ |
| vassert(isIRAtom(st->Ist.Exit.guard)); |
| for (j = 0; j < env->used; j++) |
| env->inuse[j] = False; |
| continue; |
| } |
| |
| /* Deal with Puts */ |
| switch (st->tag) { |
| case Ist_Put: |
| isPut = True; |
| key = mk_key_GetPut( st->Ist.Put.offset, |
| typeOfIRExpr(bb->tyenv,st->Ist.Put.data) ); |
| vassert(isIRAtom(st->Ist.Put.data)); |
| break; |
| case Ist_PutI: |
| isPut = True; |
| key = mk_key_GetIPutI( st->Ist.PutI.descr ); |
| vassert(isIRAtom(st->Ist.PutI.ix)); |
| vassert(isIRAtom(st->Ist.PutI.data)); |
| break; |
| default: |
| isPut = False; |
| } |
| if (isPut && st->tag != Ist_PutI) { |
| /* See if any single entry in env overlaps this Put. This is |
| simplistic in that the transformation is valid if, say, two |
| or more entries in the env overlap this Put, but the use of |
| lookupHHW will only find a single entry which exactly |
| overlaps this Put. This is suboptimal but safe. */ |
| if (lookupHHW(env, NULL, (HWord)key)) { |
| /* This Put is redundant because a later one will overwrite |
| it. So NULL (nop) it out. */ |
| if (DEBUG_IROPT) { |
| vex_printf("rPUT: "); ppIRStmt(st); |
| vex_printf("\n"); |
| } |
| bb->stmts[i] = IRStmt_NoOp(); |
| } else { |
| /* We can't demonstrate that this Put is redundant, so add it |
| to the running collection. */ |
| addToHHW(env, (HWord)key, 0); |
| } |
| continue; |
| } |
| |
| /* Deal with Gets. These remove bits of the environment since |
| appearance of a Get means that the next event for that slice |
| of the guest state is no longer a write, but a read. Also |
| deals with implicit reads of guest state needed to maintain |
| precise exceptions. */ |
| handle_gets_Stmt( env, st, preciseMemExnsFn ); |
| } |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- Constant propagation and folding ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* The env in this section is a map from IRTemp to IRExpr*, |
| that is, an array indexed by IRTemp. */ |
| |
| /* Are both expressions simply the same IRTemp ? */ |
| static Bool sameIRTemps ( IRExpr* e1, IRExpr* e2 ) |
| { |
| return toBool( e1->tag == Iex_Tmp |
| && e2->tag == Iex_Tmp |
| && e1->Iex.Tmp.tmp == e2->Iex.Tmp.tmp ); |
| } |
| |
| static Bool notBool ( Bool b ) |
| { |
| if (b == True) return False; |
| if (b == False) return True; |
| vpanic("notBool"); |
| } |
| |
| /* Make a zero which has the same type as the result of the given |
| primop. */ |
| static IRExpr* mkZeroForXor ( IROp op ) |
| { |
| switch (op) { |
| case Iop_Xor8: return IRExpr_Const(IRConst_U8(0)); |
| case Iop_Xor16: return IRExpr_Const(IRConst_U16(0)); |
| case Iop_Xor32: return IRExpr_Const(IRConst_U32(0)); |
| case Iop_Xor64: return IRExpr_Const(IRConst_U64(0)); |
| default: vpanic("mkZeroForXor: bad primop"); |
| } |
| } |
| |
| |
| static IRExpr* fold_Expr ( IRExpr* e ) |
| { |
| Int shift; |
| IRExpr* e2 = e; /* e2 is the result of folding e, if possible */ |
| |
| /* UNARY ops */ |
| if (e->tag == Iex_Unop |
| && e->Iex.Unop.arg->tag == Iex_Const) { |
| switch (e->Iex.Unop.op) { |
| case Iop_1Uto8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| e->Iex.Unop.arg->Iex.Const.con->Ico.U1 |
| ? 1 : 0))); |
| break; |
| case Iop_1Uto32: |
| e2 = IRExpr_Const(IRConst_U32( |
| e->Iex.Unop.arg->Iex.Const.con->Ico.U1 |
| ? 1 : 0)); |
| break; |
| case Iop_1Uto64: |
| e2 = IRExpr_Const(IRConst_U64( |
| e->Iex.Unop.arg->Iex.Const.con->Ico.U1 |
| ? 1 : 0)); |
| break; |
| |
| case Iop_1Sto8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| e->Iex.Unop.arg->Iex.Const.con->Ico.U1 |
| ? 0xFF : 0))); |
| break; |
| case Iop_1Sto16: |
| e2 = IRExpr_Const(IRConst_U16(toUShort( |
| e->Iex.Unop.arg->Iex.Const.con->Ico.U1 |
| ? 0xFFFF : 0))); |
| break; |
| case Iop_1Sto32: |
| e2 = IRExpr_Const(IRConst_U32( |
| e->Iex.Unop.arg->Iex.Const.con->Ico.U1 |
| ? 0xFFFFFFFF : 0)); |
| break; |
| case Iop_1Sto64: |
| e2 = IRExpr_Const(IRConst_U64( |
| e->Iex.Unop.arg->Iex.Const.con->Ico.U1 |
| ? 0xFFFFFFFFFFFFFFFFULL : 0)); |
| break; |
| |
| case Iop_8Sto32: { |
| /* signed */ Int s32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U8; |
| s32 <<= 24; |
| s32 >>= 24; |
| e2 = IRExpr_Const(IRConst_U32((UInt)s32)); |
| break; |
| } |
| case Iop_8Uto64: |
| e2 = IRExpr_Const(IRConst_U64( |
| 0xFFULL & e->Iex.Unop.arg->Iex.Const.con->Ico.U8)); |
| break; |
| case Iop_16Uto64: |
| e2 = IRExpr_Const(IRConst_U64( |
| 0xFFFFULL & e->Iex.Unop.arg->Iex.Const.con->Ico.U16)); |
| break; |
| case Iop_8Uto32: |
| e2 = IRExpr_Const(IRConst_U32( |
| 0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8)); |
| break; |
| case Iop_16Uto32: |
| e2 = IRExpr_Const(IRConst_U32( |
| 0xFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U16)); |
| break; |
| case Iop_32to16: |
| e2 = IRExpr_Const(IRConst_U16(toUShort( |
| 0xFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_32to8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| 0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_32to1: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| 1 == (1 & e->Iex.Unop.arg->Iex.Const.con->Ico.U32) |
| ))); |
| break; |
| case Iop_64to1: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| 1 == (1 & e->Iex.Unop.arg->Iex.Const.con->Ico.U64) |
| ))); |
| break; |
| |
| case Iop_Not64: |
| e2 = IRExpr_Const(IRConst_U64( |
| ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U64))); |
| break; |
| case Iop_Not32: |
| e2 = IRExpr_Const(IRConst_U32( |
| ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_Not16: |
| e2 = IRExpr_Const(IRConst_U16(toUShort( |
| ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U16)))); |
| break; |
| case Iop_Not8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U8)))); |
| break; |
| |
| case Iop_Not1: |
| e2 = IRExpr_Const(IRConst_U1( |
| notBool(e->Iex.Unop.arg->Iex.Const.con->Ico.U1))); |
| break; |
| |
| case Iop_Neg64: |
| e2 = IRExpr_Const(IRConst_U64( |
| - (e->Iex.Unop.arg->Iex.Const.con->Ico.U64))); |
| break; |
| case Iop_Neg32: |
| e2 = IRExpr_Const(IRConst_U32( |
| - (e->Iex.Unop.arg->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_Neg8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| - (e->Iex.Unop.arg->Iex.Const.con->Ico.U8)))); |
| break; |
| |
| case Iop_64to8: { |
| ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64; |
| w64 &= 0xFFULL; |
| e2 = IRExpr_Const(IRConst_U8( (UChar)w64 )); |
| break; |
| } |
| case Iop_64to16: { |
| ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64; |
| w64 &= 0xFFFFULL; |
| e2 = IRExpr_Const(IRConst_U16( (UShort)w64 )); |
| break; |
| } |
| case Iop_64to32: { |
| ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64; |
| w64 &= 0x00000000FFFFFFFFULL; |
| e2 = IRExpr_Const(IRConst_U32( (UInt)w64 )); |
| break; |
| } |
| case Iop_64HIto32: { |
| ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64; |
| w64 >>= 32; |
| e2 = IRExpr_Const(IRConst_U32( (UInt)w64 )); |
| break; |
| } |
| case Iop_32Uto64: |
| e2 = IRExpr_Const(IRConst_U64( |
| 0xFFFFFFFFULL |
| & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)); |
| break; |
| |
| case Iop_CmpNEZ8: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| 0 != |
| (0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8) |
| ))); |
| break; |
| case Iop_CmpNEZ32: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| 0 != |
| (0xFFFFFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32) |
| ))); |
| break; |
| case Iop_CmpNEZ64: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| 0ULL != e->Iex.Unop.arg->Iex.Const.con->Ico.U64 |
| ))); |
| break; |
| |
| default: |
| goto unhandled; |
| } |
| } |
| |
| /* BINARY ops */ |
| if (e->tag == Iex_Binop) { |
| if (e->Iex.Binop.arg1->tag == Iex_Const |
| && e->Iex.Binop.arg2->tag == Iex_Const) { |
| /* cases where both args are consts */ |
| switch (e->Iex.Binop.op) { |
| |
| /* -- Or -- */ |
| case Iop_Or8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8 |
| | e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))); |
| break; |
| case Iop_Or16: |
| e2 = IRExpr_Const(IRConst_U16(toUShort( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16 |
| | e->Iex.Binop.arg2->Iex.Const.con->Ico.U16)))); |
| break; |
| case Iop_Or32: |
| e2 = IRExpr_Const(IRConst_U32( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| | e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_Or64: |
| e2 = IRExpr_Const(IRConst_U64( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| | e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))); |
| break; |
| |
| /* -- Xor -- */ |
| case Iop_Xor8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8 |
| ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))); |
| break; |
| case Iop_Xor16: |
| e2 = IRExpr_Const(IRConst_U16(toUShort( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16 |
| ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U16)))); |
| break; |
| case Iop_Xor32: |
| e2 = IRExpr_Const(IRConst_U32( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_Xor64: |
| e2 = IRExpr_Const(IRConst_U64( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))); |
| break; |
| |
| /* -- And -- */ |
| case Iop_And8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8 |
| & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))); |
| break; |
| case Iop_And32: |
| e2 = IRExpr_Const(IRConst_U32( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| & e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_And64: |
| e2 = IRExpr_Const(IRConst_U64( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| & e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))); |
| break; |
| |
| /* -- Add -- */ |
| case Iop_Add8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8 |
| + e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))); |
| break; |
| case Iop_Add32: |
| e2 = IRExpr_Const(IRConst_U32( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| + e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_Add64: |
| e2 = IRExpr_Const(IRConst_U64( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| + e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))); |
| break; |
| |
| /* -- Sub -- */ |
| case Iop_Sub8: |
| e2 = IRExpr_Const(IRConst_U8(toUChar( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8 |
| - e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))); |
| break; |
| case Iop_Sub32: |
| e2 = IRExpr_Const(IRConst_U32( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| - e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_Sub64: |
| e2 = IRExpr_Const(IRConst_U64( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| - e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))); |
| break; |
| |
| /* -- Mul -- */ |
| case Iop_Mul32: |
| e2 = IRExpr_Const(IRConst_U32( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| * e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))); |
| break; |
| case Iop_Mul64: |
| e2 = IRExpr_Const(IRConst_U64( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| * e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))); |
| break; |
| |
| case Iop_MullS32: { |
| /* very paranoid */ |
| UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32; |
| UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32; |
| Int s32a = (Int)u32a; |
| Int s32b = (Int)u32b; |
| Long s64a = (Long)s32a; |
| Long s64b = (Long)s32b; |
| Long sres = s64a * s64b; |
| ULong ures = (ULong)sres; |
| e2 = IRExpr_Const(IRConst_U64(ures)); |
| break; |
| } |
| |
| /* -- Shl -- */ |
| case Iop_Shl32: |
| vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8); |
| shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8); |
| if (shift >= 0 && shift <= 31) |
| e2 = IRExpr_Const(IRConst_U32( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| << shift))); |
| break; |
| case Iop_Shl64: |
| vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8); |
| shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8); |
| if (shift >= 0 && shift <= 63) |
| e2 = IRExpr_Const(IRConst_U64( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| << shift))); |
| break; |
| |
| /* -- Sar -- */ |
| case Iop_Sar32: { |
| /* paranoid ... */ |
| /*signed*/ Int s32; |
| vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8); |
| s32 = (Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32); |
| shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8); |
| if (shift >= 0 && shift <= 31) { |
| s32 >>=/*signed*/ shift; |
| e2 = IRExpr_Const(IRConst_U32((UInt)s32)); |
| } |
| break; |
| } |
| case Iop_Sar64: { |
| /* paranoid ... */ |
| /*signed*/ Long s64; |
| vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8); |
| s64 = (Long)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64); |
| shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8); |
| if (shift >= 0 && shift <= 63) { |
| s64 >>=/*signed*/ shift; |
| e2 = IRExpr_Const(IRConst_U64((ULong)s64)); |
| } |
| break; |
| } |
| |
| /* -- Shr -- */ |
| case Iop_Shr32: { |
| /* paranoid ... */ |
| /*unsigned*/ UInt u32; |
| vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8); |
| u32 = (UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32); |
| shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8); |
| if (shift >= 0 && shift <= 31) { |
| u32 >>=/*unsigned*/ shift; |
| e2 = IRExpr_Const(IRConst_U32(u32)); |
| } |
| break; |
| } |
| case Iop_Shr64: { |
| /* paranoid ... */ |
| /*unsigned*/ ULong u64; |
| vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8); |
| u64 = (ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64); |
| shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8); |
| if (shift >= 0 && shift <= 63) { |
| u64 >>=/*unsigned*/ shift; |
| e2 = IRExpr_Const(IRConst_U64(u64)); |
| } |
| break; |
| } |
| |
| /* -- CmpEQ -- */ |
| case Iop_CmpEQ32: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| == e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))); |
| break; |
| case Iop_CmpEQ64: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| == e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))); |
| break; |
| |
| /* -- CmpNE -- */ |
| case Iop_CmpNE8: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| ((0xFF & e->Iex.Binop.arg1->Iex.Const.con->Ico.U8) |
| != (0xFF & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))))); |
| break; |
| case Iop_CmpNE32: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 |
| != e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))); |
| break; |
| case Iop_CmpNE64: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 |
| != e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))); |
| break; |
| |
| /* -- CmpLEU -- */ |
| case Iop_CmpLE32U: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| ((UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32) |
| <= (UInt)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))))); |
| break; |
| |
| /* -- CmpLES -- */ |
| case Iop_CmpLE32S: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| ((Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32) |
| <= (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))))); |
| break; |
| |
| /* -- CmpLTS -- */ |
| case Iop_CmpLT32S: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| ((Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32) |
| < (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))))); |
| break; |
| |
| /* -- CmpLTU -- */ |
| case Iop_CmpLT32U: |
| e2 = IRExpr_Const(IRConst_U1(toBool( |
| ((UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32) |
| < (UInt)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))))); |
| break; |
| |
| /* -- CmpORD -- */ |
| case Iop_CmpORD32S: { |
| /* very paranoid */ |
| UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32; |
| UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32; |
| Int s32a = (Int)u32a; |
| Int s32b = (Int)u32b; |
| Int r = 0x2; /* EQ */ |
| if (s32a < s32b) { |
| r = 0x8; /* LT */ |
| } |
| else if (s32a > s32b) { |
| r = 0x4; /* GT */ |
| } |
| e2 = IRExpr_Const(IRConst_U32(r)); |
| break; |
| } |
| |
| /* -- nHLto2n -- */ |
| case Iop_32HLto64: |
| e2 = IRExpr_Const(IRConst_U64( |
| (((ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)) << 32) |
| | ((ULong)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)) |
| )); |
| break; |
| case Iop_64HLto128: |
| /* We can't fold this, because there is no way to |
| express the result in IR, but at least pretend to |
| handle it, so as to stop getting blasted with |
| no-rule-for-this-primop messages. */ |
| break; |
| |
| default: |
| goto unhandled; |
| } |
| |
| } else { |
| |
| /* other cases (identities, etc) */ |
| |
| /* Shl64/Shr64(x,0) ==> x */ |
| if ((e->Iex.Binop.op == Iop_Shl64 || e->Iex.Binop.op == Iop_Shr64) |
| && e->Iex.Binop.arg2->tag == Iex_Const |
| && e->Iex.Binop.arg2->Iex.Const.con->Ico.U8 == 0) { |
| e2 = e->Iex.Binop.arg1; |
| } else |
| |
| /* Shl32/Shr32(x,0) ==> x */ |
| if ((e->Iex.Binop.op == Iop_Shl32 || e->Iex.Binop.op == Iop_Shr32) |
| && e->Iex.Binop.arg2->tag == Iex_Const |
| && e->Iex.Binop.arg2->Iex.Const.con->Ico.U8 == 0) { |
| e2 = e->Iex.Binop.arg1; |
| } else |
| |
| /* Or8(x,0) ==> x */ |
| if ((e->Iex.Binop.op == Iop_Or8) |
| && e->Iex.Binop.arg2->tag == Iex_Const |
| && e->Iex.Binop.arg2->Iex.Const.con->Ico.U8 == 0) { |
| e2 = e->Iex.Binop.arg1; |
| } else |
| |
| /* Or16(x,0) ==> x */ |
| if ((e->Iex.Binop.op == Iop_Or16) |
| && e->Iex.Binop.arg2->tag == Iex_Const |
| && e->Iex.Binop.arg2->Iex.Const.con->Ico.U16 == 0) { |
| e2 = e->Iex.Binop.arg1; |
| } else |
| |
| /* Or32/Add32(x,0) ==> x */ |
| if ((e->Iex.Binop.op == Iop_Add32 || e->Iex.Binop.op == Iop_Or32) |
| && e->Iex.Binop.arg2->tag == Iex_Const |
| && e->Iex.Binop.arg2->Iex.Const.con->Ico.U32 == 0) { |
| e2 = e->Iex.Binop.arg1; |
| } else |
| |
| /* Or64/Add64(x,0) ==> x */ |
| if ((e->Iex.Binop.op == Iop_Add64 || e->Iex.Binop.op == Iop_Or64) |
| && e->Iex.Binop.arg2->tag == Iex_Const |
| && e->Iex.Binop.arg2->Iex.Const.con->Ico.U64 == 0) { |
| e2 = e->Iex.Binop.arg1; |
| } else |
| |
| /* And32(x,0xFFFFFFFF) ==> x */ |
| if (e->Iex.Binop.op == Iop_And32 |
| && e->Iex.Binop.arg2->tag == Iex_Const |
| && e->Iex.Binop.arg2->Iex.Const.con->Ico.U32 == 0xFFFFFFFF) { |
| e2 = e->Iex.Binop.arg1; |
| } else |
| |
| /* And32(x,0) ==> 0 */ |
| if (e->Iex.Binop.op == Iop_And32 |
| && e->Iex.Binop.arg2->tag == Iex_Const |
| && e->Iex.Binop.arg2->Iex.Const.con->Ico.U32 == 0) { |
| e2 = IRExpr_Const(IRConst_U32(0)); |
| } else |
| |
| /* Or32(0,x) ==> x */ |
| if (e->Iex.Binop.op == Iop_Or32 |
| && e->Iex.Binop.arg1->tag == Iex_Const |
| && e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 == 0) { |
| e2 = e->Iex.Binop.arg2; |
| } else |
| |
| /* Or8/16/32/64(t,t) ==> t, for some IRTemp t */ |
| /* And8/16/32/64(t,t) ==> t, for some IRTemp t */ |
| if ( (e->Iex.Binop.op == Iop_And64 |
| || e->Iex.Binop.op == Iop_And32 |
| || e->Iex.Binop.op == Iop_And16 |
| || e->Iex.Binop.op == Iop_And8 |
| || e->Iex.Binop.op == Iop_Or64 |
| || e->Iex.Binop.op == Iop_Or32 |
| || e->Iex.Binop.op == Iop_Or16 |
| || e->Iex.Binop.op == Iop_Or8) |
| && sameIRTemps(e->Iex.Binop.arg1, e->Iex.Binop.arg2)) { |
| e2 = e->Iex.Binop.arg1; |
| } |
| |
| /* Xor8/16/32/64(t,t) ==> 0, for some IRTemp t */ |
| if ( (e->Iex.Binop.op == Iop_Xor64 |
| || e->Iex.Binop.op == Iop_Xor32 |
| || e->Iex.Binop.op == Iop_Xor16 |
| || e->Iex.Binop.op == Iop_Xor8) |
| && sameIRTemps(e->Iex.Binop.arg1, e->Iex.Binop.arg2)) { |
| e2 = mkZeroForXor(e->Iex.Binop.op); |
| } |
| |
| } |
| } |
| |
| /* Mux0X */ |
| if (e->tag == Iex_Mux0X |
| && e->Iex.Mux0X.cond->tag == Iex_Const) { |
| Bool zero; |
| /* assured to us by the IR type rules */ |
| vassert(e->Iex.Mux0X.cond->Iex.Const.con->tag == Ico_U8); |
| zero = toBool(0 == (0xFF & e->Iex.Mux0X.cond |
| ->Iex.Const.con->Ico.U8)); |
| e2 = zero ? e->Iex.Mux0X.expr0 : e->Iex.Mux0X.exprX; |
| } |
| |
| if (DEBUG_IROPT && e2 != e) { |
| vex_printf("FOLD: "); |
| ppIRExpr(e); vex_printf(" -> "); |
| ppIRExpr(e2); vex_printf("\n"); |
| } |
| |
| return e2; |
| |
| unhandled: |
| # if 0 |
| vex_printf("\n\n"); |
| ppIRExpr(e); |
| vpanic("fold_Expr: no rule for the above"); |
| # else |
| if (vex_control.iropt_verbosity > 0) { |
| vex_printf("vex iropt: fold_Expr: no rule for: "); |
| ppIRExpr(e); |
| vex_printf("\n"); |
| } |
| return e2; |
| # endif |
| } |
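| /* Some illustrative foldings (a sketch): Add32(0x3:I32,0x4:I32) |
| folds to 0x7:I32; And32(t5,0xFFFFFFFF:I32) folds to t5; and |
| Mux0X(0x0:I8,t1,t2) folds to t1. */ |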
| |
| |
| /* Apply the subst to a simple 1-level expression -- guaranteed to be |
| 1-level due to previous flattening pass. */ |
| |
| static IRExpr* subst_Expr ( IRExpr** env, IRExpr* ex ) |
| { |
| switch (ex->tag) { |
| case Iex_Tmp: |
| if (env[(Int)ex->Iex.Tmp.tmp] != NULL) { |
| return env[(Int)ex->Iex.Tmp.tmp]; |
| } else { |
| /* not bound in env */ |
| return ex; |
| } |
| |
| case Iex_Const: |
| case Iex_Get: |
| return ex; |
| |
| case Iex_GetI: |
| vassert(isIRAtom(ex->Iex.GetI.ix)); |
| return IRExpr_GetI( |
| ex->Iex.GetI.descr, |
| subst_Expr(env, ex->Iex.GetI.ix), |
| ex->Iex.GetI.bias |
| ); |
| |
| case Iex_Binop: |
| vassert(isIRAtom(ex->Iex.Binop.arg1)); |
| vassert(isIRAtom(ex->Iex.Binop.arg2)); |
| return IRExpr_Binop( |
| ex->Iex.Binop.op, |
| subst_Expr(env, ex->Iex.Binop.arg1), |
| subst_Expr(env, ex->Iex.Binop.arg2) |
| ); |
| |
| case Iex_Unop: |
| vassert(isIRAtom(ex->Iex.Unop.arg)); |
| return IRExpr_Unop( |
| ex->Iex.Unop.op, |
| subst_Expr(env, ex->Iex.Unop.arg) |
| ); |
| |
| case Iex_Load: |
| vassert(isIRAtom(ex->Iex.Load.addr)); |
| return IRExpr_Load( |
| ex->Iex.Load.end, |
| ex->Iex.Load.ty, |
| subst_Expr(env, ex->Iex.Load.addr) |
| ); |
| |
| case Iex_CCall: { |
| Int i; |
| IRExpr** args2 = sopyIRExprVec(ex->Iex.CCall.args); |
| for (i = 0; args2[i]; i++) { |
| vassert(isIRAtom(args2[i])); |
| args2[i] = subst_Expr(env, args2[i]); |
| } |
| return IRExpr_CCall( |
| ex->Iex.CCall.cee, |
| ex->Iex.CCall.retty, |
| args2 |
| ); |
| } |
| |
| case Iex_Mux0X: |
| vassert(isIRAtom(ex->Iex.Mux0X.cond)); |
| vassert(isIRAtom(ex->Iex.Mux0X.expr0)); |
| vassert(isIRAtom(ex->Iex.Mux0X.exprX)); |
| return IRExpr_Mux0X( |
| subst_Expr(env, ex->Iex.Mux0X.cond), |
| subst_Expr(env, ex->Iex.Mux0X.expr0), |
| subst_Expr(env, ex->Iex.Mux0X.exprX) |
| ); |
| |
| default: |
| vex_printf("\n\n"); ppIRExpr(ex); |
| vpanic("subst_Expr"); |
| |
| } |
| } |
| |
| |
| /* Apply the subst to stmt, then fold the result as much as possible. |
| Much simplified due to stmt being previously flattened. As a |
| result of this, the stmt may wind up being turned into a no-op. |
| */ |
| static IRStmt* subst_and_fold_Stmt ( IRExpr** env, IRStmt* st ) |
| { |
| # if 0 |
| vex_printf("\nsubst and fold stmt\n"); |
| ppIRStmt(st); |
| vex_printf("\n"); |
| # endif |
| |
| switch (st->tag) { |
| case Ist_AbiHint: |
| vassert(isIRAtom(st->Ist.AbiHint.base)); |
| return IRStmt_AbiHint( |
| fold_Expr(subst_Expr(env, st->Ist.AbiHint.base)), |
| st->Ist.AbiHint.len |
| ); |
| case Ist_Put: |
| vassert(isIRAtom(st->Ist.Put.data)); |
| return IRStmt_Put( |
| st->Ist.Put.offset, |
| fold_Expr(subst_Expr(env, st->Ist.Put.data)) |
| ); |
| |
| case Ist_PutI: |
| vassert(isIRAtom(st->Ist.PutI.ix)); |
| vassert(isIRAtom(st->Ist.PutI.data)); |
| return IRStmt_PutI( |
| st->Ist.PutI.descr, |
| fold_Expr(subst_Expr(env, st->Ist.PutI.ix)), |
| st->Ist.PutI.bias, |
| fold_Expr(subst_Expr(env, st->Ist.PutI.data)) |
| ); |
| |
| case Ist_Tmp: |
| /* This is the one place where an expr (st->Ist.Tmp.data) is |
| allowed to be more than just a constant or a tmp. */ |
| return IRStmt_Tmp( |
| st->Ist.Tmp.tmp, |
| fold_Expr(subst_Expr(env, st->Ist.Tmp.data)) |
| ); |
| |
| case Ist_Store: |
| vassert(isIRAtom(st->Ist.Store.addr)); |
| vassert(isIRAtom(st->Ist.Store.data)); |
| return IRStmt_Store( |
| st->Ist.Store.end, |
| fold_Expr(subst_Expr(env, st->Ist.Store.addr)), |
| fold_Expr(subst_Expr(env, st->Ist.Store.data)) |
| ); |
| |
| case Ist_Dirty: { |
| Int i; |
| IRDirty *d, *d2; |
| d = st->Ist.Dirty.details; |
| d2 = emptyIRDirty(); |
| *d2 = *d; |
| d2->args = sopyIRExprVec(d2->args); |
| if (d2->mFx != Ifx_None) { |
| vassert(isIRAtom(d2->mAddr)); |
| d2->mAddr = fold_Expr(subst_Expr(env, d2->mAddr)); |
| } |
| vassert(isIRAtom(d2->guard)); |
| d2->guard = fold_Expr(subst_Expr(env, d2->guard)); |
| for (i = 0; d2->args[i]; i++) { |
| vassert(isIRAtom(d2->args[i])); |
| d2->args[i] = fold_Expr(subst_Expr(env, d2->args[i])); |
| } |
| return IRStmt_Dirty(d2); |
| } |
| |
| case Ist_IMark: |
| return IRStmt_IMark(st->Ist.IMark.addr, st->Ist.IMark.len); |
| |
| case Ist_NoOp: |
| return IRStmt_NoOp(); |
| |
| case Ist_MFence: |
| return IRStmt_MFence(); |
| |
| case Ist_Exit: { |
| IRExpr* fcond; |
| vassert(isIRAtom(st->Ist.Exit.guard)); |
| fcond = fold_Expr(subst_Expr(env, st->Ist.Exit.guard)); |
| if (fcond->tag == Iex_Const) { |
| /* Interesting. The condition on this exit has folded down to |
| a constant. */ |
| vassert(fcond->Iex.Const.con->tag == Ico_U1); |
| vassert(fcond->Iex.Const.con->Ico.U1 == False |
| || fcond->Iex.Const.con->Ico.U1 == True); |
| if (fcond->Iex.Const.con->Ico.U1 == False) { |
| /* exit is never going to happen, so dump the statement. */ |
| return IRStmt_NoOp(); |
| } else { |
| vassert(fcond->Iex.Const.con->Ico.U1 == True); |
| /* Hmmm. The exit has become unconditional. Leave it |
| as it is for now, since we'd have to truncate the BB |
| at this point, which is tricky. Such truncation is |
| done later by the dead-code elimination pass. */ |
| /* fall out into the reconstruct-the-exit code. */ |
| if (vex_control.iropt_verbosity > 0) |
| /* really a misuse of vex_control.iropt_verbosity */ |
| vex_printf("vex iropt: IRStmt_Exit became unconditional\n"); |
| } |
| } |
| return IRStmt_Exit(fcond, st->Ist.Exit.jk, st->Ist.Exit.dst); |
| } |
| |
| default: |
| vex_printf("\n"); ppIRStmt(st); |
| vpanic("subst_and_fold_Stmt"); |
| } |
| } |
| |
| |
| IRBB* cprop_BB ( IRBB* in ) |
| { |
| Int i; |
| IRBB* out; |
| IRStmt* st2; |
| Int n_tmps = in->tyenv->types_used; |
| IRExpr** env = LibVEX_Alloc(n_tmps * sizeof(IRExpr*)); |
| |
| out = emptyIRBB(); |
| out->tyenv = dopyIRTypeEnv( in->tyenv ); |
| |
| /* Set up the env which travels forward. This holds a |
| substitution, mapping IRTemps to atoms, that is, IRExprs which |
| are either IRTemps or IRConsts. Thus both copy and constant |
| propagation are done. The environment is applied as we |
| move along. Keys are IRTemps. Values are IRExpr*s. |
| */ |
| for (i = 0; i < n_tmps; i++) |
| env[i] = NULL; |
| |
| /* For each original SSA-form stmt ... */ |
| for (i = 0; i < in->stmts_used; i++) { |
| |
| /* First apply the substitution to the current stmt. This |
| propagates in any constants and tmp-tmp assignments |
| accumulated prior to this point. As part of the |
| subst_and_fold_Stmt call, any resulting constant expressions |
| are also folded. */ |
| |
| st2 = in->stmts[i]; |
| |
| /* perhaps st2 is already a no-op? */ |
| if (st2->tag == Ist_NoOp) continue; |
| |
| st2 = subst_and_fold_Stmt( env, st2 ); |
| |
| /* If the statement has been folded into a no-op, forget it. */ |
| if (st2->tag == Ist_NoOp) continue; |
| |
| /* Now consider what the stmt looks like. If it's of the form |
| 't = const' or 't1 = t2', add it to the running environment |
| and not to the output BB. Otherwise, add it to the output |
| BB. Note, we choose not to propagate const when const is an |
| F64i, so that F64i literals can be CSE'd later. This helps |
| x86 floating point code generation. */ |
| |
| if (st2->tag == Ist_Tmp |
| && st2->Ist.Tmp.data->tag == Iex_Const |
| && st2->Ist.Tmp.data->Iex.Const.con->tag != Ico_F64i) { |
| /* 't = const' -- add to env. |
| The pair (IRTemp, IRExpr*) is added. */ |
| env[(Int)(st2->Ist.Tmp.tmp)] = st2->Ist.Tmp.data; |
| } |
| else |
| if (st2->tag == Ist_Tmp && st2->Ist.Tmp.data->tag == Iex_Tmp) { |
| /* 't1 = t2' -- add to env. |
| The pair (IRTemp, IRExpr*) is added. */ |
| env[(Int)(st2->Ist.Tmp.tmp)] = st2->Ist.Tmp.data; |
| } |
| else { |
| /* Not interesting, copy st2 into the output block. */ |
| addStmtToIRBB( out, st2 ); |
| } |
| } |
| |
| out->next = subst_Expr( env, in->next ); |
| out->jumpkind = in->jumpkind; |
| return out; |
| } |
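| /* An illustrative sketch of the overall effect: given |
| t1 = 0x4:I32 |
| t2 = t1 |
| t3 = Add32(t2,t1) |
| Put(0) = t3 |
| the first two bindings go into env and are not emitted, the third |
| substitutes and folds to 't3 = 0x8:I32' (which also just goes into |
| env), and only 'Put(0) = 0x8:I32' reaches the output block. */ |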
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- Dead code (t = E) removal ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* As a side effect, also removes all code following an unconditional |
| side exit. */ |
| |
| /* The set of temps used later is carried as a Bool array indexed |
| by IRTemp -- really just representing a set of IRTemps. |
| */ |
| |
| inline |
| static void addUses_Temp ( Bool* set, IRTemp tmp ) |
| { |
| set[(Int)tmp] = True; |
| } |
| |
| static void addUses_Expr ( Bool* set, IRExpr* e ) |
| { |
| Int i; |
| switch (e->tag) { |
| case Iex_GetI: |
| addUses_Expr(set, e->Iex.GetI.ix); |
| return; |
| case Iex_Mux0X: |
| addUses_Expr(set, e->Iex.Mux0X.cond); |
| addUses_Expr(set, e->Iex.Mux0X.expr0); |
| addUses_Expr(set, e->Iex.Mux0X.exprX); |
| return; |
| case Iex_CCall: |
| for (i = 0; e->Iex.CCall.args[i]; i++) |
| addUses_Expr(set, e->Iex.CCall.args[i]); |
| return; |
| case Iex_Load: |
| addUses_Expr(set, e->Iex.Load.addr); |
| return; |
| case Iex_Binop: |
| addUses_Expr(set, e->Iex.Binop.arg1); |
| addUses_Expr(set, e->Iex.Binop.arg2); |
| return; |
| case Iex_Unop: |
| addUses_Expr(set, e->Iex.Unop.arg); |
| return; |
| case Iex_Tmp: |
| addUses_Temp(set, e->Iex.Tmp.tmp); |
| return; |
| case Iex_Const: |
| case Iex_Get: |
| return; |
| default: |
| vex_printf("\n"); |
| ppIRExpr(e); |
| vpanic("addUses_Expr"); |
| } |
| } |
| |
| static void addUses_Stmt ( Bool* set, IRStmt* st ) |
| { |
| Int i; |
| IRDirty* d; |
| switch (st->tag) { |
| case Ist_AbiHint: |
| addUses_Expr(set, st->Ist.AbiHint.base); |
| return; |
| case Ist_PutI: |
| addUses_Expr(set, st->Ist.PutI.ix); |
| addUses_Expr(set, st->Ist.PutI.data); |
| return; |
| case Ist_Tmp: |
| addUses_Expr(set, st->Ist.Tmp.data); |
| return; |
| case Ist_Put: |
| addUses_Expr(set, st->Ist.Put.data); |
| return; |
| case Ist_Store: |
| addUses_Expr(set, st->Ist.Store.addr); |
| addUses_Expr(set, st->Ist.Store.data); |
| return; |
| case Ist_Dirty: |
| d = st->Ist.Dirty.details; |
| if (d->mFx != Ifx_None) |
| addUses_Expr(set, d->mAddr); |
| addUses_Expr(set, d->guard); |
| for (i = 0; d->args[i] != NULL; i++) |
| addUses_Expr(set, d->args[i]); |
| return; |
| case Ist_NoOp: |
| case Ist_IMark: |
| case Ist_MFence: |
| return; |
| case Ist_Exit: |
| addUses_Expr(set, st->Ist.Exit.guard); |
| return; |
| default: |
| vex_printf("\n"); |
| ppIRStmt(st); |
| vpanic("addUses_Stmt"); |
| } |
| } |
| |
| |
| /* Is this literally IRExpr_Const(IRConst_U1(False)) ? */ |
| static Bool isZeroU1 ( IRExpr* e ) |
| { |
| return toBool( e->tag == Iex_Const |
| && e->Iex.Const.con->tag == Ico_U1 |
| && e->Iex.Const.con->Ico.U1 == False ); |
| } |
| |
| /* Is this literally IRExpr_Const(IRConst_U1(True)) ? */ |
| static Bool isOneU1 ( IRExpr* e ) |
| { |
| return toBool( e->tag == Iex_Const |
| && e->Iex.Const.con->tag == Ico_U1 |
| && e->Iex.Const.con->Ico.U1 == True ); |
| } |
| |
| |
| /* Note, this destructively modifies the given IRBB. */ |
| |
| /* Scan backwards through statements, carrying a set of IRTemps which |
| are known to be used after the current point. On encountering 't = |
| E', delete the binding if it is not used. Otherwise, add any temp |
| uses to the set and keep on moving backwards. |
| |
| As an enhancement, the first (backwards) pass searches for IR exits |
| with always-taken conditions and notes the location of the earliest |
| one in the block. If any such are found, a second pass copies the |
| exit destination and jump kind to the bb-end. Then, the exit and |
| all statements following it are turned into no-ops. |
| */ |
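| /* For example (illustrative), if 't7 = Add32(t1,t2)' assigns a temp |
| that is never used by any later statement or by bb->next, the |
| assignment is turned into a no-op; and if an exit whose guard has |
| folded to 1:I1 is found, its destination and jump kind are copied |
| to the block end and it, plus everything after it, is no-op'd. */ |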
| |
| /* notstatic */ void do_deadcode_BB ( IRBB* bb ) |
| { |
| Int i, i_unconditional_exit; |
| Int n_tmps = bb->tyenv->types_used; |
| Bool* set = LibVEX_Alloc(n_tmps * sizeof(Bool)); |
| IRStmt* st; |
| |
| for (i = 0; i < n_tmps; i++) |
| set[i] = False; |
| |
| /* start off by recording IRTemp uses in the next field. */ |
| addUses_Expr(set, bb->next); |
| |
| /* First pass */ |
| |
| /* Work backwards through the stmts */ |
| i_unconditional_exit = -1; |
| for (i = bb->stmts_used-1; i >= 0; i--) { |
| st = bb->stmts[i]; |
| if (st->tag == Ist_NoOp) |
| continue; |
| /* take note of any unconditional exits */ |
| if (st->tag == Ist_Exit |
| && isOneU1(st->Ist.Exit.guard)) |
| i_unconditional_exit = i; |
| if (st->tag == Ist_Tmp |
| && set[(Int)(st->Ist.Tmp.tmp)] == False) { |
| /* it's an IRTemp which never got used. Delete it. */ |
| if (DEBUG_IROPT) { |
| vex_printf("DEAD: "); |
| ppIRStmt(st); |
| vex_printf("\n"); |
| } |
| bb->stmts[i] = IRStmt_NoOp(); |
| } |
| else |
| if (st->tag == Ist_Dirty |
| && st->Ist.Dirty.details->guard |
| && isZeroU1(st->Ist.Dirty.details->guard)) { |
| /* This is a dirty helper which will never get called. |
| Delete it. */ |
| bb->stmts[i] = IRStmt_NoOp(); |
| } |
| else { |
| /* Note any IRTemp uses made by the current statement. */ |
| addUses_Stmt(set, st); |
| } |
| } |
| |
|    /* Optional second pass: if any unconditional exit was found, |
|       copy the destination and jump kind of the earliest one to the |
|       bb end, then turn it and all following stmts into no-ops. */ |
| |
| if (i_unconditional_exit != -1) { |
| if (0) vex_printf("ZAPPING ALL FORWARDS from %d\n", |
| i_unconditional_exit); |
| vassert(i_unconditional_exit >= 0 |
| && i_unconditional_exit < bb->stmts_used); |
| bb->next |
| = IRExpr_Const( bb->stmts[i_unconditional_exit]->Ist.Exit.dst ); |
| bb->jumpkind |
| = bb->stmts[i_unconditional_exit]->Ist.Exit.jk; |
| for (i = i_unconditional_exit; i < bb->stmts_used; i++) |
| bb->stmts[i] = IRStmt_NoOp(); |
| } |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- Specialisation of helper function calls, in ---*/ |
| /*--- collaboration with the front end ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| static |
| IRBB* spec_helpers_BB ( IRBB* bb, |
| IRExpr* (*specHelper) ( HChar*, IRExpr**) ) |
| { |
| Int i; |
| IRStmt* st; |
| IRExpr* ex; |
| Bool any = False; |
| |
| for (i = bb->stmts_used-1; i >= 0; i--) { |
| st = bb->stmts[i]; |
| |
| if (st->tag != Ist_Tmp |
| || st->Ist.Tmp.data->tag != Iex_CCall) |
| continue; |
| |
| ex = (*specHelper)( st->Ist.Tmp.data->Iex.CCall.cee->name, |
| st->Ist.Tmp.data->Iex.CCall.args ); |
| if (!ex) |
| /* the front end can't think of a suitable replacement */ |
| continue; |
| |
| /* We got something better. Install it in the bb. */ |
| any = True; |
| bb->stmts[i] |
| = IRStmt_Tmp(st->Ist.Tmp.tmp, ex); |
| |
| if (0) { |
| vex_printf("SPEC: "); |
| ppIRExpr(st->Ist.Tmp.data); |
| vex_printf(" --> "); |
| ppIRExpr(ex); |
| vex_printf("\n"); |
| } |
| } |
| |
| if (any) |
| bb = flatten_BB(bb); |
| return bb; |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- Common Subexpression Elimination ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* Expensive in time and space. */ |
| |
| /* Uses two environments: |
|    an IRTemp -> IRTemp mapping |
|    a mapping from AvailExpr* to IRTemp |
| */ |
| |
| typedef |
| struct { |
| enum { Ut, Btt, Btc, Bct, Cf64i } tag; |
| union { |
| /* unop(tmp) */ |
| struct { |
| IROp op; |
| IRTemp arg; |
| } Ut; |
| /* binop(tmp,tmp) */ |
| struct { |
| IROp op; |
| IRTemp arg1; |
| IRTemp arg2; |
| } Btt; |
| /* binop(tmp,const) */ |
| struct { |
| IROp op; |
| IRTemp arg1; |
| IRConst con2; |
| } Btc; |
| /* binop(const,tmp) */ |
| struct { |
| IROp op; |
| IRConst con1; |
| IRTemp arg2; |
| } Bct; |
| /* F64i-style const */ |
| struct { |
| ULong f64i; |
| } Cf64i; |
| } u; |
| } |
| AvailExpr; |
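| /* As a rough sketch (exact IR text may differ), the forms above |
|    correspond to flat expressions like: |
|       Ut    : Not32(t5) |
|       Btt   : Add32(t5,t7) |
|       Btc   : Add32(t5,0x4:I32) |
|       Bct   : Sub32(0x4:I32,t5) |
|       Cf64i : F64i{0x3FF0000000000000} |
|    Expressions of any other shape (Get, Load, CCall, Mux0X, ...) |
|    are simply not considered by this CSE pass. */ |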
| |
| static Bool eq_AvailExpr ( AvailExpr* a1, AvailExpr* a2 ) |
| { |
| if (a1->tag != a2->tag) |
| return False; |
| switch (a1->tag) { |
| case Ut: |
| return toBool( |
| a1->u.Ut.op == a2->u.Ut.op |
| && a1->u.Ut.arg == a2->u.Ut.arg); |
| case Btt: |
| return toBool( |
| a1->u.Btt.op == a2->u.Btt.op |
| && a1->u.Btt.arg1 == a2->u.Btt.arg1 |
| && a1->u.Btt.arg2 == a2->u.Btt.arg2); |
| case Btc: |
| return toBool( |
| a1->u.Btc.op == a2->u.Btc.op |
| && a1->u.Btc.arg1 == a2->u.Btc.arg1 |
| && eqIRConst(&a1->u.Btc.con2, &a2->u.Btc.con2)); |
| case Bct: |
| return toBool( |
| a1->u.Bct.op == a2->u.Bct.op |
| && a1->u.Bct.arg2 == a2->u.Bct.arg2 |
| && eqIRConst(&a1->u.Bct.con1, &a2->u.Bct.con1)); |
| case Cf64i: |
| return toBool(a1->u.Cf64i.f64i == a2->u.Cf64i.f64i); |
| default: vpanic("eq_AvailExpr"); |
| } |
| } |
| |
| static IRExpr* availExpr_to_IRExpr ( AvailExpr* ae ) |
| { |
| IRConst* con; |
| switch (ae->tag) { |
| case Ut: |
| return IRExpr_Unop( ae->u.Ut.op, IRExpr_Tmp(ae->u.Ut.arg) ); |
| case Btt: |
| return IRExpr_Binop( ae->u.Btt.op, |
| IRExpr_Tmp(ae->u.Btt.arg1), |
| IRExpr_Tmp(ae->u.Btt.arg2) ); |
| case Btc: |
| con = LibVEX_Alloc(sizeof(IRConst)); |
| *con = ae->u.Btc.con2; |
| return IRExpr_Binop( ae->u.Btc.op, |
| IRExpr_Tmp(ae->u.Btc.arg1), IRExpr_Const(con) ); |
| case Bct: |
| con = LibVEX_Alloc(sizeof(IRConst)); |
| *con = ae->u.Bct.con1; |
| return IRExpr_Binop( ae->u.Bct.op, |
| IRExpr_Const(con), IRExpr_Tmp(ae->u.Bct.arg2) ); |
| case Cf64i: |
| return IRExpr_Const(IRConst_F64i(ae->u.Cf64i.f64i)); |
| default: |
| vpanic("availExpr_to_IRExpr"); |
| } |
| } |
| |
| inline |
| static IRTemp subst_AvailExpr_Temp ( HashHW* env, IRTemp tmp ) |
| { |
| HWord res; |
| /* env :: IRTemp -> IRTemp */ |
| if (lookupHHW( env, &res, (HWord)tmp )) |
| return (IRTemp)res; |
| else |
| return tmp; |
| } |
| |
| static void subst_AvailExpr ( HashHW* env, AvailExpr* ae ) |
| { |
| /* env :: IRTemp -> IRTemp */ |
| switch (ae->tag) { |
| case Ut: |
| ae->u.Ut.arg = subst_AvailExpr_Temp( env, ae->u.Ut.arg ); |
| break; |
| case Btt: |
| ae->u.Btt.arg1 = subst_AvailExpr_Temp( env, ae->u.Btt.arg1 ); |
| ae->u.Btt.arg2 = subst_AvailExpr_Temp( env, ae->u.Btt.arg2 ); |
| break; |
| case Btc: |
| ae->u.Btc.arg1 = subst_AvailExpr_Temp( env, ae->u.Btc.arg1 ); |
| break; |
| case Bct: |
| ae->u.Bct.arg2 = subst_AvailExpr_Temp( env, ae->u.Bct.arg2 ); |
| break; |
| case Cf64i: |
| break; |
| default: |
| vpanic("subst_AvailExpr"); |
| } |
| } |
| |
| static AvailExpr* irExpr_to_AvailExpr ( IRExpr* e ) |
| { |
| AvailExpr* ae; |
| |
| if (e->tag == Iex_Unop |
| && e->Iex.Unop.arg->tag == Iex_Tmp) { |
| ae = LibVEX_Alloc(sizeof(AvailExpr)); |
| ae->tag = Ut; |
| ae->u.Ut.op = e->Iex.Unop.op; |
| ae->u.Ut.arg = e->Iex.Unop.arg->Iex.Tmp.tmp; |
| return ae; |
| } |
| |
| if (e->tag == Iex_Binop |
| && e->Iex.Binop.arg1->tag == Iex_Tmp |
| && e->Iex.Binop.arg2->tag == Iex_Tmp) { |
| ae = LibVEX_Alloc(sizeof(AvailExpr)); |
| ae->tag = Btt; |
| ae->u.Btt.op = e->Iex.Binop.op; |
| ae->u.Btt.arg1 = e->Iex.Binop.arg1->Iex.Tmp.tmp; |
| ae->u.Btt.arg2 = e->Iex.Binop.arg2->Iex.Tmp.tmp; |
| return ae; |
| } |
| |
| if (e->tag == Iex_Binop |
| && e->Iex.Binop.arg1->tag == Iex_Tmp |
| && e->Iex.Binop.arg2->tag == Iex_Const) { |
| ae = LibVEX_Alloc(sizeof(AvailExpr)); |
| ae->tag = Btc; |
| ae->u.Btc.op = e->Iex.Binop.op; |
| ae->u.Btc.arg1 = e->Iex.Binop.arg1->Iex.Tmp.tmp; |
| ae->u.Btc.con2 = *(e->Iex.Binop.arg2->Iex.Const.con); |
| return ae; |
| } |
| |
| if (e->tag == Iex_Binop |
| && e->Iex.Binop.arg1->tag == Iex_Const |
| && e->Iex.Binop.arg2->tag == Iex_Tmp) { |
| ae = LibVEX_Alloc(sizeof(AvailExpr)); |
| ae->tag = Bct; |
| ae->u.Bct.op = e->Iex.Binop.op; |
| ae->u.Bct.arg2 = e->Iex.Binop.arg2->Iex.Tmp.tmp; |
| ae->u.Bct.con1 = *(e->Iex.Binop.arg1->Iex.Const.con); |
| return ae; |
| } |
| |
| if (e->tag == Iex_Const |
| && e->Iex.Const.con->tag == Ico_F64i) { |
| ae = LibVEX_Alloc(sizeof(AvailExpr)); |
| ae->tag = Cf64i; |
| ae->u.Cf64i.f64i = e->Iex.Const.con->Ico.F64i; |
| return ae; |
| } |
| |
| return NULL; |
| } |
| |
| |
| /* The BB is modified in-place. */ |
| |
| void do_cse_BB ( IRBB* bb ) |
| { |
| Int i, j; |
| IRTemp t, q; |
| IRStmt* st; |
| AvailExpr* eprime; |
| |
| HashHW* tenv = newHHW(); /* :: IRTemp -> IRTemp */ |
| HashHW* aenv = newHHW(); /* :: AvailExpr* -> IRTemp */ |
| |
| vassert(sizeof(IRTemp) <= sizeof(HWord)); |
| |
| //ppIRBB(bb); |
| //vex_printf("\n\n"); |
| |
| /* Iterate forwards over the stmts. |
|       On seeing "t = E", where E is one of the AvailExpr forms above: |
| let E' = apply tenv substitution to E |
| search aenv for E' |
| if a mapping E' -> q is found, |
| replace this stmt by "t = q" |
| and add binding t -> q to tenv |
| else |
| add binding E' -> t to aenv |
| replace this stmt by "t = E'" |
| Ignore any other kind of stmt. |
| */ |
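|    /* A small worked example (hypothetical IR fragment): |
|          t10 = Add32(t1,t2) |
|          t11 = Add32(t1,t2) |
|          t12 = Add32(t11,0x4:I32) |
|       The first stmt adds the binding Add32(t1,t2) -> t10 to aenv. |
|       At the second stmt that binding is found, so the stmt becomes |
|       "t11 = t10" and t11 -> t10 is added to tenv.  At the third |
|       stmt, applying tenv first rewrites the candidate expression to |
|       Add32(t10,0x4:I32) before aenv is searched. */ |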
| for (i = 0; i < bb->stmts_used; i++) { |
| st = bb->stmts[i]; |
| |
| /* ignore not-interestings */ |
| if (st->tag != Ist_Tmp) |
| continue; |
| |
| t = st->Ist.Tmp.tmp; |
| eprime = irExpr_to_AvailExpr(st->Ist.Tmp.data); |
| /* ignore if not of AvailExpr form */ |
| if (!eprime) |
| continue; |
| |
| /* vex_printf("considering: " ); ppIRStmt(st); vex_printf("\n"); */ |
| |
| /* apply tenv */ |
| subst_AvailExpr( tenv, eprime ); |
| |
| /* search aenv for eprime, unfortunately the hard way */ |
| for (j = 0; j < aenv->used; j++) |
| if (aenv->inuse[j] && eq_AvailExpr(eprime, (AvailExpr*)aenv->key[j])) |
| break; |
| |
| if (j < aenv->used) { |
| /* A binding E' -> q was found. Replace stmt by "t = q" and |
| note the t->q binding in tenv. */ |
| /* (this is the core of the CSE action) */ |
| q = (IRTemp)aenv->val[j]; |
| bb->stmts[i] = IRStmt_Tmp( t, IRExpr_Tmp(q) ); |
| addToHHW( tenv, (HWord)t, (HWord)q ); |
| } else { |
| /* No binding was found, so instead we add E' -> t to our |
| collection of available expressions, replace this stmt |
| with "t = E'", and move on. */ |
| bb->stmts[i] = IRStmt_Tmp( t, availExpr_to_IRExpr(eprime) ); |
| addToHHW( aenv, (HWord)eprime, (HWord)t ); |
| } |
| } |
| |
| //ppIRBB(bb); |
| //sanityCheckIRBB(bb, Ity_I32); |
| //vex_printf("\n\n"); |
| |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- Add32/Sub32 chain collapsing ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* ----- Helper functions for Add32/Sub32 chain collapsing ----- */ |
| |
| /* Is this expression "Add32(tmp,const)" or "Sub32(tmp,const)" ? If |
| yes, set *tmp and *i32 appropriately. *i32 is set as if the |
| root node is Add32, not Sub32. */ |
| |
| static Bool isAdd32OrSub32 ( IRExpr* e, IRTemp* tmp, Int* i32 ) |
| { |
| if (e->tag != Iex_Binop) |
| return False; |
| if (e->Iex.Binop.op != Iop_Add32 && e->Iex.Binop.op != Iop_Sub32) |
| return False; |
| if (e->Iex.Binop.arg1->tag != Iex_Tmp) |
| return False; |
| if (e->Iex.Binop.arg2->tag != Iex_Const) |
| return False; |
| *tmp = e->Iex.Binop.arg1->Iex.Tmp.tmp; |
| *i32 = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32); |
| if (e->Iex.Binop.op == Iop_Sub32) |
| *i32 = -*i32; |
| return True; |
| } |
| |
| |
| /* Figure out if tmp can be expressed as tmp2 +32 const, for some |
| other tmp2. Scan backwards from the specified start point -- an |
| optimisation. */ |
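| /* For example (a hypothetical fragment): |
|       t2 = Add32(t1,0x4:I32) |
|       t3 = Sub32(t2,0x1:I32) |
|    Assuming t1 is bound earlier to something which is not an |
|    Add32/Sub32-of-a-constant (eg a Get), collapseChain applied to t3 |
|    returns True with *tmp2 = t1 and *i32 = 3, ie t3 == t1 + 3. */ |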
| |
| static Bool collapseChain ( IRBB* bb, Int startHere, |
| IRTemp tmp, |
| IRTemp* tmp2, Int* i32 ) |
| { |
| Int j, ii; |
| IRTemp vv; |
| IRStmt* st; |
| IRExpr* e; |
| |
|    /* The (var, con) pair contains the current 'representation' for |
|       'tmp'.  We start with 'tmp + 0'. */ |
| IRTemp var = tmp; |
| Int con = 0; |
| |
| /* Scan backwards to see if tmp can be replaced by some other tmp |
| +/- a constant. */ |
| for (j = startHere; j >= 0; j--) { |
| st = bb->stmts[j]; |
| if (st->tag != Ist_Tmp) |
| continue; |
| if (st->Ist.Tmp.tmp != var) |
| continue; |
| e = st->Ist.Tmp.data; |
| if (!isAdd32OrSub32(e, &vv, &ii)) |
| break; |
| var = vv; |
| con += ii; |
| } |
| if (j == -1) |
| /* no earlier binding for var .. ill-formed IR */ |
| vpanic("collapseChain"); |
| |
| /* so, did we find anything interesting? */ |
| if (var == tmp) |
| return False; /* no .. */ |
| |
| *tmp2 = var; |
| *i32 = con; |
| return True; |
| } |
| |
| |
| /* ------- Main function for Add32/Sub32 chain collapsing ------ */ |
| |
| static void collapse_AddSub_chains_BB ( IRBB* bb ) |
| { |
| IRStmt *st; |
| IRTemp var, var2; |
| Int i, con, con2; |
| |
| for (i = bb->stmts_used-1; i >= 0; i--) { |
| st = bb->stmts[i]; |
| if (st->tag == Ist_NoOp) |
| continue; |
| |
| /* Try to collapse 't1 = Add32/Sub32(t2, con)'. */ |
| |
| if (st->tag == Ist_Tmp |
| && isAdd32OrSub32(st->Ist.Tmp.data, &var, &con)) { |
| |
| /* So e1 is of the form Add32(var,con) or Sub32(var,-con). |
| Find out if var can be expressed as var2 + con2. */ |
| if (collapseChain(bb, i-1, var, &var2, &con2)) { |
| if (DEBUG_IROPT) { |
| vex_printf("replacing1 "); |
| ppIRStmt(st); |
| vex_printf(" with "); |
| } |
| con2 += con; |
| bb->stmts[i] |
| = IRStmt_Tmp( |
| st->Ist.Tmp.tmp, |
| (con2 >= 0) |
| ? IRExpr_Binop(Iop_Add32, |
| IRExpr_Tmp(var2), |
| IRExpr_Const(IRConst_U32(con2))) |
| : IRExpr_Binop(Iop_Sub32, |
| IRExpr_Tmp(var2), |
| IRExpr_Const(IRConst_U32(-con2))) |
| ); |
| if (DEBUG_IROPT) { |
| ppIRStmt(bb->stmts[i]); |
| vex_printf("\n"); |
| } |
| } |
| |
| continue; |
| } |
| |
| /* Try to collapse 't1 = GetI[t2, con]'. */ |
| |
| if (st->tag == Ist_Tmp |
| && st->Ist.Tmp.data->tag == Iex_GetI |
| && st->Ist.Tmp.data->Iex.GetI.ix->tag == Iex_Tmp |
| && collapseChain(bb, i-1, st->Ist.Tmp.data->Iex.GetI.ix |
| ->Iex.Tmp.tmp, &var2, &con2)) { |
| if (DEBUG_IROPT) { |
| vex_printf("replacing3 "); |
| ppIRStmt(st); |
| vex_printf(" with "); |
| } |
| con2 += st->Ist.Tmp.data->Iex.GetI.bias; |
| bb->stmts[i] |
| = IRStmt_Tmp( |
| st->Ist.Tmp.tmp, |
| IRExpr_GetI(st->Ist.Tmp.data->Iex.GetI.descr, |
| IRExpr_Tmp(var2), |
| con2)); |
| if (DEBUG_IROPT) { |
| ppIRStmt(bb->stmts[i]); |
| vex_printf("\n"); |
| } |
| continue; |
| } |
| |
| /* Perhaps st is PutI[t, con] ? */ |
| |
| if (st->tag == Ist_PutI |
| && st->Ist.PutI.ix->tag == Iex_Tmp |
| && collapseChain(bb, i-1, st->Ist.PutI.ix->Iex.Tmp.tmp, |
| &var2, &con2)) { |
| if (DEBUG_IROPT) { |
| vex_printf("replacing2 "); |
| ppIRStmt(st); |
| vex_printf(" with "); |
| } |
| con2 += st->Ist.PutI.bias; |
| bb->stmts[i] |
| = IRStmt_PutI(st->Ist.PutI.descr, |
| IRExpr_Tmp(var2), |
| con2, |
| st->Ist.PutI.data); |
| if (DEBUG_IROPT) { |
| ppIRStmt(bb->stmts[i]); |
| vex_printf("\n"); |
| } |
| continue; |
| } |
| |
| } /* for */ |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- PutI/GetI transformations ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* ------- Helper functions for PutI/GetI transformations ------ */ |
| |
| /* Determine, to the extent possible, the relationship between two |
| guest state accesses. The possible outcomes are: |
| |
| * Exact alias. These two accesses denote precisely the same |
| piece of the guest state. |
| |
| * Definitely no alias. These two accesses are guaranteed not to |
| overlap any part of the guest state. |
| |
| * Unknown -- if neither of the above can be established. |
| |
| If in doubt, return Unknown. */ |
| |
| typedef |
| enum { ExactAlias, NoAlias, UnknownAlias } |
| GSAliasing; |
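| /* For example, take a hypothetical descriptor for an 8-element I64 |
|    array at guest offset 128 (so it covers bytes 128..191).  A |
|    non-indexed 4-byte access at offset 200 or at offset 100 gives |
|    NoAlias; one at offset 160 gives UnknownAlias, since it touches |
|    the array but we don't try to work out which element.  Two |
|    indexed accesses into that array with identical (atomic) index |
|    expressions and biases 1 and 9 give ExactAlias, the biases being |
|    equal modulo nElems; with biases 1 and 2 the result is NoAlias. */ |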
| |
| |
| /* Produces the alias relation between an indexed guest |
| state access and a non-indexed access. */ |
| |
| static |
| GSAliasing getAliasingRelation_IC ( IRArray* descr1, IRExpr* ix1, |
| Int offset2, IRType ty2 ) |
| { |
| UInt minoff1, maxoff1, minoff2, maxoff2; |
| |
| getArrayBounds( descr1, &minoff1, &maxoff1 ); |
| minoff2 = offset2; |
| maxoff2 = minoff2 + sizeofIRType(ty2) - 1; |
| |
| if (maxoff1 < minoff2 || maxoff2 < minoff1) |
| return NoAlias; |
| |
|    /* Could probably do better here if required.  For the moment, |
|       however, just claim not to know anything more. */ |
| return UnknownAlias; |
| } |
| |
| |
| /* Produces the alias relation between two indexed guest state |
| accesses. */ |
| |
| static |
| GSAliasing getAliasingRelation_II ( |
| IRArray* descr1, IRExpr* ix1, Int bias1, |
| IRArray* descr2, IRExpr* ix2, Int bias2 |
| ) |
| { |
| UInt minoff1, maxoff1, minoff2, maxoff2; |
| Int iters; |
| |
| /* First try hard to show they don't alias. */ |
| getArrayBounds( descr1, &minoff1, &maxoff1 ); |
| getArrayBounds( descr2, &minoff2, &maxoff2 ); |
| if (maxoff1 < minoff2 || maxoff2 < minoff1) |
| return NoAlias; |
| |
| /* So the two arrays at least partially overlap. To get any |
| further we'll have to be sure that the descriptors are |
| identical. */ |
| if (!eqIRArray(descr1, descr2)) |
| return UnknownAlias; |
| |
| /* The descriptors are identical. Now the only difference can be |
| in the index expressions. If they cannot be shown to be |
| identical, we have to say we don't know what the aliasing |
| relation will be. Now, since the IR is flattened, the index |
| expressions should be atoms -- either consts or tmps. So that |
| makes the comparison simple. */ |
| vassert(isIRAtom(ix1)); |
| vassert(isIRAtom(ix2)); |
| if (!eqIRAtom(ix1,ix2)) |
| return UnknownAlias; |
| |
|    /* Ok, the index expressions are identical, so the two accesses |
|       index the same array from the same base.  The only remaining |
|       possible difference is in the bias.  Are the biases equal, |
|       modulo the array size?  Normalise them paranoidly, so as to |
|       reliably establish equality/non-equality. */ |
| vassert(descr1->nElems == descr2->nElems); |
| vassert(descr1->elemTy == descr2->elemTy); |
| vassert(descr1->base == descr2->base); |
| iters = 0; |
| while (bias1 < 0 || bias2 < 0) { |
| bias1 += descr1->nElems; |
| bias2 += descr1->nElems; |
| iters++; |
| if (iters > 10) |
| vpanic("getAliasingRelation: iters"); |
| } |
| vassert(bias1 >= 0 && bias2 >= 0); |
| bias1 %= descr1->nElems; |
| bias2 %= descr1->nElems; |
| vassert(bias1 >= 0 && bias1 < descr1->nElems); |
| vassert(bias2 >= 0 && bias2 < descr1->nElems); |
| |
|    /* Finally, bias1 and bias2 are normalised into the range |
|       0 .. descr1->nElems - 1, and so we can establish |
|       equality/non-equality. */ |
| |
| return bias1==bias2 ? ExactAlias : NoAlias; |
| } |
| |
| |
| /* Given the parts (descr, tmp, bias) for a GetI, scan backwards from |
| the given starting point to find, if any, a PutI which writes |
| exactly the same piece of guest state, and so return the expression |
| that the PutI writes. This is the core of PutI-GetI forwarding. */ |
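| /* Sketch of the intended effect (hypothetical fragment; the printed |
|    form of GetI/PutI is only schematic here): |
|       PUTI(d)[t7,3] = t20 |
|       ... stmts which don't write that part of the guest state ... |
|       t30 = GETI(d)[t7,3] |
|    findPutI, scanning backwards from the GetI, returns t20, so that |
|    do_redundant_GetI_elimination below can rewrite the last stmt as |
|    "t30 = t20". */ |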
| |
| static |
| IRExpr* findPutI ( IRBB* bb, Int startHere, |
| IRArray* descrG, IRExpr* ixG, Int biasG ) |
| { |
| Int j; |
| IRStmt* st; |
| GSAliasing relation; |
| |
| if (0) { |
| vex_printf("\nfindPutI "); |
| ppIRArray(descrG); |
| vex_printf(" "); |
| ppIRExpr(ixG); |
| vex_printf(" %d\n", biasG); |
| } |
| |
| /* Scan backwards in bb from startHere to find a suitable PutI |
| binding for (descrG, ixG, biasG), if any. */ |
| |
| for (j = startHere; j >= 0; j--) { |
| st = bb->stmts[j]; |
| if (st->tag == Ist_NoOp) |
| continue; |
| |
| if (st->tag == Ist_Put) { |
| /* Non-indexed Put. This can't give a binding, but we do |
| need to check it doesn't invalidate the search by |
| overlapping any part of the indexed guest state. */ |
| |
| relation |
| = getAliasingRelation_IC( |
| descrG, ixG, |
| st->Ist.Put.offset, |
| typeOfIRExpr(bb->tyenv,st->Ist.Put.data) ); |
| |
| if (relation == NoAlias) { |
| /* we're OK; keep going */ |
| continue; |
| } else { |
| /* relation == UnknownAlias || relation == ExactAlias */ |
| /* If this assertion fails, we've found a Put which writes |
| an area of guest state which is read by a GetI. Which |
| is unlikely (although not per se wrong). */ |
| vassert(relation != ExactAlias); |
| /* This Put potentially writes guest state that the GetI |
| reads; we must fail. */ |
| return NULL; |
| } |
| } |
| |
| if (st->tag == Ist_PutI) { |
| |
| relation = getAliasingRelation_II( |
| descrG, ixG, biasG, |
| st->Ist.PutI.descr, |
| st->Ist.PutI.ix, |
| st->Ist.PutI.bias |
| ); |
| |
| if (relation == NoAlias) { |
| /* This PutI definitely doesn't overlap. Ignore it and |
| keep going. */ |
| continue; /* the for j loop */ |
| } |
| |
| if (relation == UnknownAlias) { |
|             /* We don't know whether this PutI writes to the same |
|                guest state as the GetI reads, so we have to give up. */ |
| return NULL; |
| } |
| |
| /* Otherwise, we've found what we're looking for. */ |
| vassert(relation == ExactAlias); |
| return st->Ist.PutI.data; |
| |
| } /* if (st->tag == Ist_PutI) */ |
| |
| if (st->tag == Ist_Dirty) { |
| /* Be conservative. If the dirty call has any guest effects at |
| all, give up. We could do better -- only give up if there |
| are any guest writes/modifies. */ |
| if (st->Ist.Dirty.details->nFxState > 0) |
| return NULL; |
| } |
| |
| } /* for */ |
| |
| /* No valid replacement was found. */ |
| return NULL; |
| } |
| |
| |
| |
| /* Assuming pi is a PutI stmt, is s2 identical to it (in the sense |
| that it writes exactly the same piece of guest state) ? Safe |
| answer: False. */ |
| |
| static Bool identicalPutIs ( IRStmt* pi, IRStmt* s2 ) |
| { |
| vassert(pi->tag == Ist_PutI); |
| if (s2->tag != Ist_PutI) |
| return False; |
| |
| return toBool( |
| getAliasingRelation_II( |
| pi->Ist.PutI.descr, pi->Ist.PutI.ix, pi->Ist.PutI.bias, |
| s2->Ist.PutI.descr, s2->Ist.PutI.ix, s2->Ist.PutI.bias |
| ) |
| == ExactAlias |
| ); |
| } |
| |
| |
| /* Assuming pi is a PutI stmt, is s2 a Get/GetI/Put/PutI which might |
| overlap it? Safe answer: True. Note, we could do a lot better |
| than this if needed. */ |
| |
| static |
| Bool guestAccessWhichMightOverlapPutI ( |
| IRTypeEnv* tyenv, IRStmt* pi, IRStmt* s2 |
| ) |
| { |
| GSAliasing relation; |
| UInt minoffP, maxoffP; |
| |
| vassert(pi->tag == Ist_PutI); |
| getArrayBounds(pi->Ist.PutI.descr, &minoffP, &maxoffP); |
| switch (s2->tag) { |
| |
| case Ist_NoOp: |
| case Ist_IMark: |
| return False; |
| |
| case Ist_MFence: |
| case Ist_AbiHint: |
| /* just be paranoid ... these should be rare. */ |
| return True; |
| |
| case Ist_Dirty: |
| /* If the dirty call has any guest effects at all, give up. |
| Probably could do better. */ |
| if (s2->Ist.Dirty.details->nFxState > 0) |
| return True; |
| return False; |
| |
| case Ist_Put: |
| vassert(isIRAtom(s2->Ist.Put.data)); |
| relation |
| = getAliasingRelation_IC( |
| pi->Ist.PutI.descr, pi->Ist.PutI.ix, |
| s2->Ist.Put.offset, |
| typeOfIRExpr(tyenv,s2->Ist.Put.data) |
| ); |
| goto have_relation; |
| |
| case Ist_PutI: |
| vassert(isIRAtom(s2->Ist.PutI.ix)); |
| vassert(isIRAtom(s2->Ist.PutI.data)); |
| relation |
| = getAliasingRelation_II( |
| pi->Ist.PutI.descr, pi->Ist.PutI.ix, pi->Ist.PutI.bias, |
| s2->Ist.PutI.descr, s2->Ist.PutI.ix, s2->Ist.PutI.bias |
| ); |
| goto have_relation; |
| |
| case Ist_Tmp: |
| if (s2->Ist.Tmp.data->tag == Iex_GetI) { |
| relation |
| = getAliasingRelation_II( |
| pi->Ist.PutI.descr, pi->Ist.PutI.ix, |
| pi->Ist.PutI.bias, |
| s2->Ist.Tmp.data->Iex.GetI.descr, |
| s2->Ist.Tmp.data->Iex.GetI.ix, |
| s2->Ist.Tmp.data->Iex.GetI.bias |
| ); |
| goto have_relation; |
| } |
| if (s2->Ist.Tmp.data->tag == Iex_Get) { |
| relation |
| = getAliasingRelation_IC( |
| pi->Ist.PutI.descr, pi->Ist.PutI.ix, |
| s2->Ist.Tmp.data->Iex.Get.offset, |
| s2->Ist.Tmp.data->Iex.Get.ty |
| ); |
| goto have_relation; |
| } |
| return False; |
| |
| case Ist_Store: |
| vassert(isIRAtom(s2->Ist.Store.addr)); |
| vassert(isIRAtom(s2->Ist.Store.data)); |
| return False; |
| |
| default: |
| vex_printf("\n"); ppIRStmt(s2); vex_printf("\n"); |
| vpanic("guestAccessWhichMightOverlapPutI"); |
| } |
| |
| have_relation: |
| if (relation == NoAlias) |
| return False; |
| else |
| return True; /* ExactAlias or UnknownAlias */ |
| } |
| |
| |
| |
| /* ---------- PutI/GetI transformations main functions --------- */ |
| |
| /* Remove redundant GetIs, to the extent that they can be detected. |
| bb is modified in-place. */ |
| |
| static |
| void do_redundant_GetI_elimination ( IRBB* bb ) |
| { |
| Int i; |
| IRStmt* st; |
| |
| for (i = bb->stmts_used-1; i >= 0; i--) { |
| st = bb->stmts[i]; |
| if (st->tag == Ist_NoOp) |
| continue; |
| |
| if (st->tag == Ist_Tmp |
| && st->Ist.Tmp.data->tag == Iex_GetI |
| && st->Ist.Tmp.data->Iex.GetI.ix->tag == Iex_Tmp) { |
| IRArray* descr = st->Ist.Tmp.data->Iex.GetI.descr; |
| IRExpr* ix = st->Ist.Tmp.data->Iex.GetI.ix; |
| Int bias = st->Ist.Tmp.data->Iex.GetI.bias; |
| IRExpr* replacement = findPutI(bb, i-1, descr, ix, bias); |
| if (replacement |
| && isIRAtom(replacement) |
| /* Make sure we're doing a type-safe transformation! */ |
| && typeOfIRExpr(bb->tyenv, replacement) == descr->elemTy) { |
| if (DEBUG_IROPT) { |
| vex_printf("rGI: "); |
| ppIRExpr(st->Ist.Tmp.data); |
| vex_printf(" -> "); |
| ppIRExpr(replacement); |
| vex_printf("\n"); |
| } |
| bb->stmts[i] = IRStmt_Tmp(st->Ist.Tmp.tmp, replacement); |
| } |
| } |
| } |
| |
| } |
| |
| |
| /* Remove redundant PutIs, to the extent that they can be detected. |
| bb is modified in-place. */ |
| |
| static |
| void do_redundant_PutI_elimination ( IRBB* bb ) |
| { |
| Int i, j; |
| Bool delete; |
| IRStmt *st, *stj; |
| |
| for (i = 0; i < bb->stmts_used; i++) { |
| st = bb->stmts[i]; |
| if (st->tag != Ist_PutI) |
| continue; |
| /* Ok, search forwards from here to see if we can find another |
| PutI which makes this one redundant, and dodging various |
| hazards. Search forwards: |
| * If conditional exit, give up (because anything after that |
| does not postdominate this put). |
|          * If a Get which might overlap, give up (because this PutI |
|            is then not necessarily dead). |
|          * If a PutI which is identical, stop with success. |
|          * If a Put/PutI which might overlap, but is not identical, |
|            give up. |
| * If a dirty helper call which might write guest state, give up. |
| * If a Put which definitely doesn't overlap, or any other |
| kind of stmt, continue. |
| */ |
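|       /* For example (hypothetical fragment, schematic PutI syntax): |
|             PUTI(d)[t7,1] = t11 |
|             t12 = LDle:I32(t3) |
|             PUTI(d)[t7,1] = t12 |
|          The first PutI is made redundant by the second: they write |
|          exactly the same element and the intervening load cannot |
|          observe guest state, so the first is turned into a no-op. */ |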
| delete = False; |
| for (j = i+1; j < bb->stmts_used; j++) { |
| stj = bb->stmts[j]; |
| if (stj->tag == Ist_NoOp) |
| continue; |
| if (identicalPutIs(st, stj)) { |
| /* success! */ |
| delete = True; |
| break; |
| } |
| if (stj->tag == Ist_Exit) |
| /* give up */ |
| break; |
|          if (stj->tag == Ist_Dirty) |
|             /* give up; could do better here */ |
|             break; |
| if (guestAccessWhichMightOverlapPutI(bb->tyenv, st, stj)) |
| /* give up */ |
| break; |
| } |
| |
| if (delete) { |
| if (DEBUG_IROPT) { |
| vex_printf("rPI: "); |
| ppIRStmt(st); |
| vex_printf("\n"); |
| } |
| bb->stmts[i] = IRStmt_NoOp(); |
| } |
| |
| } |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- Loop unrolling ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* Adjust all tmp values (names) in e by delta. e is destructively |
| modified. */ |
| |
| static void deltaIRExpr ( IRExpr* e, Int delta ) |
| { |
| Int i; |
| switch (e->tag) { |
| case Iex_Tmp: |
| e->Iex.Tmp.tmp += delta; |
| break; |
| case Iex_Get: |
| case Iex_Const: |
| break; |
| case Iex_GetI: |
| deltaIRExpr(e->Iex.GetI.ix, delta); |
| break; |
| case Iex_Binop: |
| deltaIRExpr(e->Iex.Binop.arg1, delta); |
| deltaIRExpr(e->Iex.Binop.arg2, delta); |
| break; |
| case Iex_Unop: |
| deltaIRExpr(e->Iex.Unop.arg, delta); |
| break; |
| case Iex_Load: |
| deltaIRExpr(e->Iex.Load.addr, delta); |
| break; |
| case Iex_CCall: |
| for (i = 0; e->Iex.CCall.args[i]; i++) |
| deltaIRExpr(e->Iex.CCall.args[i], delta); |
| break; |
| case Iex_Mux0X: |
| deltaIRExpr(e->Iex.Mux0X.cond, delta); |
| deltaIRExpr(e->Iex.Mux0X.expr0, delta); |
| deltaIRExpr(e->Iex.Mux0X.exprX, delta); |
| break; |
| default: |
| vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); |
| vpanic("deltaIRExpr"); |
| } |
| } |
| |
| /* Adjust all tmp values (names) in st by delta. st is destructively |
| modified. */ |
| |
| static void deltaIRStmt ( IRStmt* st, Int delta ) |
| { |
| Int i; |
| IRDirty* d; |
| switch (st->tag) { |
| case Ist_NoOp: |
| case Ist_IMark: |
| case Ist_MFence: |
| break; |
| case Ist_AbiHint: |
| deltaIRExpr(st->Ist.AbiHint.base, delta); |
| break; |
| case Ist_Put: |
| deltaIRExpr(st->Ist.Put.data, delta); |
| break; |
| case Ist_PutI: |
| deltaIRExpr(st->Ist.PutI.ix, delta); |
| deltaIRExpr(st->Ist.PutI.data, delta); |
| break; |
| case Ist_Tmp: |
| st->Ist.Tmp.tmp += delta; |
| deltaIRExpr(st->Ist.Tmp.data, delta); |
| break; |
| case Ist_Exit: |
| deltaIRExpr(st->Ist.Exit.guard, delta); |
| break; |
| case Ist_Store: |
| deltaIRExpr(st->Ist.Store.addr, delta); |
| deltaIRExpr(st->Ist.Store.data, delta); |
| break; |
| case Ist_Dirty: |
| d = st->Ist.Dirty.details; |
| deltaIRExpr(d->guard, delta); |
| for (i = 0; d->args[i]; i++) |
| deltaIRExpr(d->args[i], delta); |
| if (d->tmp != IRTemp_INVALID) |
| d->tmp += delta; |
| if (d->mAddr) |
| deltaIRExpr(d->mAddr, delta); |
| break; |
| default: |
| vex_printf("\n"); ppIRStmt(st); vex_printf("\n"); |
| vpanic("deltaIRStmt"); |
| } |
| } |
| |
| |
| /* If possible, return a loop-unrolled version of bb0. The original |
| is changed. If not possible, return NULL. */ |
| |
| /* The two schemas considered are: |
| |
| X: BODY; goto X |
| |
| which unrolls to (eg) X: BODY;BODY; goto X |
| |
| and |
| |
| X: BODY; if (c) goto X; goto Y |
| which trivially transforms to |
| X: BODY; if (!c) goto Y; goto X; |
| so it falls in the scope of the first case. |
| |
| X and Y must be literal (guest) addresses. |
| */ |
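| /* The unroll factor is chosen by calc_unroll_factor below from the |
|    number of non-no-op stmts in the block.  Purely as an |
|    illustration (the real limit comes from |
|    vex_control.iropt_unroll_thresh): if that threshold were 60, a |
|    body of up to 7 stmts would be unrolled 8x, up to 15 stmts 4x, |
|    up to 30 stmts 2x, and anything larger not at all. */ |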
| |
| static Int calc_unroll_factor( IRBB* bb ) |
| { |
| Int n_stmts, i; |
| |
| n_stmts = 0; |
| for (i = 0; i < bb->stmts_used; i++) { |
| if (bb->stmts[i]->tag != Ist_NoOp) |
| n_stmts++; |
| } |
| |
| if (n_stmts <= vex_control.iropt_unroll_thresh/8) { |
| if (vex_control.iropt_verbosity > 0) |
| vex_printf("vex iropt: 8 x unrolling (%d sts -> %d sts)\n", |
| n_stmts, 8* n_stmts); |
| return 8; |
| } |
| if (n_stmts <= vex_control.iropt_unroll_thresh/4) { |
| if (vex_control.iropt_verbosity > 0) |
| vex_printf("vex iropt: 4 x unrolling (%d sts -> %d sts)\n", |
| n_stmts, 4* n_stmts); |
| return 4; |
| } |
| |
| if (n_stmts <= vex_control.iropt_unroll_thresh/2) { |
| if (vex_control.iropt_verbosity > 0) |
| vex_printf("vex iropt: 2 x unrolling (%d sts -> %d sts)\n", |
| n_stmts, 2* n_stmts); |
| return 2; |
| } |
| |
| if (vex_control.iropt_verbosity > 0) |
| vex_printf("vex iropt: not unrolling (%d sts)\n", n_stmts); |
| |
| return 1; |
| } |
| |
| |
| static IRBB* maybe_loop_unroll_BB ( IRBB* bb0, Addr64 my_addr ) |
| { |
| Int i, j, jmax, n_vars; |
| Bool xxx_known; |
| Addr64 xxx_value, yyy_value; |
| IRExpr* udst; |
| IRStmt* st; |
| IRConst* con; |
| IRBB *bb1, *bb2; |
| Int unroll_factor; |
| |
| if (vex_control.iropt_unroll_thresh <= 0) |
| return NULL; |
| |
| /* First off, figure out if we can unroll this loop. Do this |
| without modifying bb0. */ |
| |
| if (bb0->jumpkind != Ijk_Boring) |
| return NULL; |
| |
| xxx_known = False; |
| xxx_value = 0; |
| |
| /* Extract the next-guest address. If it isn't a literal, we |
| have to give up. */ |
| |
| udst = bb0->next; |
| if (udst->tag == Iex_Const |
| && (udst->Iex.Const.con->tag == Ico_U32 |
| || udst->Iex.Const.con->tag == Ico_U64)) { |
| /* The BB ends in a jump to a literal location. */ |
| xxx_known = True; |
| xxx_value = udst->Iex.Const.con->tag == Ico_U64 |
| ? udst->Iex.Const.con->Ico.U64 |
| : (Addr64)(udst->Iex.Const.con->Ico.U32); |
| } |
| |
| if (!xxx_known) |
| return NULL; |
| |
|    /* Now we know the BB ends in a jump to a literal location.  If |
| it's a jump to itself (viz, idiom #1), move directly to the |
| unrolling stage, first cloning the bb so the original isn't |
| modified. */ |
| if (xxx_value == my_addr) { |
| unroll_factor = calc_unroll_factor( bb0 ); |
| if (unroll_factor < 2) |
| return NULL; |
| bb1 = dopyIRBB( bb0 ); |
| bb0 = NULL; |
| udst = NULL; /* is now invalid */ |
| goto do_unroll; |
| } |
| |
| /* Search for the second idiomatic form: |
| X: BODY; if (c) goto X; goto Y |
| We know Y, but need to establish that the last stmt |
| is 'if (c) goto X'. |
| */ |
| yyy_value = xxx_value; |
| for (i = bb0->stmts_used-1; i >= 0; i--) |
| if (bb0->stmts[i]) |
| break; |
| |
| if (i < 0) |
| return NULL; /* block with no stmts. Strange. */ |
| |
| st = bb0->stmts[i]; |
| if (st->tag != Ist_Exit) |
| return NULL; |
| if (st->Ist.Exit.jk != Ijk_Boring) |
| return NULL; |
| |
| con = st->Ist.Exit.dst; |
| vassert(con->tag == Ico_U32 || con->tag == Ico_U64); |
| |
| xxx_value = con->tag == Ico_U64 |
| ? st->Ist.Exit.dst->Ico.U64 |
| : (Addr64)(st->Ist.Exit.dst->Ico.U32); |
| |
| /* If this assertion fails, we have some kind of type error. */ |
| vassert(con->tag == udst->Iex.Const.con->tag); |
| |
| if (xxx_value != my_addr) |
| /* We didn't find either idiom. Give up. */ |
| return NULL; |
| |
| /* Ok, we found idiom #2. Copy the BB, switch around the xxx and |
| yyy values (which makes it look like idiom #1), and go into |
| unrolling proper. This means finding (again) the last stmt, in |
| the copied BB. */ |
| |
| unroll_factor = calc_unroll_factor( bb0 ); |
| if (unroll_factor < 2) |
| return NULL; |
| |
| bb1 = dopyIRBB( bb0 ); |
| bb0 = NULL; |
| udst = NULL; /* is now invalid */ |
| for (i = bb1->stmts_used-1; i >= 0; i--) |
| if (bb1->stmts[i]) |
| break; |
| |
| /* The next bunch of assertions should be true since we already |
| found and checked the last stmt in the original bb. */ |
| |
| vassert(i >= 0); |
| |
| st = bb1->stmts[i]; |
| vassert(st->tag == Ist_Exit); |
| |
| con = st->Ist.Exit.dst; |
| vassert(con->tag == Ico_U32 || con->tag == Ico_U64); |
| |
| udst = bb1->next; |
| vassert(udst->tag == Iex_Const); |
| vassert(udst->Iex.Const.con->tag == Ico_U32 |
| || udst->Iex.Const.con->tag == Ico_U64); |
| vassert(con->tag == udst->Iex.Const.con->tag); |
| |
| /* switch the xxx and yyy fields around */ |
| if (con->tag == Ico_U64) { |
| udst->Iex.Const.con->Ico.U64 = xxx_value; |
| con->Ico.U64 = yyy_value; |
| } else { |
| udst->Iex.Const.con->Ico.U32 = (UInt)xxx_value; |
| con->Ico.U32 = (UInt)yyy_value; |
| } |
| |
| /* negate the test condition */ |
| st->Ist.Exit.guard |
| = IRExpr_Unop(Iop_Not1,dopyIRExpr(st->Ist.Exit.guard)); |
| |
|    /* --- The unroller proper.  Both idioms have by --- */ |
|    /* --- now been converted to idiom 1.            --- */ |
| |
| do_unroll: |
| |
| vassert(unroll_factor == 2 |
| || unroll_factor == 4 |
| || unroll_factor == 8); |
| |
| jmax = unroll_factor==8 ? 3 : (unroll_factor==4 ? 2 : 1); |
| for (j = 1; j <= jmax; j++) { |
| |
| n_vars = bb1->tyenv->types_used; |
| |
| bb2 = dopyIRBB(bb1); |
| for (i = 0; i < n_vars; i++) |
| (void)newIRTemp(bb1->tyenv, bb2->tyenv->types[i]); |
| |
| for (i = 0; i < bb2->stmts_used; i++) { |
| /* deltaIRStmt destructively modifies the stmt, but |
| that's OK since bb2 is a complete fresh copy of bb1. */ |
| deltaIRStmt(bb2->stmts[i], n_vars); |
| addStmtToIRBB(bb1, bb2->stmts[i]); |
| } |
| } |
| |
| if (DEBUG_IROPT) { |
| vex_printf("\nUNROLLED (%llx)\n", my_addr); |
| ppIRBB(bb1); |
| vex_printf("\n"); |
| } |
| |
| /* Flattening; sigh. The unroller succeeds in breaking flatness |
| by negating the test condition. This should be fixed properly. |
| For the moment use this shotgun approach. */ |
| return flatten_BB(bb1); |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- The tree builder ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| /* This isn't part of IR optimisation. Really it's a pass done prior |
| to instruction selection, which improves the code that the |
| instruction selector can produce. */ |
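| /* For example (a hypothetical fragment), given |
|       t1 = GET:I32(0) |
|       t2 = Add32(t1,0x4:I32) |
|       PUT(8) = t2 |
|    where t1 and t2 are each used exactly once, the builder deletes |
|    both bindings and emits the single statement |
|       PUT(8) = Add32(GET:I32(0),0x4:I32) |
|    so the instruction selector gets a whole tree to match at once. |
|    Bindings are dumped back out, in their original order, whenever a |
|    later statement could invalidate them (eg a Put invalidating a |
|    previously bound Get). */ |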
| |
| typedef |
| struct { |
| Int occ; /* occurrence count for this tmp */ |
| IRExpr* expr; /* expr it is bound to, |
| or NULL if already 'used' */ |
| Bool eDoesLoad; /* True <=> expr reads mem */ |
| Bool eDoesGet; /* True <=> expr reads guest state */ |
| Bool invalidateMe; /* used when dumping bindings */ |
| Int origPos; /* posn of the binder in the original bb */ |
| } |
| TmpInfo; |
| |
| /* Given env :: IRTemp -> TmpInfo* |
| Add the use-occurrences of temps in this expression |
| to the environment. |
| */ |
| static void occCount_Temp ( TmpInfo** env, IRTemp tmp ) |
| { |
| TmpInfo* ti = env[(Int)tmp]; |
| if (ti) { |
| ti->occ++; |
| } else { |
| ti = LibVEX_Alloc(sizeof(TmpInfo)); |
| ti->occ = 1; |
| ti->expr = NULL; |
| ti->eDoesLoad = False; |
| ti->eDoesGet = False; |
| ti->invalidateMe = False; |
|       ti->origPos      = -1; /* filled in properly later */ |
| env[(Int)tmp] = ti; |
| } |
| } |
| |
| static void occCount_Expr ( TmpInfo** env, IRExpr* e ) |
| { |
| Int i; |
| |
| switch (e->tag) { |
| |
| case Iex_Tmp: /* the only interesting case */ |
| occCount_Temp(env, e->Iex.Tmp.tmp); |
| return; |
| |
| case Iex_Mux0X: |
| occCount_Expr(env, e->Iex.Mux0X.cond); |
| occCount_Expr(env, e->Iex.Mux0X.expr0); |
| occCount_Expr(env, e->Iex.Mux0X.exprX); |
| return; |
| |
| case Iex_Binop: |
| occCount_Expr(env, e->Iex.Binop.arg1); |
| occCount_Expr(env, e->Iex.Binop.arg2); |
| return; |
| |
| case Iex_Unop: |
| occCount_Expr(env, e->Iex.Unop.arg); |
| return; |
| |
| case Iex_Load: |
| occCount_Expr(env, e->Iex.Load.addr); |
| return; |
| |
| case Iex_CCall: |
| for (i = 0; e->Iex.CCall.args[i]; i++) |
| occCount_Expr(env, e->Iex.CCall.args[i]); |
| return; |
| |
| case Iex_GetI: |
| occCount_Expr(env, e->Iex.GetI.ix); |
| return; |
| |
| case Iex_Const: |
| case Iex_Get: |
| return; |
| |
| default: |
| vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); |
| vpanic("occCount_Expr"); |
| } |
| } |
| |
| |
| /* Given env :: IRTemp -> TmpInfo* |
|    Add the use-occurrences of temps in this statement |
|    to the environment. |
| */ |
| static void occCount_Stmt ( TmpInfo** env, IRStmt* st ) |
| { |
| Int i; |
| IRDirty* d; |
| switch (st->tag) { |
| case Ist_AbiHint: |
| occCount_Expr(env, st->Ist.AbiHint.base); |
| return; |
| case Ist_Tmp: |
| occCount_Expr(env, st->Ist.Tmp.data); |
| return; |
| case Ist_Put: |
| occCount_Expr(env, st->Ist.Put.data); |
| return; |
| case Ist_PutI: |
| occCount_Expr(env, st->Ist.PutI.ix); |
| occCount_Expr(env, st->Ist.PutI.data); |
| return; |
| case Ist_Store: |
| occCount_Expr(env, st->Ist.Store.addr); |
| occCount_Expr(env, st->Ist.Store.data); |
| return; |
| case Ist_Dirty: |
| d = st->Ist.Dirty.details; |
| if (d->mFx != Ifx_None) |
| occCount_Expr(env, d->mAddr); |
| occCount_Expr(env, d->guard); |
| for (i = 0; d->args[i]; i++) |
| occCount_Expr(env, d->args[i]); |
| return; |
| case Ist_NoOp: |
| case Ist_IMark: |
| case Ist_MFence: |
| return; |
| case Ist_Exit: |
| occCount_Expr(env, st->Ist.Exit.guard); |
| return; |
| default: |
| vex_printf("\n"); ppIRStmt(st); vex_printf("\n"); |
| vpanic("occCount_Stmt"); |
| } |
| } |
| |
| /* Look up a binding for tmp in the env. If found, return the bound |
| expression, and set the env's binding to NULL so it is marked as |
| used. If not found, return NULL. */ |
| |
| static IRExpr* tbSubst_Temp ( TmpInfo** env, IRTemp tmp ) |
| { |
| TmpInfo* ti; |
| IRExpr* e; |
| ti = env[(Int)tmp]; |
| if (ti){ |
| e = ti->expr; |
| if (e) { |
| ti->expr = NULL; |
| return e; |
| } else { |
| return NULL; |
| } |
| } else { |
| return NULL; |
| } |
| } |
| |
| /* Traverse e, looking for temps. For each observed temp, see if env |
| contains a binding for the temp, and if so return the bound value. |
| The env has the property that any binding it holds is |
| 'single-shot', so once a binding is used, it is marked as no longer |
| available, by setting its .expr field to NULL. */ |
| |
| static IRExpr* tbSubst_Expr ( TmpInfo** env, IRExpr* e ) |
| { |
| IRExpr* e2; |
| IRExpr** args2; |
| Int i; |
| |
| switch (e->tag) { |
| |
| case Iex_CCall: |
| args2 = sopyIRExprVec(e->Iex.CCall.args); |
| for (i = 0; args2[i]; i++) |
| args2[i] = tbSubst_Expr(env,args2[i]); |
| return IRExpr_CCall(e->Iex.CCall.cee, |
| e->Iex.CCall.retty, |
| args2 |
| ); |
| case Iex_Tmp: |
| e2 = tbSubst_Temp(env, e->Iex.Tmp.tmp); |
| return e2 ? e2 : e; |
| case Iex_Mux0X: |
| return IRExpr_Mux0X( |
| tbSubst_Expr(env, e->Iex.Mux0X.cond), |
| tbSubst_Expr(env, e->Iex.Mux0X.expr0), |
| tbSubst_Expr(env, e->Iex.Mux0X.exprX) |
| ); |
| case Iex_Binop: |
| return IRExpr_Binop( |
| e->Iex.Binop.op, |
| tbSubst_Expr(env, e->Iex.Binop.arg1), |
| tbSubst_Expr(env, e->Iex.Binop.arg2) |
| ); |
| case Iex_Unop: |
| return IRExpr_Unop( |
| e->Iex.Unop.op, |
| tbSubst_Expr(env, e->Iex.Unop.arg) |
| ); |
| case Iex_Load: |
| return IRExpr_Load( |
| e->Iex.Load.end, |
| e->Iex.Load.ty, |
| tbSubst_Expr(env, e->Iex.Load.addr) |
| ); |
| case Iex_GetI: |
| return IRExpr_GetI( |
| e->Iex.GetI.descr, |
| tbSubst_Expr(env, e->Iex.GetI.ix), |
| e->Iex.GetI.bias |
| ); |
| case Iex_Const: |
| case Iex_Get: |
| return e; |
| default: |
| vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); |
| vpanic("tbSubst_Expr"); |
| } |
| } |
| |
| /* Same deal as tbSubst_Expr, except for stmts. */ |
| |
| static IRStmt* tbSubst_Stmt ( TmpInfo** env, IRStmt* st ) |
| { |
| Int i; |
| IRDirty* d; |
| IRDirty* d2; |
| switch (st->tag) { |
| case Ist_AbiHint: |
| return IRStmt_AbiHint( |
| tbSubst_Expr(env, st->Ist.AbiHint.base), |
| st->Ist.AbiHint.len |
| ); |
| case Ist_Store: |
| return IRStmt_Store( |
| st->Ist.Store.end, |
| tbSubst_Expr(env, st->Ist.Store.addr), |
| tbSubst_Expr(env, st->Ist.Store.data) |
| ); |
| case Ist_Tmp: |
| return IRStmt_Tmp( |
| st->Ist.Tmp.tmp, |
| tbSubst_Expr(env, st->Ist.Tmp.data) |
| ); |
| case Ist_Put: |
| return IRStmt_Put( |
| st->Ist.Put.offset, |
| tbSubst_Expr(env, st->Ist.Put.data) |
| ); |
| case Ist_PutI: |
| return IRStmt_PutI( |
| st->Ist.PutI.descr, |
| tbSubst_Expr(env, st->Ist.PutI.ix), |
| st->Ist.PutI.bias, |
| tbSubst_Expr(env, st->Ist.PutI.data) |
| ); |
| |
| case Ist_Exit: |
| return IRStmt_Exit( |
| tbSubst_Expr(env, st->Ist.Exit.guard), |
| st->Ist.Exit.jk, |
| st->Ist.Exit.dst |
| ); |
| case Ist_IMark: |
| return IRStmt_IMark(st->Ist.IMark.addr, st->Ist.IMark.len); |
| case Ist_NoOp: |
| return IRStmt_NoOp(); |
| case Ist_MFence: |
| return IRStmt_MFence(); |
| case Ist_Dirty: |
| d = st->Ist.Dirty.details; |
| d2 = emptyIRDirty(); |
| *d2 = *d; |
| if (d2->mFx != Ifx_None) |
| d2->mAddr = tbSubst_Expr(env, d2->mAddr); |
| d2->guard = tbSubst_Expr(env, d2->guard); |
| for (i = 0; d2->args[i]; i++) |
| d2->args[i] = tbSubst_Expr(env, d2->args[i]); |
| return IRStmt_Dirty(d2); |
| default: |
| vex_printf("\n"); ppIRStmt(st); vex_printf("\n"); |
| vpanic("tbSubst_Stmt"); |
| } |
| } |
| |
| |
| /* Traverse an expr, and detect if any part of it reads memory or does |
| a Get. Be careful ... this really controls how much the |
| tree-builder can reorder the code, so getting it right is critical. |
| */ |
| static void setHints_Expr (Bool* doesLoad, Bool* doesGet, IRExpr* e ) |
| { |
| Int i; |
| switch (e->tag) { |
| case Iex_CCall: |
| for (i = 0; e->Iex.CCall.args[i]; i++) |
| setHints_Expr(doesLoad, doesGet, e->Iex.CCall.args[i]); |
| return; |
| case Iex_Mux0X: |
| setHints_Expr(doesLoad, doesGet, e->Iex.Mux0X.cond); |
| setHints_Expr(doesLoad, doesGet, e->Iex.Mux0X.expr0); |
| setHints_Expr(doesLoad, doesGet, e->Iex.Mux0X.exprX); |
| return; |
| case Iex_Binop: |
| setHints_Expr(doesLoad, doesGet, e->Iex.Binop.arg1); |
| setHints_Expr(doesLoad, doesGet, e->Iex.Binop.arg2); |
| return; |
| case Iex_Unop: |
| setHints_Expr(doesLoad, doesGet, e->Iex.Unop.arg); |
| return; |
| case Iex_Load: |
| *doesLoad = True; |
| setHints_Expr(doesLoad, doesGet, e->Iex.Load.addr); |
| return; |
| case Iex_Get: |
| *doesGet = True; |
| return; |
| case Iex_GetI: |
| *doesGet = True; |
| setHints_Expr(doesLoad, doesGet, e->Iex.GetI.ix); |
| return; |
| case Iex_Tmp: |
| case Iex_Const: |
| return; |
| default: |
| vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); |
| vpanic("setHints_Expr"); |
| } |
| } |
| |
| |
| static void dumpInvalidated ( TmpInfo** env, IRBB* bb, /*INOUT*/Int* j ) |
| { |
| Int k, oldest_op, oldest_k; |
| TmpInfo* ti; |
| |
|    /* Dump all the bindings marked as invalidated, in order. */ |
| while (True) { |
| |
| /* find the oldest bind marked 'invalidateMe'. */ |
| oldest_op = 1<<30; |
| oldest_k = 1<<30; |
| for (k = 0; k < bb->tyenv->types_used; k++) { |
| ti = env[k]; |
| if (!ti) |
| continue; |
| if (!ti->expr) |
| continue; |
| if (!ti->invalidateMe) |
| continue; |
| /* vex_printf("FOUND INVAL %d %d\n", ti->origPos, oldest_op); */ |
| if (ti->origPos < oldest_op) { |
| oldest_op = ti->origPos; |
| oldest_k = k; |
| } |
| } |
| |
| /* No more binds to invalidate. */ |
| if (oldest_op == 1<<30) |
| return; |
| |
| /* the oldest bind to invalidate has been identified */ |
|       vassert(oldest_op != 1<<30 && oldest_k != 1<<30); |
| ti = env[oldest_k]; |
| vassert(ti->expr && ti->invalidateMe); |
| |
| /* and invalidate it ... */ |
| bb->stmts[*j] = IRStmt_Tmp( (IRTemp)oldest_k, ti->expr ); |
| /* vex_printf("**1 "); ppIRStmt(bb->stmts[*j]); vex_printf("\n"); */ |
| (*j)++; |
| ti->invalidateMe = False; |
| ti->expr = NULL; /* no longer available for substitution */ |
| |
| } /* loop which dumps the binds marked for invalidation */ |
| } |
| |
| |
| |
| /* notstatic */ void do_treebuild_BB ( IRBB* bb ) |
| { |
| Int i, j, k; |
| Bool invPut, invStore; |
| IRStmt* st; |
| IRStmt* st2; |
| TmpInfo* ti; |
| IRExpr* next2; |
| |
| /* Mapping from IRTemp to TmpInfo*. */ |
| Int n_tmps = bb->tyenv->types_used; |
| TmpInfo** env = LibVEX_Alloc(n_tmps * sizeof(TmpInfo*)); |
| |
| for (i = 0; i < n_tmps; i++) |
| env[i] = NULL; |
| |
| /* Phase 1. Scan forwards in bb, counting use occurrences of each |
| temp. Also count occurrences in the bb->next field. */ |
| |
| for (i = 0; i < bb->stmts_used; i++) { |
| st = bb->stmts[i]; |
| if (st->tag == Ist_NoOp) |
| continue; |
| occCount_Stmt( env, st ); |
| } |
| occCount_Expr(env, bb->next ); |
| |
| # if 0 |
|    for (i = 0; i < n_tmps; i++) { |
|       if (!env[i]) |
|          continue; |
|       ppIRTemp( (IRTemp)i ); |
|       vex_printf("  used %d\n", env[i]->occ ); |
|    } |
| # endif |
| |
| /* Phase 2. Fill in the origPos fields. */ |
| |
| for (i = 0; i < bb->stmts_used; i++) { |
| st = bb->stmts[i]; |
| if (st->tag != Ist_Tmp) |
| continue; |
| |
| ti = env[(Int)st->Ist.Tmp.tmp]; |
| if (!ti) { |
| vex_printf("\n"); |
| ppIRTemp(st->Ist.Tmp.tmp); |
| vex_printf("\n"); |
| vpanic("treebuild_BB (phase 2): unmapped IRTemp"); |
| } |
| ti->origPos = i; |
| } |
| |
| /* Phase 3. Scan forwards in bb. |
| |
| On seeing 't = E', occ(t)==1, |
| let E'=env(E), set t's binding to be E', and |
| delete this stmt. |
| Also examine E' and set the hints for E' appropriately |
| (doesLoad? doesGet?) |
| |
| On seeing any other stmt, |
| let stmt' = env(stmt) |
| remove from env any 't=E' binds invalidated by stmt |
| emit the invalidated stmts |
| emit stmt' |
| |
| Apply env to bb->next. |
| */ |
| |
|    /* The stmts in bb are being reordered, and we are guaranteed to |
|       end up with no more than the number we started with.  Use i as |
|       the cursor for the stmt currently being examined, and j (with |
|       j <= i) as the cursor for the stmt currently being written. |
|    */ |
| j = 0; |
| for (i = 0; i < bb->stmts_used; i++) { |
| st = bb->stmts[i]; |
| if (st->tag == Ist_NoOp) |
| continue; |
| |
| if (st->tag == Ist_Tmp) { |
| /* vex_printf("acquire binding\n"); */ |
| ti = env[st->Ist.Tmp.tmp]; |
| if (!ti) { |
|             vpanic("treebuild_BB (phase 3): unmapped IRTemp"); |
| } |
| if (ti->occ == 1) { |
| /* ok, we have 't = E', occ(t)==1. Do the abovementioned actions. */ |
| IRExpr* e = st->Ist.Tmp.data; |
| IRExpr* e2 = tbSubst_Expr(env, e); |
| ti->expr = e2; |
| ti->eDoesLoad = ti->eDoesGet = False; |
| setHints_Expr(&ti->eDoesLoad, &ti->eDoesGet, e2); |
| /* don't advance j, as we are deleting this stmt and instead |
| holding it temporarily in the env. */ |
| continue; /* the for (i = 0; i < bb->stmts_used; i++) loop */ |
| } |
| } |
| |
| /* we get here for any other kind of statement. */ |
| /* 'use up' any bindings required by the current statement. */ |
| st2 = tbSubst_Stmt(env, st); |
| |
| /* Now, before this stmt, dump any bindings it invalidates. |
| These need to be dumped in the order in which they originally |
| appeared. (Stupid algorithm): first, mark all bindings which |
| need to be dumped. Then, dump them in the order in which |
| they were defined. */ |
| |
| invPut = toBool(st->tag == Ist_Put |
| || st->tag == Ist_PutI |
| || st->tag == Ist_Dirty); |
| |
| invStore = toBool(st->tag == Ist_Store |
| || st->tag == Ist_Dirty); |
| |
| for (k = 0; k < n_tmps; k++) { |
| ti = env[k]; |
| if (!ti) |
| continue; |
| if (!ti->expr) |
| continue; |
| |
| /* Do we have to invalidate this binding? */ |
| |
| ti->invalidateMe |
| = toBool( |
| /* a store invalidates loaded data */ |
| (ti->eDoesLoad && invStore) |
| /* a put invalidates get'd data */ |
| || (ti->eDoesGet && invPut) |
| /* a put invalidates loaded data. Note, we could do |
| much better here in the sense that we only need to |
| invalidate trees containing loads if the Put in |
| question is marked as requiring precise |
| exceptions. */ |
| || (ti->eDoesLoad && invPut) |
| /* probably overly conservative: a memory fence |
| invalidates absolutely everything, so that all |
| computation prior to it is forced to complete before |
| proceeding with the fence. */ |
| || st->tag == Ist_MFence |
| /* also be (probably overly) paranoid re AbiHints */ |
| || st->tag == Ist_AbiHint |
| ); |
| /* |
| if (ti->invalidateMe) |
| vex_printf("SET INVAL\n"); |
| */ |
| } |
| |
| dumpInvalidated ( env, bb, &j ); |
| |
| /* finally, emit the substituted statement */ |
| bb->stmts[j] = st2; |
| /* vex_printf("**2 "); ppIRStmt(bb->stmts[j]); vex_printf("\n"); */ |
| j++; |
| |
| vassert(j <= i+1); |
| } /* for each stmt in the original bb ... */ |
| |
| /* Finally ... substitute the ->next field as much as possible, and |
| dump any left-over bindings. Hmm. Perhaps there should be no |
| left over bindings? Or any left-over bindings are |
| by definition dead? */ |
| next2 = tbSubst_Expr(env, bb->next); |
| bb->next = next2; |
| bb->stmts_used = j; |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- iropt main ---*/ |
| /*---------------------------------------------------------------*/ |
| |
| static Bool iropt_verbose = False; //True; |
| |
| |
| /* Do a simple cleanup pass on bb. This is: redundant Get removal, |
| redundant Put removal, constant propagation, dead code removal, |
| clean helper specialisation, and dead code removal (again). |
| */ |
| |
| |
| static |
| IRBB* cheap_transformations ( |
| IRBB* bb, |
| IRExpr* (*specHelper) (HChar*, IRExpr**), |
| Bool (*preciseMemExnsFn)(Int,Int) |
| ) |
| { |
| redundant_get_removal_BB ( bb ); |
| if (iropt_verbose) { |
| vex_printf("\n========= REDUNDANT GET\n\n" ); |
| ppIRBB(bb); |
| } |
| |
| redundant_put_removal_BB ( bb, preciseMemExnsFn ); |
| if (iropt_verbose) { |
| vex_printf("\n========= REDUNDANT PUT\n\n" ); |
| ppIRBB(bb); |
| } |
| |
| bb = cprop_BB ( bb ); |
| if (iropt_verbose) { |
| vex_printf("\n========= CPROPD\n\n" ); |
| ppIRBB(bb); |
| } |
| |
| do_deadcode_BB ( bb ); |
| if (iropt_verbose) { |
| vex_printf("\n========= DEAD\n\n" ); |
| ppIRBB(bb); |
| } |
| |
| bb = spec_helpers_BB ( bb, specHelper ); |
| do_deadcode_BB ( bb ); |
| if (iropt_verbose) { |
| vex_printf("\n========= SPECd \n\n" ); |
| ppIRBB(bb); |
| } |
| |
| return bb; |
| } |
| |
| |
| /* Do some more expensive transformations on bb, which are aimed at |
| optimising as much as possible in the presence of GetI and PutI. */ |
| |
| static |
| IRBB* expensive_transformations( IRBB* bb ) |
| { |
| do_cse_BB( bb ); |
| collapse_AddSub_chains_BB( bb ); |
| do_redundant_GetI_elimination( bb ); |
| do_redundant_PutI_elimination( bb ); |
| do_deadcode_BB( bb ); |
| return bb; |
| } |
| |
| |
| /* Scan a flattened BB to see if it has any GetI or PutIs in it. Used |
| as a heuristic hack to see if iropt needs to do expensive |
| optimisations (CSE, PutI -> GetI forwarding, redundant PutI |
| elimination) to improve code containing GetI or PutI. */ |
| |
| static Bool hasGetIorPutI ( IRBB* bb ) |
| { |
| Int i, j; |
| IRStmt* st; |
| IRDirty* d; |
| |
| for (i = 0; i < bb->stmts_used; i++) { |
| st = bb->stmts[i]; |
| switch (st->tag) { |
| case Ist_AbiHint: |
| vassert(isIRAtom(st->Ist.AbiHint.base)); |
| break; |
| case Ist_PutI: |
| return True; |
| case Ist_Tmp: |
| if (st->Ist.Tmp.data->tag == Iex_GetI) |
| return True; |
| break; |
| case Ist_Put: |
| vassert(isIRAtom(st->Ist.Put.data)); |
| break; |
| case Ist_Store: |
| vassert(isIRAtom(st->Ist.Store.addr)); |
| vassert(isIRAtom(st->Ist.Store.data)); |
| break; |
| case Ist_Dirty: |
| d = st->Ist.Dirty.details; |
| vassert(isIRAtom(d->guard)); |
| for (j = 0; d->args[j]; j++) |
| vassert(isIRAtom(d->args[j])); |
| if (d->mFx != Ifx_None) |
| vassert(isIRAtom(d->mAddr)); |
| break; |
| case Ist_NoOp: |
| case Ist_IMark: |
| case Ist_MFence: |
| break; |
| case Ist_Exit: |
| vassert(isIRAtom(st->Ist.Exit.guard)); |
| break; |
| default: |
| ppIRStmt(st); |
| vpanic("hasGetIorPutI"); |
| } |
| |
| } |
| return False; |
| |
| } |
| |
| |
| /* ---------------- The main iropt entry point. ---------------- */ |
| |
| /* exported from this file */ |
| /* Rules of the game: |
| |
| - IRExpr/IRStmt trees should be treated as immutable, as they |
| may get shared. So never change a field of such a tree node; |
| instead construct and return a new one if needed. |
| */ |
| |
| |
| IRBB* do_iropt_BB ( IRBB* bb0, |
| IRExpr* (*specHelper) (HChar*, IRExpr**), |
| Bool (*preciseMemExnsFn)(Int,Int), |
| Addr64 guest_addr ) |
| { |
| static Int n_total = 0; |
| static Int n_expensive = 0; |
| |
| Bool do_expensive; |
| IRBB *bb, *bb2; |
| |
| n_total++; |
| |
| /* First flatten the block out, since all other |
| phases assume flat code. */ |
| |
| bb = flatten_BB ( bb0 ); |
| |
| if (iropt_verbose) { |
| vex_printf("\n========= FLAT\n\n" ); |
| ppIRBB(bb); |
| } |
| |
| /* If at level 0, stop now. */ |
| if (vex_control.iropt_level <= 0) return bb; |
| |
| /* Now do a preliminary cleanup pass, and figure out if we also |
| need to do 'expensive' optimisations. Expensive optimisations |
| are deemed necessary if the block contains any GetIs or PutIs. |
| If needed, do expensive transformations and then another cheap |
| cleanup pass. */ |
| |
| bb = cheap_transformations( bb, specHelper, preciseMemExnsFn ); |
| |
| if (vex_control.iropt_level > 1) { |
| do_expensive = hasGetIorPutI(bb); |
| if (do_expensive) { |
| n_expensive++; |
| if (DEBUG_IROPT) |
| vex_printf("***** EXPENSIVE %d %d\n", n_total, n_expensive); |
| bb = expensive_transformations( bb ); |
| bb = cheap_transformations( bb, specHelper, preciseMemExnsFn ); |
| } |
| |
| /* Now have a go at unrolling simple (single-BB) loops. If |
| successful, clean up the results as much as possible. */ |
| |
| bb2 = maybe_loop_unroll_BB( bb, guest_addr ); |
| if (bb2) { |
| bb = cheap_transformations( bb2, specHelper, preciseMemExnsFn ); |
| if (do_expensive) { |
| bb = expensive_transformations( bb ); |
| bb = cheap_transformations( bb, specHelper, preciseMemExnsFn ); |
| } else { |
| /* at least do CSE and dead code removal */ |
| do_cse_BB( bb ); |
| do_deadcode_BB( bb ); |
| } |
| if (0) vex_printf("vex iropt: unrolled a loop\n"); |
| } |
| |
| } |
| |
| return bb; |
| } |
| |
| |
| /*---------------------------------------------------------------*/ |
| /*--- end ir/iropt.c ---*/ |
| /*---------------------------------------------------------------*/ |