coregrind/vg_translate.c - platform/external/valgrind - Gitiles


 /*--------------------------------------------------------------------*/
 /*--- The JITter proper: register allocation & code improvement    ---*/
 /*---                                               vg_translate.c ---*/
 /*--------------------------------------------------------------------*/

 /*
    This file is part of Valgrind, an extensible x86 protected-mode
    emulator for monitoring program execution on x86-Unixes.

    Copyright (C) 2000-2004 Julian Seward
       jseward@acm.org

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307, USA.

    The GNU General Public License is contained in the file COPYING.
 */

 #include "vg_include.h"

 /*------------------------------------------------------------*/
 /*--- Renamings of frequently-used global functions.       ---*/
 /*------------------------------------------------------------*/

 #define dis       VG_(print_codegen)


 /*------------------------------------------------------------*/
 /*--- Basics                                               ---*/
 /*------------------------------------------------------------*/

 /* This one is called by the core */
 UCodeBlock* VG_(alloc_UCodeBlock) ( void )
 {
    UCodeBlock* cb = VG_(arena_malloc)(VG_AR_CORE, sizeof(UCodeBlock));
    cb->used = cb->size = cb->nextTemp = 0;
    cb->instrs = NULL;
    return cb;
 }

 /* This one is called by skins */
 UCodeBlock* VG_(setup_UCodeBlock) ( UCodeBlock* cb_in )
 {
    UCodeBlock* cb = VG_(arena_malloc)(VG_AR_CORE, sizeof(UCodeBlock));
    cb->orig_eip = cb_in->orig_eip;
    cb->used = cb->size = 0;
    cb->nextTemp = cb_in->nextTemp;
    cb->instrs = NULL;
    return cb;
 }

 void VG_(free_UCodeBlock) ( UCodeBlock* cb )
 {
    if (cb->instrs) VG_(arena_free)(VG_AR_CORE, cb->instrs);
    VG_(arena_free)(VG_AR_CORE, cb);
 }


 /* Ensure there's enough space in a block to add one uinstr. */
 static
 void ensureUInstr ( UCodeBlock* cb )
 {
    if (cb->used == cb->size) {
       if (cb->instrs == NULL) {
          vg_assert(cb->size == 0);
          vg_assert(cb->used == 0);
          cb->size = 8;
          cb->instrs = VG_(arena_malloc)(VG_AR_CORE, 8 * sizeof(UInstr));
       } else {
          Int i;
          UInstr* instrs2 = VG_(arena_malloc)(VG_AR_CORE,
                                        2 * sizeof(UInstr) * cb->size);
          for (i = 0; i < cb->used; i++)
             instrs2[i] = cb->instrs[i];
          cb->size *= 2;
          VG_(arena_free)(VG_AR_CORE, cb->instrs);
          cb->instrs = instrs2;
       }
    }

    vg_assert(cb->used < cb->size);
 }


 __inline__
 void VG_(new_NOP) ( UInstr* u )
 {
    u->val1 = u->val2 = u->val3 = 0;
    u->tag1 = u->tag2 = u->tag3 = NoValue;
    u->flags_r = u->flags_w = FlagsEmpty;
    u->jmpkind = JmpBoring;
    u->signed_widen = u->has_ret_val = False;
    u->regs_live_after = ALL_RREGS_LIVE;
    u->lit32    = 0;
    u->opcode   = NOP;
    u->size     = 0;
    u->cond     = 0;
    u->extra4b  = 0;
    u->argc = u->regparms_n = 0;
 }


 /* Add an instruction to a ucode block, and return the index of the
    instruction. */
 __inline__
 void VG_(new_UInstr3) ( UCodeBlock* cb, Opcode opcode, Int sz,
                        Tag tag1, UInt val1,
                        Tag tag2, UInt val2,
                        Tag tag3, UInt val3 )
 {
    UInstr* ui;
    ensureUInstr(cb);
    ui = & cb->instrs[cb->used];
    cb->used++;
    VG_(new_NOP)(ui);
    ui->val1   = val1;
    ui->val2   = val2;
    ui->val3   = val3;
    ui->opcode = opcode;
    ui->tag1   = tag1;
    ui->tag2   = tag2;
    ui->tag3   = tag3;
    ui->size   = sz;
    if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
    if (tag2 == TempReg) vg_assert(val2 != INVALID_TEMPREG);
    if (tag3 == TempReg) vg_assert(val3 != INVALID_TEMPREG);
 }


 __inline__
 void VG_(new_UInstr2) ( UCodeBlock* cb, Opcode opcode, Int sz,
                        Tag tag1, UInt val1,
                        Tag tag2, UInt val2 )
 {
    UInstr* ui;
    ensureUInstr(cb);
    ui = & cb->instrs[cb->used];
    cb->used++;
    VG_(new_NOP)(ui);
    ui->val1   = val1;
    ui->val2   = val2;
    ui->opcode = opcode;
    ui->tag1   = tag1;
    ui->tag2   = tag2;
    ui->size   = sz;
    if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
    if (tag2 == TempReg) vg_assert(val2 != INVALID_TEMPREG);
 }


 __inline__
 void VG_(new_UInstr1) ( UCodeBlock* cb, Opcode opcode, Int sz,
                        Tag tag1, UInt val1 )
 {
    UInstr* ui;
    ensureUInstr(cb);
    ui = & cb->instrs[cb->used];
    cb->used++;
    VG_(new_NOP)(ui);
    ui->val1   = val1;
    ui->opcode = opcode;
    ui->tag1   = tag1;
    ui->size   = sz;
    if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
 }


 __inline__
 void VG_(new_UInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz )
 {
    UInstr* ui;
    ensureUInstr(cb);
    ui = & cb->instrs[cb->used];
    cb->used++;
    VG_(new_NOP)(ui);
    ui->opcode = opcode;
    ui->size   = sz;
 }

 /* Copy an instruction into the given codeblock. */
 __inline__
 void VG_(copy_UInstr) ( UCodeBlock* cb, UInstr* instr )
 {
    ensureUInstr(cb);
    cb->instrs[cb->used] = *instr;
    cb->used++;
 }

 /* Copy auxiliary info from one uinstr to another. */
 static __inline__
 void copyAuxInfoFromTo ( UInstr* src, UInstr* dst )
 {
    dst->cond            = src->cond;
    dst->extra4b         = src->extra4b;
    dst->signed_widen    = src->signed_widen;
    dst->jmpkind         = src->jmpkind;
    dst->flags_r         = src->flags_r;
    dst->flags_w         = src->flags_w;
    dst->argc            = src->argc;
    dst->regparms_n      = src->regparms_n;
    dst->has_ret_val     = src->has_ret_val;
    dst->regs_live_after = src->regs_live_after;
 }


 /* Set the lit32 field of the most recent uinsn. */
 void VG_(set_lit_field) ( UCodeBlock* cb, UInt lit32 )
 {
    LAST_UINSTR(cb).lit32 = lit32;
 }


 /* Set the C call info fields of the most recent uinsn. */
 void  VG_(set_ccall_fields) ( UCodeBlock* cb, Addr fn, UChar argc, UChar
                               regparms_n, Bool has_ret_val )
 {
    vg_assert(argc       <  4);
    vg_assert(regparms_n <= argc);
    LAST_UINSTR(cb).lit32       = fn;
    LAST_UINSTR(cb).argc        = argc;
    LAST_UINSTR(cb).regparms_n  = regparms_n;
    LAST_UINSTR(cb).has_ret_val = has_ret_val;
 }

 /* For the last uinsn inserted into cb, set the read, written and
    undefined flags.  Undefined flags are counted as written, but it
    seems worthwhile to distinguish them.
 */
 __inline__
 void VG_(set_flag_fields) ( UCodeBlock* cb,
                              FlagSet rr, FlagSet ww, FlagSet uu )
 {
    FlagSet uw = VG_UNION_FLAG_SETS(ww,uu);

    vg_assert(rr == (rr & FlagsALL));
    vg_assert(uw == (uw & FlagsALL));
    LAST_UINSTR(cb).flags_r = rr;
    LAST_UINSTR(cb).flags_w = uw;
 }


 Bool VG_(any_flag_use) ( UInstr* u )
 {
    return (u->flags_r != FlagsEmpty
            || u->flags_w != FlagsEmpty);
 }

 #if 1
 #  define BEST_ALLOC_ORDER
 #endif

 /* Convert a rank in the range 0 .. VG_MAX_REALREGS-1 into an Intel
    register number.  This effectively defines the order in which real
    registers are allocated.  %ebp is excluded since it is permanently
    reserved for pointing at VG_(baseBlock).

    Important!  This function must correspond with the value of
    VG_MAX_REALREGS (actually, VG_MAX_REALREGS can be reduced without
    a problem, except the generated code will obviously be worse).
 */
 __inline__
 Int VG_(rank_to_realreg) ( Int rank )
 {
    switch (rank) {
 #     ifdef BEST_ALLOC_ORDER
       /* Probably the best allocation ordering. */
       case 0: return R_EAX;
       case 1: return R_EBX;
       case 2: return R_ECX;
       case 3: return R_EDX;
       case 4: return R_ESI;
       case 5: return R_EDI;
 #     else
       /* Contrary; probably the worst.  Helpful for debugging, tho. */
       case 5: return R_EAX;
       case 4: return R_EBX;
       case 3: return R_ECX;
       case 2: return R_EDX;
       case 1: return R_ESI;
       case 0: return R_EDI;
 #     endif
       default: VG_(core_panic)("VG_(rank_to_realreg)");
    }
 }

 /* Convert an Intel register number into a rank in the range 0 ..
    VG_MAX_REALREGS-1.  See related comments for rank_to_realreg()
    above.  */
 __inline__
 Int VG_(realreg_to_rank) ( Int realReg )
 {
    switch (realReg) {
 #     ifdef BEST_ALLOC_ORDER
       case R_EAX: return 0;
       case R_EBX: return 1;
       case R_ECX: return 2;
       case R_EDX: return 3;
       case R_ESI: return 4;
       case R_EDI: return 5;
 #     else
       case R_EAX: return 5;
       case R_EBX: return 4;
       case R_ECX: return 3;
       case R_EDX: return 2;
       case R_ESI: return 1;
       case R_EDI: return 0;
 #     endif
       default: VG_(core_panic)("VG_(realreg_to_rank)");
    }
 }


 /*------------------------------------------------------------*/
 /*--- Sanity checking uinstrs.                             ---*/
 /*------------------------------------------------------------*/

 /* This seems as good a place as any to record some important stuff
    about ucode semantics.

    * TempRegs are 32 bits wide.  LOADs of 8/16 bit values into a
      TempReg are defined to zero-extend the loaded value to 32 bits.
      This is needed to make the translation of movzbl et al work
      properly.

    * Similarly, GETs of a 8/16 bit ArchRegs are zero-extended.

    * Arithmetic on TempRegs is at the specified size.  For example,
      SUBW t1, t2 has to result in a real 16 bit x86 subtraction
      being emitted -- not a 32 bit one.

    * On some insns we allow the cc bit to be set.  If so, the
      intention is that the simulated machine's %eflags register
      is copied into that of the real machine before the insn,
      and copied back again afterwards.  This means that the
      code generated for that insn must be very careful only to
      update %eflags in the intended way.  This is particularly
      important for the routines referenced by CALL insns.
 */

 /* Meaning of operand kinds is as follows:

    ArchReg  is a register of the simulated CPU, stored in memory,
             in vg_m_state.m_eax .. m_edi.  These values are stored
             using the Intel register encoding.

    RealReg  is a register of the real CPU.  There are VG_MAX_REALREGS
             available for allocation.  As with ArchRegs, these values
             are stored using the Intel register encoding.

    TempReg  is a temporary register used to express the results of
             disassembly.  There is an unlimited supply of them --
             register allocation and spilling eventually assigns them
             to RealRegs.

    SpillNo  is a spill slot number.  The number of required spill
             slots is VG_MAX_PSEUDOS, in general.  Only allowed
             as the ArchReg operand of GET and PUT.

    Lit16    is a signed 16-bit literal value.

    Literal  is a 32-bit literal value.  Each uinstr can only hold
             one of these.

    The disassembled code is expressed purely in terms of ArchReg,
    TempReg and Literal operands.  Eventually, register allocation
    removes all the TempRegs, giving a result using ArchRegs, RealRegs,
    and Literals.  New x86 code can easily be synthesised from this.
    There are carefully designed restrictions on which insns can have
    which operands, intended to make it possible to generate x86 code
    from the result of register allocation on the ucode efficiently and
    without need of any further RealRegs.

    Restrictions for the individual UInstrs are clear from the checks below.
    Abbreviations: A=ArchReg   S=SpillNo   T=TempReg   L=Literal
                   Ls=Lit16    R=RealReg   N=NoValue
                   As=ArchRegS

    Before register allocation, S operands should not appear anywhere.
    After register allocation, all T operands should have been
    converted into Rs, and S operands are allowed in GET and PUT --
    denoting spill saves/restores.

    Before liveness analysis, save_e[acd]x fields should all be True.
    Afterwards, they may be False.

    The size field should be 0 for insns for which it is meaningless,
    ie those which do not directly move/operate on data.
 */
 Bool VG_(saneUInstr) ( Bool beforeRA, Bool beforeLiveness, UInstr* u )
 {
 #  define LIT0 (u->lit32 == 0)
 #  define LIT8 (((u->lit32) & 0xFFFFFF00) == 0)
 #  define LIT1 (!(LIT0))
 #  define LITm (u->tag1 == Literal ? True : LIT0 )
 #  define SZ16 (u->size == 16)
 #  define SZ8  (u->size == 8)
 #  define SZ4  (u->size == 4)
 #  define SZ2  (u->size == 2)
 #  define SZ1  (u->size == 1)
 #  define SZ0  (u->size == 0)
 #  define SZ42 (u->size == 4 || u->size == 2)
 #  define SZ48 (u->size == 4 || u->size == 8)
 #  define SZ416 (u->size == 4 || u->size == 16)
 #  define SZ816 (u->size == 8 || u->size == 16)
 #  define SZsse2 (u->size == 4 || u->size == 8 || u->size == 16 || u->size == 512)
 #  define SZsse3 (u->size == 4 || u->size == 8 || u->size == 16)
 #  define SZi  (u->size == 4 || u->size == 2 || u->size == 1)
 #  define SZf  (  u->size ==  4 || u->size ==  8 || u->size ==   2     \
                || u->size == 10 || u->size == 28 || u->size == 108)
 #  define SZ4m ((u->tag1 == TempReg || u->tag1 == RealReg) \
                       ? (u->size == 4) : True)

 /* For these ones, two cases:
  *
  * 1. They are transliterations of the corresponding x86 instruction, in
  *    which case they should have its flags (except that redundant write
  *    flags can be annulled by the optimisation pass).
  *
  * 2. They are being used generally for other purposes, eg. helping with a
  *    'rep'-prefixed instruction, in which case should have empty flags .
  */
 #  define emptyR (u->flags_r == FlagsEmpty)
 #  define emptyW (u->flags_w == FlagsEmpty)
 #  define CC0 (emptyR && emptyW)
 #  define CCr (u->flags_r == FlagsALL && emptyW)
 #  define CCw (emptyR &&  u->flags_w == FlagsALL)
 #  define CCa (emptyR && (u->flags_w == FlagsOSZACP  || emptyW))
 #  define CCc (emptyR && (u->flags_w == FlagsOC      || emptyW))
 #  define CCe (emptyR && (u->flags_w == FlagsOSZAP   || emptyW))
 #  define CCb ((u->flags_r==FlagC       || emptyR) && \
                (u->flags_w==FlagsOSZACP || emptyW))
 #  define CCd ((u->flags_r==FlagC   || emptyR) && \
                (u->flags_w==FlagsOC || emptyW))
 #  define CCf (CC0 || (emptyR && u->flags_w==FlagsZCP) \
                    || (u->flags_r==FlagsZCP && emptyW))
 #  define CCg ((u->flags_r==FlagsOSZACP || emptyR) && emptyW)
 #  define CCj (u->cond==CondAlways ? CC0 : CCg)

 #  define TR1 (beforeRA ? (u->tag1 == TempReg) : (u->tag1 == RealReg))
 #  define TR2 (beforeRA ? (u->tag2 == TempReg) : (u->tag2 == RealReg))
 #  define TR3 (beforeRA ? (u->tag3 == TempReg) : (u->tag3 == RealReg))
 #  define A1  (u->tag1 == ArchReg)
 #  define A2  (u->tag2 == ArchReg)
 #  define AS1 ((u->tag1 == ArchReg) || ((!beforeRA && (u->tag1 == SpillNo))))
 #  define AS2 ((u->tag2 == ArchReg) || ((!beforeRA && (u->tag2 == SpillNo))))
 #  define AS3 ((u->tag3 == ArchReg) || ((!beforeRA && (u->tag3 == SpillNo))))
 #  define L1  (u->tag1 == Literal && u->val1 == 0)
 #  define L2  (u->tag2 == Literal && u->val2 == 0)
 #  define Ls1 (u->tag1 == Lit16)
 #  define Ls2 (u->tag2 == Lit16)
 #  define Ls3 (u->tag3 == Lit16)
 #  define TRL1 (TR1 || L1)
 #  define TRAL1 (TR1 || A1 || L1)
 #  define TRA1 (TR1 || A1)
 #  define TRA2 (TR2 || A2)
 #  define N1  (u->tag1 == NoValue)
 #  define N2  (u->tag2 == NoValue)
 #  define N3  (u->tag3 == NoValue)
 #  define Se1 (u->tag1 == ArchRegS)
 #  define Se2 (u->tag2 == ArchRegS)

 #  define COND0    (u->cond         == 0)
 #  define EXTRA4b0 (u->extra4b      == 0)
 #  define SG_WD0   (u->signed_widen == 0)
 #  define JMPKIND0 (u->jmpkind      == 0)
 #  define CCALL0   (u->argc==0 && u->regparms_n==0 && u->has_ret_val==0 && \
                     ( beforeLiveness                                       \
                     ? u->regs_live_after == ALL_RREGS_LIVE                 \
                     : True ))

 #  define XCONDi   (         EXTRA4b0 && SG_WD0 && JMPKIND0 && CCALL0)
 #  define Xextra4b (COND0             && SG_WD0 && JMPKIND0 && CCALL0)
 #  define XWIDEN   (COND0                       && JMPKIND0 && CCALL0)
 #  define XJMP     (                     SG_WD0             && CCALL0)
 #  define XCCALL   (COND0 && EXTRA4b0 && SG_WD0 && JMPKIND0          )
 #  define XOTHER   (COND0 && EXTRA4b0 && SG_WD0 && JMPKIND0 && CCALL0)

    /* 0 or 1 Literal args per UInstr */
    Int n_lits = 0;
    if (u->tag1 == Literal) n_lits++;
    if (u->tag2 == Literal) n_lits++;
    if (u->tag3 == Literal) n_lits++;
    if (n_lits > 1)
       return False;

    /* Fields not checked: val1, val2, val3 */

    switch (u->opcode) {

    /* Fields checked: lit32   size  flags_r/w tag1   tag2   tag3    (rest) */
    case PUTSEG: return LIT0 && SZ2  && CC0 &&  TR1 && Se2 &&  N3 && XOTHER;
    case GETSEG: return LIT0 && SZ2  && CC0 &&  Se1 && TR2 &&  N3 && XOTHER;
    case USESEG: return LIT0 && SZ0  && CC0 &&  TR1 && TR2 &&  N3 && XOTHER;
    case NOP:    return LIT0 && SZ0  && CC0 &&   N1 &&  N2 &&  N3 && XOTHER;
    case LOCK:   return LIT0 && SZ0  && CC0 &&   N1 &&  N2 &&  N3 && XOTHER;
    case GETF:   return LIT0 && SZ42 && CCr &&  TR1 &&  N2 &&  N3 && XOTHER;
    case PUTF:   return LIT0 && SZ42 && CCw &&  TR1 &&  N2 &&  N3 && XOTHER;
    case GET:    return LIT0 && SZi  && CC0 &&  AS1 && TR2 &&  N3 && XOTHER;
    case PUT:    return LIT0 && SZi  && CC0 &&  TR1 && AS2 &&  N3 && XOTHER;
    case LOAD:
    case STORE:  return LIT0 && SZi  && CC0 &&  TR1 && TR2 &&  N3 && XOTHER;
    case MOV:    return LITm && SZ4m && CC0 && TRL1 && TR2 &&  N3 && XOTHER;
    case CMOV:   return LIT0 && SZ4  && CCg &&  TR1 && TR2 &&  N3 && XCONDi;
    case WIDEN:  return LIT0 && SZ42 && CC0 &&  TR1 &&  N2 &&  N3 && XWIDEN;
    case JMP:    return LITm && SZ0  && CCj && TRL1 &&  N2 &&  N3 && XJMP;
    case CALLM:  return LIT0 && SZ0 /*any*/ &&  Ls1 &&  N2 &&  N3 && XOTHER;
    case CALLM_S:
    case CALLM_E:return LIT0 && SZ0  && CC0 &&   N1 &&  N2 &&  N3 && XOTHER;
    case PUSH:
    case POP:    return LIT0 && SZi  && CC0 &&  TR1 &&  N2 &&  N3 && XOTHER;
    case CLEAR:  return LIT0 && SZ0  && CC0 &&  Ls1 &&  N2 &&  N3 && XOTHER;
    case AND:
    case OR:     return LIT0 && SZi  && CCa &&  TR1 && TR2 &&  N3 && XOTHER;
    case MUL:    return LIT0 && SZ42 && CCa && TRA1 &&TRA2 &&  N3 && XOTHER;
    case ADD:
    case XOR:
    case SUB:    return LITm && SZi  && CCa &&TRAL1 && TR2 &&  N3 && XOTHER;
    case SBB:
    case ADC:    return LITm && SZi  && CCb &&TRAL1 && TR2 &&  N3 && XOTHER;
    case SHL:
    case SHR:
    case SAR:    return LITm && SZi  && CCa && TRL1 && TR2 &&  N3 && XOTHER;
    case ROL:
    case ROR:    return LITm && SZi  && CCc && TRL1 && TR2 &&  N3 && XOTHER;
    case RCL:
    case RCR:    return LITm && SZi  && CCd && TRL1 && TR2 &&  N3 && XOTHER;
    case NOT:    return LIT0 && SZi  && CC0 &&  TR1 &&  N2 &&  N3 && XOTHER;
    case NEG:    return LIT0 && SZi  && CCa &&  TR1 &&  N2 &&  N3 && XOTHER;
    case INC:
    case DEC:    return LIT0 && SZi  && CCe &&  TR1 &&  N2 &&  N3 && XOTHER;
    case CC2VAL: return LIT0 && SZ1  && CCg &&  TR1 &&  N2 &&  N3 && XCONDi;
    case BSWAP:  return LIT0 && SZ4  && CC0 &&  TR1 &&  N2 &&  N3 && XOTHER;
    case JIFZ:   return LIT1 && SZ4  && CC0 &&  TR1 &&  L2 &&  N3 && XOTHER;
    case FPU_R:
    case FPU_W:  return LIT0 && SZf  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
    case FPU:    return LIT0 && SZ0  && CCf &&  Ls1 &&  N2 &&  N3 && XOTHER;
    case LEA1:   return /*any*/ SZ4  && CC0 &&  TR1 && TR2 &&  N3 && XOTHER;
    case LEA2:   return /*any*/ SZ4  && CC0 &&  TR1 && TR2 && TR3 && Xextra4b;
    case INCEIP: return LIT0 && SZ0  && CC0 &&  Ls1 &&  N2 &&  N3 && XOTHER;
    case CCALL:  return LIT1 && SZ0  && CC0 &&
                        (u->argc > 0                   ? TR1 : N1) &&
                        (u->argc > 1                   ? TR2 : N2) &&
                        (u->argc > 2 || u->has_ret_val ? TR3 : N3) &&
                        u->regparms_n <= u->argc && XCCALL;
    /* Fields checked:       lit32   size  flags_r/w tag1   tag2   tag3    (rest) */
    case MMX1:
    case MMX2:         return LIT0 && SZ0  && CC0 &&  Ls1 &&  N2 &&  N3 && XOTHER;
    case MMX3:         return LIT0 && SZ0  && CC0 &&  Ls1 && Ls2 &&  N3 && XOTHER;
    case MMX2_MemRd:   return LIT0 && SZ48 && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
    case MMX2_MemWr:   return LIT0 && SZ48 && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
    case MMX2a1_MemRd: return LIT0 && SZ8  && CC0 &&  Ls1 && Ls2 && TR3 && XOTHER;
    case MMX2_ERegRd:  return LIT0 && SZ4  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;
    case MMX2_ERegWr:  return LIT0 && SZ4  && CC0 &&  Ls1 && TR2 &&  N3 && XOTHER;

    /* Fields checked:        lit32   size  flags_r/w tag1   tag2   tag3    (rest) */
    case SSE2a_MemWr:  return LIT0 && SZsse2 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2a_MemRd:  return LIT0 && SZsse2 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2a1_MemRd: return LIT0 && SZsse3 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2g_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2g1_RegWr: return LIT8 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE2e1_RegRd: return LIT8 && SZ2    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3a_MemWr:  return LIT0 && SZsse3 && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3a_MemRd:  return LIT0 && SZsse3 && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e_RegRd:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3a1_MemRd: return LIT8 && SZ816  && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3g_RegWr:  return LIT0 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3g1_RegWr: return LIT8 && SZ4    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e1_RegRd: return LIT8 && SZ2    && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3:         return LIT0 && SZ0    && CCa  && Ls1 && Ls2 && N3  && XOTHER;
    case SSE4:         return LIT0 && SZ0    && CCa  && Ls1 && Ls2 && N3  && XOTHER;
    case SSE5:         return LIT0 && SZ0    && CC0  && Ls1 && Ls2 && Ls3 && XOTHER;
    case SSE3ag_MemRd_RegWr:
                       return         SZ48   && CC0  && TR1 && TR2 && N3  && XOTHER;
    default:
       if (VG_(needs).extended_UCode)
          return SK_(sane_XUInstr)(beforeRA, beforeLiveness, u);
       else {
          VG_(printf)("unhandled opcode: %u.  Perhaps "
                      "VG_(needs).extended_UCode should be set?",
                      u->opcode);
          VG_(core_panic)("VG_(saneUInstr): unhandled opcode");
       }
    }
 #  undef LIT0
 #  undef LIT1
 #  undef LIT8
 #  undef LITm
 #  undef SZ16
 #  undef SZ8
 #  undef SZ4
 #  undef SZ2
 #  undef SZ1
 #  undef SZ0
 #  undef SZ42
 #  undef SZ48
 #  undef SZ416
 #  undef SZsse2
 #  undef SZsse3
 #  undef SZi
 #  undef SZf
 #  undef SZ4m
 #  undef emptyR
 #  undef emptyW
 #  undef CC0
 #  undef CCr
 #  undef CCw
 #  undef CCa
 #  undef CCb
 #  undef CCc
 #  undef CCd
 #  undef CCe
 #  undef CCf
 #  undef CCg
 #  undef CCj
 #  undef TR1
 #  undef TR2
 #  undef TR3
 #  undef A1
 #  undef A2
 #  undef AS1
 #  undef AS2
 #  undef AS3
 #  undef L1
 #  undef L2
 #  undef Ls1
 #  undef Ls2
 #  undef Ls3
 #  undef TRL1
 #  undef TRAL1
 #  undef N1
 #  undef N2
 #  undef N3
 #  undef Se2
 #  undef Se1
 #  undef COND0
 #  undef EXTRA4b0
 #  undef SG_WD0
 #  undef JMPKIND0
 #  undef CCALL0
 #  undef Xextra4b
 #  undef XWIDEN
 #  undef XJMP
 #  undef XCCALL
 #  undef XOTHER
 }

 void VG_(saneUCodeBlock) ( UCodeBlock* cb )
 {
    Int i;

    for (i = 0; i < cb->used; i++) {
       Bool sane = VG_(saneUInstr)(True, True, &cb->instrs[i]);
       if (!sane) {
          VG_(printf)("Instruction failed sanity check:\n");
          VG_(up_UInstr)(i, &cb->instrs[i]);
       }
       vg_assert(sane);
    }
 }

 /* Sanity checks to do with CALLMs in UCodeBlocks. */
 Bool VG_(saneUCodeBlockCalls) ( UCodeBlock* cb )
 {
    Int  callm = 0;
    Int  callm_s = 0;
    Int  callm_e = 0;
    Int  callm_ptr, calls_ptr;
    Int  i, j, t;
    Bool incall = False;

    /* Ensure the number of CALLM, CALLM_S and CALLM_E are the same. */

    for (i = 0; i < cb->used; i++) {
       switch (cb->instrs[i].opcode) {
          case CALLM:
             if (!incall) return False;
             callm++;
             break;
          case CALLM_S:
             if (incall) return False;
             incall = True;
             callm_s++;
             break;
          case CALLM_E:
             if (!incall) return False;
             incall = False;
             callm_e++;
             break;
          case PUSH: case POP: case CLEAR:
             if (!incall) return False;
             break;
          default:
             break;
       }
    }
    if (incall) return False;
    if (callm != callm_s || callm != callm_e) return False;

    /* Check the sections between CALLM_S and CALLM's.  Ensure that no
       PUSH uinsn pushes any TempReg that any other PUSH in the same
       section pushes.  Ie, check that the TempReg args to PUSHes in
       the section are unique.  If not, the instrumenter generates
       incorrect code for CALLM insns. */

    callm_ptr = 0;

  find_next_CALLM:
    /* Search for the next interval, making calls_ptr .. callm_ptr
       bracket it. */
    while (callm_ptr < cb->used
           && cb->instrs[callm_ptr].opcode != CALLM)
       callm_ptr++;
    if (callm_ptr == cb->used)
       return True;
    vg_assert(cb->instrs[callm_ptr].opcode == CALLM);

    calls_ptr = callm_ptr - 1;
    while (cb->instrs[calls_ptr].opcode != CALLM_S)
       calls_ptr--;
    vg_assert(cb->instrs[calls_ptr].opcode == CALLM_S);
    vg_assert(calls_ptr >= 0);

    /* VG_(printf)("interval from %d to %d\n", calls_ptr, callm_ptr ); */

    /* For each PUSH insn in the interval ... */
    for (i = calls_ptr + 1; i < callm_ptr; i++) {
       if (cb->instrs[i].opcode != PUSH) continue;
       t = cb->instrs[i].val1;
       /* Ensure no later PUSH insns up to callm_ptr push the same
          TempReg.  Return False if any such are found. */
       for (j = i+1; j < callm_ptr; j++) {
          if (cb->instrs[j].opcode == PUSH &&
              cb->instrs[j].val1 == t)
             return False;
       }
    }

    /* This interval is clean.  Keep going ... */
    callm_ptr++;
    goto find_next_CALLM;
 }


 /*------------------------------------------------------------*/
 /*--- Printing uinstrs.                                    ---*/
 /*------------------------------------------------------------*/

 /* Global that dictates whether to print generated code at all stages */
 Bool VG_(print_codegen);

 Char* VG_(name_UCondcode) ( Condcode cond )
 {
    switch (cond) {
       case CondO:      return "o";
       case CondNO:     return "no";
       case CondB:      return "b";
       case CondNB:     return "nb";
       case CondZ:      return "z";
       case CondNZ:     return "nz";
       case CondBE:     return "be";
       case CondNBE:    return "nbe";
       case CondS:      return "s";
       case CondNS:     return "ns";
       case CondP:      return "p";
       case CondNP:     return "np";
       case CondL:      return "l";
       case CondNL:     return "nl";
       case CondLE:     return "le";
       case CondNLE:    return "nle";
       case CondAlways: return "MP"; /* hack! */
       default: VG_(core_panic)("name_UCondcode");
    }
 }


 static void vg_ppFlagSet ( Char* prefix, FlagSet set )
 {
    VG_(printf)("%s", prefix);
    if (set & FlagD) VG_(printf)("D");
    if (set & FlagO) VG_(printf)("O");
    if (set & FlagS) VG_(printf)("S");
    if (set & FlagZ) VG_(printf)("Z");
    if (set & FlagA) VG_(printf)("A");
    if (set & FlagC) VG_(printf)("C");
    if (set & FlagP) VG_(printf)("P");
 }


 static void ppTempReg ( Int tt )
 {
    if ((tt & 1) == 0)
       VG_(printf)("t%d", tt);
    else
       VG_(printf)("q%d", tt-1);
 }


 void VG_(pp_UOperand) ( UInstr* u, Int operandNo, Int sz, Bool parens )
 {
    UInt tag, val;
    switch (operandNo) {
       case 1: tag = u->tag1; val = u->val1; break;
       case 2: tag = u->tag2; val = u->val2; break;
       case 3: tag = u->tag3; val = u->val3; break;
       default: VG_(core_panic)("VG_(pp_UOperand)(1)");
    }
    if (tag == Literal) val = u->lit32;

    if (parens) VG_(printf)("(");
    switch (tag) {
       case TempReg:  ppTempReg(val); break;
       case RealReg:  VG_(printf)("%s",nameIReg(sz==0 ? 4 : sz,val)); break;
       case Literal:  VG_(printf)("$0x%x", val); break;
       case Lit16:    VG_(printf)("$0x%x", val); break;
       case NoValue:  VG_(printf)("NoValue"); break;
       case ArchReg:  VG_(printf)("%S",nameIReg(sz,val)); break;
       case ArchRegS: VG_(printf)("%S",nameSReg(val)); break;
       case SpillNo:  VG_(printf)("spill%d", val); break;
       default: VG_(core_panic)("VG_(ppUOperand)(2)");
    }
    if (parens) VG_(printf)(")");
 }


 Char* VG_(name_UOpcode) ( Bool upper, Opcode opc )
 {
    switch (opc) {
       case ADD:   return (upper ? "ADD" : "add");
       case ADC:   return (upper ? "ADC" : "adc");
       case AND:   return (upper ? "AND" : "and");
       case OR:    return (upper ? "OR"  : "or");
       case XOR:   return (upper ? "XOR" : "xor");
       case SUB:   return (upper ? "SUB" : "sub");
       case SBB:   return (upper ? "SBB" : "sbb");
       case SHL:   return (upper ? "SHL" : "shl");
       case SHR:   return (upper ? "SHR" : "shr");
       case SAR:   return (upper ? "SAR" : "sar");
       case ROL:   return (upper ? "ROL" : "rol");
       case ROR:   return (upper ? "ROR" : "ror");
       case RCL:   return (upper ? "RCL" : "rcl");
       case RCR:   return (upper ? "RCR" : "rcr");
       case MUL:   return (upper ? "MUL" : "mul");
       case NOT:   return (upper ? "NOT" : "not");
       case NEG:   return (upper ? "NEG" : "neg");
       case INC:   return (upper ? "INC" : "inc");
       case DEC:   return (upper ? "DEC" : "dec");
       case BSWAP: return (upper ? "BSWAP" : "bswap");
       default:    break;
    }
    if (!upper) VG_(core_panic)("vg_name_UOpcode: invalid !upper");
    switch (opc) {
       case CALLM_S: return "CALLM_S";
       case CALLM_E: return "CALLM_E";
       case INCEIP:  return "INCEIP";
       case LEA1:    return "LEA1";
       case LEA2:    return "LEA2";
       case NOP:     return "NOP";
       case LOCK:    return "LOCK";
       case GET:     return "GET";
       case PUT:     return "PUT";
       case GETF:    return "GETF";
       case PUTF:    return "PUTF";
       case GETSEG:  return "GETSEG";
       case PUTSEG:  return "PUTSEG";
       case USESEG:  return "USESEG";
       case LOAD:    return "LD" ;
       case STORE:   return "ST" ;
       case MOV:     return "MOV";
       case CMOV:    return "CMOV";
       case WIDEN:   return "WIDEN";
       case JMP:     return "J"    ;
       case JIFZ:    return "JIFZ" ;
       case CALLM:   return "CALLM";
       case CCALL:   return "CCALL";
       case PUSH:    return "PUSH" ;
       case POP:     return "POP"  ;
       case CLEAR:   return "CLEAR";
       case CC2VAL:  return "CC2VAL";
       case FPU_R:   return "FPU_R";
       case FPU_W:   return "FPU_W";
       case FPU:     return "FPU"  ;
       case MMX1:       return "MMX1" ;
       case MMX2:       return "MMX2" ;
       case MMX3:       return "MMX3" ;
       case MMX2_MemRd: return "MMX2_MRd" ;
       case MMX2_MemWr: return "MMX2_MWr" ;
       case MMX2a1_MemRd: return "MMX2a1_MRd" ;
       case MMX2_ERegRd: return "MMX2_eRRd" ;
       case MMX2_ERegWr: return "MMX2_eRWr" ;
       case SSE2a_MemWr: return "SSE2a_MWr";
       case SSE2a_MemRd: return "SSE2a_MRd";
       case SSE2g_RegWr: return "SSE2g_RWr";
       case SSE2a1_MemRd: return "SSE2a1_MRd";
       case SSE2g1_RegWr: return "SSE2g1_RWr";
       case SSE2e1_RegRd: return "SSE2e1_RRd";
       case SSE3e_RegRd: return "SSE3e_RRd";
       case SSE3e_RegWr: return "SSE3e_RWr";
       case SSE3g_RegWr: return "SSE3g_RWr";
       case SSE3a1_MemRd: return "SSE3a1_MRd";
       case SSE3g1_RegWr: return "SSE3g1_RWr";
       case SSE3e1_RegRd: return "SSE3e1_RRd";
       case SSE3:        return "SSE3";
       case SSE4:        return "SSE4";
       case SSE5:        return "SSE5";
       case SSE3a_MemWr: return "SSE3a_MWr";
       case SSE3a_MemRd: return "SSE3a_MRd";
       case SSE3ag_MemRd_RegWr: return "SSE3ag_MemRd_RegWr";
       default:
          if (VG_(needs).extended_UCode)
             return SK_(name_XUOpcode)(opc);
          else {
             VG_(printf)("unhandled opcode: %u.  Perhaps "
                         "VG_(needs).extended_UCode should be set?",
                         opc);
             VG_(core_panic)("name_UOpcode: unhandled opcode");
          }
    }
 }

 static
 void pp_realregs_liveness ( UInstr* u )
 {
 #  define PRINT_RREG_LIVENESS(realReg,s) \
      VG_(printf)( IS_RREG_LIVE(VG_(realreg_to_rank)(realReg), \
                                u->regs_live_after)             \
                      ? s : "-");

    VG_(printf)("[");
    PRINT_RREG_LIVENESS(R_EAX, "a");
    PRINT_RREG_LIVENESS(R_EBX, "b");
    PRINT_RREG_LIVENESS(R_ECX, "c");
    PRINT_RREG_LIVENESS(R_EDX, "d");
    PRINT_RREG_LIVENESS(R_ESI, "S");
    PRINT_RREG_LIVENESS(R_EDI, "D");
    VG_(printf)("]");

 #  undef PRINT_RREG_LIVENESS
 }

 /* Ugly-print UInstr :) */
 void VG_(up_UInstr) ( Int i, UInstr* u )
 {
    VG_(pp_UInstr_regs)(i, u);

    VG_(printf)("opcode:          %d\n", u->opcode);
    VG_(printf)("lit32:           0x%x\n", u->lit32);
    VG_(printf)("size:            %d\n", u->size);
    VG_(printf)("val1,val2,val3:  %d, %d, %d\n", u->val1, u->val2, u->val3);
    VG_(printf)("tag1,tag2,tag3:  %d, %d, %d\n", u->tag1, u->tag2, u->tag3);
    VG_(printf)("flags_r:         0x%x\n", u->flags_r);
    VG_(printf)("flags_w:         0x%x\n", u->flags_w);
    VG_(printf)("extra4b:         0x%x\n", u->extra4b);
    VG_(printf)("cond:            0x%x\n", u->cond);
    VG_(printf)("signed_widen:    %d\n", u->signed_widen);
    VG_(printf)("jmpkind:         %d\n", u->jmpkind);
    VG_(printf)("argc,regparms_n: %d, %d\n", u->argc, u->regparms_n);
    VG_(printf)("has_ret_val:     %d\n", u->has_ret_val);
    VG_(printf)("regs_live_after: ");
    pp_realregs_liveness(u);
    VG_(printf)("\n");
 }

 static
 void pp_UInstrWorker ( Int instrNo, UInstr* u, Bool ppRegsLiveness )
 {
    VG_(printf)("\t%4d: %s", instrNo,
                             VG_(name_UOpcode)(True, u->opcode));
    if (u->opcode == JMP || u->opcode == CC2VAL)
       VG_(printf)("%s", VG_(name_UCondcode)(u->cond));

    switch (u->size) {
       case 0:  VG_(printf)("o"); break;
       case 1:  VG_(printf)("B"); break;
       case 2:  VG_(printf)("W"); break;
       case 4:  VG_(printf)("L"); break;
       case 8:  VG_(printf)("Q"); break;
       case 16: VG_(printf)("QQ"); break;
       default: VG_(printf)("%d", (Int)u->size); break;
    }

    VG_(printf)("       \t");

    switch (u->opcode) {

       case CALLM_S: case CALLM_E:
          break;

       case INCEIP:
          VG_(printf)("$%d", u->val1);
          break;

       case LEA2:
          VG_(printf)("%d(" , u->lit32);
          VG_(pp_UOperand)(u, 1, 4, False);
          VG_(printf)(",");
          VG_(pp_UOperand)(u, 2, 4, False);
          VG_(printf)(",%d), ", (Int)u->extra4b);
          VG_(pp_UOperand)(u, 3, 4, False);
          break;

       case LEA1:
          VG_(printf)("%d" , u->lit32);
          VG_(pp_UOperand)(u, 1, 4, True);
          VG_(printf)(", ");
          VG_(pp_UOperand)(u, 2, 4, False);
          break;

       case NOP: case LOCK:
          break;

       case FPU_W:
          VG_(printf)("0x%x:0x%x, ",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
          VG_(pp_UOperand)(u, 2, 4, True);
          break;

       case FPU_R:
          VG_(printf)("");
          VG_(pp_UOperand)(u, 2, 4, True);
          VG_(printf)(", 0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
          break;

       case FPU:
          VG_(printf)("0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
          break;

       case MMX1:
          VG_(printf)("0x%x",
                      u->val1 & 0xFF );
          break;

       case MMX2:
          VG_(printf)("0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
          break;

       case MMX3:
          VG_(printf)("0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
          break;

       case MMX2_ERegWr:
       case MMX2_ERegRd:
          VG_(printf)("0x%x:0x%x, ",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
          VG_(pp_UOperand)(u, 2, 4, False);
          break;

       case MMX2_MemWr:
       case MMX2_MemRd:
           VG_(printf)("0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
          VG_(pp_UOperand)(u, 2, 4, True);
          break;

       case MMX2a1_MemRd:
           VG_(printf)("0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
          VG_(pp_UOperand)(u, 3, 4, True);
          break;

       case SSE2a_MemWr:
       case SSE2a_MemRd:
       case SSE2g_RegWr:
       case SSE2g1_RegWr:
       case SSE2e1_RegRd:
          VG_(printf)("0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
          VG_(pp_UOperand)(u, 3, 4, True);
          break;

       case SSE2a1_MemRd:
       case SSE3a_MemWr:
       case SSE3a_MemRd:
          VG_(printf)("0x%x:0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                      (u->val2 >> 8) & 0xFF, u->val2 & 0xFF );
          VG_(pp_UOperand)(u, 3, 4, True);
          break;

       case SSE3e_RegWr:
       case SSE3e_RegRd:
       case SSE3g_RegWr:
          VG_(printf)("0x%x:0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                      (u->val2 >> 8) & 0xFF, u->val2 & 0xFF );
          VG_(pp_UOperand)(u, 3, 4, True);
          break;

       case SSE3g1_RegWr:
       case SSE3e1_RegRd:
       case SSE3a1_MemRd:
          VG_(printf)("0x%x:0x%x:0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                      (u->val2 >> 8) & 0xFF, u->val2 & 0xFF,
                      u->lit32 );
          VG_(pp_UOperand)(u, 3, 4, True);
          break;

       case SSE3:
          VG_(printf)("0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                      u->val2 & 0xFF );
          break;

       case SSE4:
          VG_(printf)("0x%x:0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                      (u->val2 >> 8) & 0xFF, u->val2 & 0xFF );
          break;

       case SSE5:
          VG_(printf)("0x%x:0x%x:0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                      (u->val2 >> 8) & 0xFF, u->val2 & 0xFF,
                      u->val3 & 0xFF );
          break;

       case SSE3ag_MemRd_RegWr:
 	 VG_(printf)("0x%x(addr=", u->lit32 );
 	 VG_(pp_UOperand)(u, 1, 4, False);
 	 VG_(printf)(", dst=");
 	 VG_(pp_UOperand)(u, 2, 4, False);
          VG_(printf)(")");
          break;

       case GET: case PUT: case MOV: case LOAD: case STORE: case CMOV:
       case GETSEG: case PUTSEG:
          VG_(pp_UOperand)(u, 1, u->size, u->opcode==LOAD);
          VG_(printf)(", ");
          VG_(pp_UOperand)(u, 2, u->size, u->opcode==STORE);
          break;

       case JMP:
          switch (u->jmpkind) {
             case JmpCall:      VG_(printf)("-c"); break;
             case JmpRet:       VG_(printf)("-r"); break;
             case JmpSyscall:   VG_(printf)("-sys"); break;
             case JmpClientReq: VG_(printf)("-cli"); break;
             case JmpYield:     VG_(printf)("-yld"); break;
             default: break;
          }
          VG_(pp_UOperand)(u, 1, u->size, False);
          if (CondAlways == u->cond) {
             /* Print x86 instruction size if filled in */
             if (0 != u->extra4b)
                VG_(printf)("  ($%u)", u->extra4b);
          }
          break;

       case GETF: case PUTF:
       case CC2VAL: case PUSH: case POP: case CLEAR: case CALLM:
       case NOT: case NEG: case INC: case DEC: case BSWAP:
          VG_(pp_UOperand)(u, 1, u->size, False);
          break;

       /* Print a "(s)" after args passed on stack */
       case CCALL:
          if (u->has_ret_val) {
             VG_(pp_UOperand)(u, 3, 0, False);
             VG_(printf)(" = ");
          }
          VG_(printf)("%p(", u->lit32);
          if (u->argc > 0) {
             VG_(pp_UOperand)(u, 1, 0, False);
             if (u->regparms_n < 1)
                VG_(printf)("(s)");
          }
          if (u->argc > 1) {
             VG_(printf)(", ");
             VG_(pp_UOperand)(u, 2, 0, False);
             if (u->regparms_n < 2)
                VG_(printf)("(s)");
          }
          if (u->argc > 2) {
             VG_(printf)(", ");
             VG_(pp_UOperand)(u, 3, 0, False);
             if (u->regparms_n < 3)
                VG_(printf)("(s)");
          }
          VG_(printf)(") ");
          break;

       case USESEG:
       case JIFZ:
       case ADD: case ADC: case AND: case OR:
       case XOR: case SUB: case SBB:
       case SHL: case SHR: case SAR:
       case ROL: case ROR: case RCL: case RCR:
       case MUL:
          VG_(pp_UOperand)(u, 1, u->size, False);
          VG_(printf)(", ");
          VG_(pp_UOperand)(u, 2, u->size, False);
          break;

       case WIDEN:
          VG_(printf)("_%c%c", VG_(toupper)(nameISize(u->extra4b)),
                               u->signed_widen?'s':'z');
          VG_(pp_UOperand)(u, 1, u->size, False);
          break;

       default:
          if (VG_(needs).extended_UCode)
             SK_(pp_XUInstr)(u);
          else {
             VG_(printf)("unhandled opcode: %u.  Perhaps "
                         "VG_(needs).extended_UCode should be set?",
                         u->opcode);
             VG_(core_panic)("pp_UInstr: unhandled opcode");
          }
    }
    if (u->flags_r != FlagsEmpty || u->flags_w != FlagsEmpty) {
       VG_(printf)("  (");
       if (u->flags_r != FlagsEmpty)
          vg_ppFlagSet("-r", u->flags_r);
       if (u->flags_w != FlagsEmpty)
          vg_ppFlagSet("-w", u->flags_w);
       VG_(printf)(")");
    }

    if (ppRegsLiveness) {
       VG_(printf)("\t\t");
       pp_realregs_liveness ( u );
    }

    VG_(printf)("\n");
 }

 void VG_(pp_UInstr) ( Int instrNo, UInstr* u )
 {
    pp_UInstrWorker ( instrNo, u, /*ppRegsLiveness*/False );
 }

 void VG_(pp_UInstr_regs) ( Int instrNo, UInstr* u )
 {
    pp_UInstrWorker ( instrNo, u, /*ppRegsLiveness*/True );
 }

 void VG_(pp_UCodeBlock) ( UCodeBlock* cb, Char* title )
 {
    Int i;
    VG_(printf)("%s\n", title);
    for (i = 0; i < cb->used; i++)
       if (cb->instrs[i].opcode != NOP)
          VG_(pp_UInstr) ( i, &cb->instrs[i] );
    VG_(printf)("\n");
 }


 /*------------------------------------------------------------*/
 /*--- uinstr helpers for register allocation               ---*/
 /*--- and code improvement.                                ---*/
 /*------------------------------------------------------------*/

 /* Get the temp/reg use of a uinstr, parking them in an array supplied by
    the caller (regs), which is assumed to be big enough.  Return the number
    of entries.  Written regs are indicated in parallel array isWrites.
    Insns which read _and_ write a register wind up mentioning it twice.
    Entries are placed in the array in program order, so that if a reg is
    read-modified-written, it appears first as a read and then as a write.
    'tag' indicates whether we are looking at TempRegs or RealRegs.
 */
 Int VG_(get_reg_usage) ( UInstr* u, Tag tag, Int* regs, Bool* isWrites )
 {
 #  define RD(ono)    VG_UINSTR_READS_REG(ono, regs, isWrites)
 #  define WR(ono)    VG_UINSTR_WRITES_REG(ono, regs, isWrites)

    Int n = 0;
    switch (u->opcode) {
       case LEA1: RD(1); WR(2); break;
       case LEA2: RD(1); RD(2); WR(3); break;

       case SSE3a1_MemRd:
       case SSE2a1_MemRd:
       case SSE2e1_RegRd:
       case SSE3e_RegRd:
       case SSE3a_MemWr:
       case SSE3a_MemRd:
       case SSE2a_MemWr:
       case SSE3e1_RegRd:
       case SSE2a_MemRd: RD(3); break;

       case SSE2g_RegWr:
       case SSE2g1_RegWr:
       case SSE3e_RegWr:
       case SSE3g1_RegWr:
       case SSE3g_RegWr: WR(3); break;

       case SSE3ag_MemRd_RegWr: RD(1); WR(2); break;

       case MMX2a1_MemRd: RD(3); break;
       case MMX2_ERegRd: RD(2); break;
       case MMX2_ERegWr: WR(2); break;

       case SSE4: case SSE3: case SSE5:
       case MMX1: case MMX2: case MMX3:
       case NOP:   case FPU:   case INCEIP: case CALLM_S: case CALLM_E:
       case CLEAR: case CALLM: case LOCK: break;

       case CCALL:
          if (u->argc > 0)    RD(1);
          if (u->argc > 1)    RD(2);
          if (u->argc > 2)    RD(3);
          if (u->has_ret_val) WR(3);
          break;

       case MMX2_MemRd: case MMX2_MemWr:
       case FPU_R: case FPU_W: RD(2); break;

       case GETSEG: WR(2); break;
       case PUTSEG: RD(1); break;

       case GETF:  WR(1); break;
       case PUTF:  RD(1); break;

       case GET:   WR(2); break;
       case PUT:   RD(1); break;
       case LOAD:  RD(1); WR(2); break;
       case STORE: RD(1); RD(2); break;
       case MOV:   RD(1); WR(2); break;

       case JMP:   RD(1); break;

       case PUSH: RD(1); break;
       case POP:  WR(1); break;

       case USESEG:
       case CMOV:
       case ADD: case ADC: case AND: case OR:
       case XOR: case SUB: case SBB:
       case MUL:
          RD(1); RD(2); WR(2); break;

       case SHL: case SHR: case SAR:
       case ROL: case ROR: case RCL: case RCR:
          RD(1); RD(2); WR(2); break;

       case NOT: case NEG: case INC: case DEC: case BSWAP:
          RD(1); WR(1); break;

       case WIDEN: RD(1); WR(1); break;

       case CC2VAL: WR(1); break;
       case JIFZ: RD(1); break;

       default:
          if (VG_(needs).extended_UCode)
             return SK_(get_Xreg_usage)(u, tag, regs, isWrites);
          else {
             VG_(printf)("unhandled opcode: %u.  Perhaps "
                         "VG_(needs).extended_UCode should be set?",
                         u->opcode);
             VG_(core_panic)("VG_(get_reg_usage): unhandled opcode");
          }
    }
    return n;

 #  undef RD
 #  undef WR
 }


 /* Change temp regs in u into real regs, as directed by the
  * temps[i]-->reals[i] mapping. */
 static
 void patchUInstr ( UInstr* u, Int temps[], UInt reals[], Int n_tmap )
 {
    Int i;
    if (u->tag1 == TempReg) {
       for (i = 0; i < n_tmap; i++)
          if (temps[i] == u->val1) break;
       if (i == n_tmap) VG_(core_panic)("patchUInstr(1)");
       u->tag1 = RealReg;
       u->val1 = reals[i];
    }
    if (u->tag2 == TempReg) {
       for (i = 0; i < n_tmap; i++)
          if (temps[i] == u->val2) break;
       if (i == n_tmap) VG_(core_panic)("patchUInstr(2)");
       u->tag2 = RealReg;
       u->val2 = reals[i];
    }
    if (u->tag3 == TempReg) {
       for (i = 0; i < n_tmap; i++)
          if (temps[i] == u->val3) break;
       if (i == n_tmap) VG_(core_panic)("patchUInstr(3)");
       u->tag3 = RealReg;
       u->val3 = reals[i];
    }
 }


 /* Tedious x86-specific hack which compensates for the fact that the
    register numbers for %ah .. %dh do not correspond to those for %eax
    .. %edx.  It maps a (reg size, reg no) pair to the number of the
    containing 32-bit reg. */
 static __inline__
 Int containingArchRegOf ( Int sz, Int aregno )
 {
    switch (sz) {
       case 4: return aregno;
       case 2: return aregno;
       case 1: return aregno >= 4 ? aregno-4 : aregno;
       default: VG_(core_panic)("containingArchRegOf");
    }
 }


 /* If u reads an ArchReg, return the number of the containing arch
    reg.  Otherwise return -1.  Used in redundant-PUT elimination.
    Note that this is not required for skins extending UCode because
    this happens before instrumentation. */
 static
 Int maybe_uinstrReadsArchReg ( UInstr* u )
 {
    switch (u->opcode) {
       case GET:
       case ADD: case ADC: case AND: case OR:
       case XOR: case SUB: case SBB:
       case SHL: case SHR: case SAR: case ROL:
       case ROR: case RCL: case RCR:
       case MUL:
          if (u->tag1 == ArchReg)
             return containingArchRegOf ( u->size, u->val1 );
          else
             return -1;

       case GETF: case PUTF:
       case CALLM_S: case CALLM_E:
       case INCEIP:
       case LEA1:
       case LEA2:
       case NOP:
       case LOCK:
       case PUT:
       case LOAD:
       case STORE:
       case MOV:
       case CMOV:
       case JMP:
       case CALLM: case CLEAR: case PUSH: case POP:
       case NOT: case NEG: case INC: case DEC: case BSWAP:
       case CC2VAL:
       case JIFZ:
       case FPU: case FPU_R: case FPU_W:
       case MMX1: case MMX2: case MMX3:
       case MMX2_MemRd: case MMX2_MemWr: case MMX2a1_MemRd:
       case MMX2_ERegRd: case MMX2_ERegWr:
       case SSE2a_MemWr: case SSE2a_MemRd: case SSE2a1_MemRd:
       case SSE2g_RegWr: case SSE2g1_RegWr: case SSE2e1_RegRd:
       case SSE3a_MemWr: case SSE3a_MemRd: case SSE3a1_MemRd:
       case SSE3e_RegRd: case SSE3g_RegWr: case SSE3e_RegWr:
       case SSE3g1_RegWr: case SSE3e1_RegRd:
       case SSE4: case SSE3: case SSE5: case SSE3ag_MemRd_RegWr:
       case WIDEN:
       /* GETSEG and USESEG are to do with ArchRegS, not ArchReg */
       case GETSEG: case PUTSEG:
       case USESEG:
          return -1;

       default:
          VG_(pp_UInstr)(0,u);
          VG_(core_panic)("maybe_uinstrReadsArchReg: unhandled opcode");
    }
 }

 static __inline__
 Bool uInstrMentionsTempReg ( UInstr* u, Int tempreg )
 {
    Int i, k;
    Int tempUse[VG_MAX_REGS_USED];
    Bool notUsed[VG_MAX_REGS_USED];

    k = VG_(get_reg_usage) ( u, TempReg, &tempUse[0], &notUsed[0] );
    for (i = 0; i < k; i++)
       if (tempUse[i] == tempreg)
          return True;
    return False;
 }


 /*------------------------------------------------------------*/
 /*--- ucode improvement.                                   ---*/
 /*------------------------------------------------------------*/

 /* Improve the code in cb by doing
    -- Redundant ArchReg-fetch elimination
    -- Redundant PUT elimination
    -- Redundant cond-code restore/save elimination
    The overall effect of these is to allow target registers to be
    cached in host registers over multiple target insns.
 */
 static void vg_improve ( UCodeBlock* cb )
 {
    Int     i, j, k, m, n, ar, tr, told, actual_areg;
    Int     areg_map[8];
    Bool    annul_put[8];
    Int     tempUse[VG_MAX_REGS_USED];
    Bool    isWrites[VG_MAX_REGS_USED];
    UInstr* u;
    Bool    wr;
    Int*    last_live_before;
    FlagSet future_dead_flags;

    if (dis)
       VG_(printf) ("Improvements:\n");

    if (cb->nextTemp > 0)
       last_live_before = VG_(arena_malloc) ( VG_AR_JITTER,
                                              cb->nextTemp * sizeof(Int) );
    else
       last_live_before = NULL;


    /* PASS 1: redundant GET elimination.  (Actually, more general than
       that -- eliminates redundant fetches of ArchRegs). */

    /* Find the live-range-ends for all temporaries.  Duplicates code
       in the register allocator :-( */

    for (i = 0; i < cb->nextTemp; i++) last_live_before[i] = -1;

    for (i = cb->used-1; i >= 0; i--) {
       u = &cb->instrs[i];

       k = VG_(get_reg_usage)(u, TempReg, &tempUse[0], &isWrites[0]);

       /* For each temp usage ... bwds in program order. */
       for (j = k-1; j >= 0; j--) {
          tr = tempUse[j];
          wr = isWrites[j];
          if (last_live_before[tr] == -1) {
             vg_assert(tr >= 0 && tr < cb->nextTemp);
             last_live_before[tr] = wr ? (i+1) : i;
          }
       }

    }

 #  define BIND_ARCH_TO_TEMP(archreg,tempreg)\
    { Int q;                                           \
      /* Invalidate any old binding(s) to tempreg. */  \
      for (q = 0; q < 8; q++)                          \
         if (areg_map[q] == tempreg) areg_map[q] = -1; \
      /* Add the new binding. */                       \
      areg_map[archreg] = (tempreg);                   \
    }

    /* Set up the A-reg map. */
    for (i = 0; i < 8; i++) areg_map[i] = -1;

    /* Scan insns. */
    for (i = 0; i < cb->used; i++) {
       u = &cb->instrs[i];
       if (u->opcode == GET && u->size == 4) {
          /* GET; see if it can be annulled. */
          vg_assert(u->tag1 == ArchReg);
          vg_assert(u->tag2 == TempReg);
          ar   = u->val1;
          tr   = u->val2;
          told = areg_map[ar];
          if (told != -1 && last_live_before[told] <= i) {
             /* ar already has an old mapping to told, but that runs
                out here.  Annul this GET, rename tr to told for the
                rest of the block, and extend told's live range to that
                of tr.  */
             VG_(new_NOP)(u);
             n = last_live_before[tr] + 1;
             if (n > cb->used) n = cb->used;
             last_live_before[told] = last_live_before[tr];
             last_live_before[tr] = i-1;
             if (dis)
                VG_(printf)(
                   "   at %2d: delete GET, rename t%d to t%d in (%d .. %d)\n",
                   i, tr, told,i+1, n-1);
             for (m = i+1; m < n; m++) {
                if (cb->instrs[m].tag1 == TempReg
                    && cb->instrs[m].val1 == tr)
                  cb->instrs[m].val1 = told;
                if (cb->instrs[m].tag2 == TempReg
                    && cb->instrs[m].val2 == tr)
                  cb->instrs[m].val2 = told;
                if (cb->instrs[m].tag3 == TempReg
                    && cb->instrs[m].val3 == tr)
                  cb->instrs[m].val3 = told;
             }
             BIND_ARCH_TO_TEMP(ar,told);
          }
          else
             BIND_ARCH_TO_TEMP(ar,tr);
       }
       else if (u->opcode == GET && u->size != 4) {
          /* Invalidate any mapping for this archreg.  */
          actual_areg = containingArchRegOf ( u->size, u->val1 );
          areg_map[actual_areg] = -1;
       }
       else if (u->opcode == PUT && u->size == 4) {
          /* PUT; re-establish t -> a binding */
          vg_assert(u->tag1 == TempReg);
          vg_assert(u->tag2 == ArchReg);
          BIND_ARCH_TO_TEMP(u->val2, u->val1);
       }
       else if (u->opcode == PUT && u->size != 4) {
          /* Invalidate any mapping for this archreg. */
          actual_areg = containingArchRegOf ( u->size, u->val2 );
          areg_map[actual_areg] = -1;
       } else {

          /* see if insn has an archreg as a read operand; if so try to
             map it. */
          if (u->tag1 == ArchReg && u->size == 4
                                 && areg_map[u->val1] != -1) {
             switch (u->opcode) {
                case ADD: case SUB: case AND: case OR: case XOR:
                case ADC: case SBB:
                case SHL: case SHR: case SAR: case ROL: case ROR:
                case RCL: case RCR:
 	       case MUL:
                   if (dis)
                      VG_(printf)(
                         "   at %2d: change ArchReg %S to TempReg t%d\n",
                         i, nameIReg(4,u->val1), areg_map[u->val1]);
                   u->tag1 = TempReg;
                   u->val1 = areg_map[u->val1];
                   /* Remember to extend the live range of the TempReg,
                      if necessary. */
                   if (last_live_before[u->val1] < i)
                      last_live_before[u->val1] = i;
                   break;
                default:
                   break;
             }
          }

          /* boring insn; invalidate any mappings to temps it writes */
          k = VG_(get_reg_usage)(u, TempReg, &tempUse[0], &isWrites[0]);

          for (j = 0; j < k; j++) {
             wr  = isWrites[j];
             if (!wr) continue;
             tr = tempUse[j];
             for (m = 0; m < 8; m++)
                if (areg_map[m] == tr) areg_map[m] = -1;
          }
       }

    }

 #  undef BIND_ARCH_TO_TEMP

    /* PASS 2: redundant PUT elimination.  Don't annul (delay) puts of
       %ESP, since the memory check machinery always requires the
       in-memory value of %ESP to be up to date.  Although this isn't
       actually required by other analyses (cache simulation), it's
       simplest to be consistent for all end-uses. */
    for (j = 0; j < 8; j++)
       annul_put[j] = False;

    for (i = cb->used-1; i >= 0; i--) {
       u = &cb->instrs[i];
       if (u->opcode == NOP) continue;

       if (u->opcode == PUT && u->size == 4) {
          vg_assert(u->tag2 == ArchReg);
          actual_areg = containingArchRegOf ( 4, u->val2 );
          if (annul_put[actual_areg]) {
             vg_assert(actual_areg != R_ESP);
             VG_(new_NOP)(u);
             if (dis)
                VG_(printf)("   at %2d: delete PUT\n", i );
          } else {
             if (actual_areg != R_ESP)
                annul_put[actual_areg] = True;
          }
       }
       else if (u->opcode == PUT && u->size != 4) {
          actual_areg = containingArchRegOf ( u->size, u->val2 );
          annul_put[actual_areg] = False;
       }
       else if (u->opcode == JMP || u->opcode == JIFZ
                || u->opcode == CALLM) {
          for (j = 0; j < 8; j++)
             annul_put[j] = False;
       }
       else {
          /* If an instruction reads an ArchReg, the immediately
             preceding PUT cannot be annulled. */
          actual_areg = maybe_uinstrReadsArchReg ( u );
          if (actual_areg != -1)
             annul_put[actual_areg] = False;
       }
    }

    /* PASS 2a: redundant-move elimination.  Given MOV t1, t2 and t1 is
       dead after this point, annul the MOV insn and rename t2 to t1.
       Further modifies the last_live_before map. */

 #  if 0
    VG_(pp_UCodeBlock)(cb, "Before MOV elimination" );
    for (i = 0; i < cb->nextTemp; i++)
      VG_(printf)("llb[t%d]=%d   ", i, last_live_before[i]);
    VG_(printf)("\n");
 #  endif

    for (i = 0; i < cb->used-1; i++) {
       u = &cb->instrs[i];
       if (u->opcode != MOV) continue;
       if (u->tag1 == Literal) continue;
       vg_assert(u->tag1 == TempReg);
       vg_assert(u->tag2 == TempReg);
       if (last_live_before[u->val1] == i) {
          if (dis)
             VG_(printf)(
                "   at %2d: delete MOV, rename t%d to t%d in (%d .. %d)\n",
                i, u->val2, u->val1, i+1, last_live_before[u->val2] );
          for (j = i+1; j <= last_live_before[u->val2]; j++) {
             if (cb->instrs[j].tag1 == TempReg
                 && cb->instrs[j].val1 == u->val2)
                cb->instrs[j].val1 = u->val1;
             if (cb->instrs[j].tag2 == TempReg
                 && cb->instrs[j].val2 == u->val2)
                cb->instrs[j].val2 = u->val1;
             if (cb->instrs[j].tag3 == TempReg
                 && cb->instrs[j].val3 == u->val2)
                cb->instrs[j].val3 = u->val1;
          }
          last_live_before[u->val1] = last_live_before[u->val2];
          last_live_before[u->val2] = i-1;
          VG_(new_NOP)(u);
       }
    }

    /* PASS 3: redundant condition-code restore/save elimination.
       Scan backwards from the end.  future_dead_flags records the set
       of flags which are dead at this point, that is, will be written
       before they are next read.  Earlier uinsns which write flags
       already in future_dead_flags can have their writes annulled.
    */
    future_dead_flags = FlagsEmpty;

    for (i = cb->used-1; i >= 0; i--) {
       u = &cb->instrs[i];

       /* We might never make it to insns beyond this one, so be
          conservative. */
       if (u->opcode == JIFZ || u->opcode == JMP) {
          future_dead_flags = FlagsEmpty;
          continue;
       }

       /* PUTF modifies the %EFLAGS in essentially unpredictable ways.
          For example people try to mess with bit 21 to see if CPUID
          works.  The setting may or may not actually take hold.  So we
          play safe here. */
       if (u->opcode == PUTF) {
          future_dead_flags = FlagsEmpty;
          continue;
       }

       /* We can annul the flags written by this insn if it writes a
          subset (or eq) of the set of flags known to be dead after
          this insn.  If not, just record the flags also written by
          this insn.*/
       if (u->flags_w != FlagsEmpty
           && VG_IS_FLAG_SUBSET(u->flags_w, future_dead_flags)) {
          if (dis) {
             VG_(printf)("   at %2d: annul flag write ", i);
             vg_ppFlagSet("", u->flags_w);
             VG_(printf)(" due to later ");
             vg_ppFlagSet("", future_dead_flags);
             VG_(printf)("\n");
          }
          u->flags_w = FlagsEmpty;
       } else {
         future_dead_flags
            = VG_UNION_FLAG_SETS ( u->flags_w, future_dead_flags );
       }

       /* If this insn also reads flags, empty out future_dead_flags so
          as to force preceding writes not to be annulled. */
       if (u->flags_r != FlagsEmpty)
          future_dead_flags = FlagsEmpty;
    }

    if (last_live_before)
       VG_(arena_free) ( VG_AR_JITTER, last_live_before );

    if (dis) {
       VG_(printf)("\n");
       VG_(pp_UCodeBlock) ( cb, "Improved UCode:" );
    }
 }

 /*------------------------------------------------------------*/
 /*--- %ESP-update pass                                     ---*/
 /*------------------------------------------------------------*/

 /* For skins that want to know about %ESP changes, this pass adds
    in the appropriate hooks.  We have to do it after the skin's
    instrumentation, so the skin doesn't have to worry about the CCALLs
    it adds in, and we must do it before register allocation because
    spilled temps make it much harder to work out the %esp deltas.
    Thus we have it as an extra phase between the two.

    We look for "GETL %ESP, t_ESP", then track ADDs and SUBs of
    literal values to t_ESP, and the total delta of the ADDs/SUBs.  Then if
    "PUTL t_ESP, %ESP" happens, we call the helper with the known delta.  We
    also cope with "MOVL t_ESP, tX", making tX the new t_ESP.  If any other
    instruction clobbers t_ESP, we don't track it anymore, and fall back to
    the delta-is-unknown case.  That case is also used when the delta is not
    a nice small amount, or an unknown amount.
 */
 static
 UCodeBlock* vg_ESP_update_pass(UCodeBlock* cb_in)
 {
    UCodeBlock* cb;
    UInstr*     u;
    Int         delta = 0;
    UInt        t_ESP = INVALID_TEMPREG;
    Int         i;

    cb = VG_(setup_UCodeBlock)(cb_in);

    for (i = 0; i < VG_(get_num_instrs)(cb_in); i++) {
       u = VG_(get_instr)(cb_in, i);

       if (GET == u->opcode && R_ESP == u->val1) {
          t_ESP = u->val2;
          delta = 0;

       } else if (PUT == u->opcode && R_ESP == u->val2 && 4 == u->size) {

 #           define DO_GENERIC					\
                if (VG_(defined_new_mem_stack)() ||		\
                    VG_(defined_die_mem_stack)()) {		\
                   uInstr1(cb, CCALL, 0, TempReg, u->val1);	\
                   uCCall(cb, (Addr) VG_(unknown_esp_update),	\
                          1, 1, False);				\
                }

 #           define DO(kind, size)								\
                if (VG_(defined_##kind##_mem_stack_##size)()) {					\
                   uInstr1(cb, CCALL, 0, TempReg, u->val1);					\
                   uCCall(cb, (Addr) VG_(tool_interface).track_##kind##_mem_stack_##size,	\
                          1, 1, False);								\
 												\
                } else										\
                   DO_GENERIC									\
                break

          if (u->val1 == t_ESP) {
             /* Known delta, common cases handled specially. */
             switch (delta) {
             case   0: break;
             case   4: DO(die, 4);
             case  -4: DO(new, 4);
             case   8: DO(die, 8);
             case  -8: DO(new, 8);
             case  12: DO(die, 12);
             case -12: DO(new, 12);
             case  16: DO(die, 16);
             case -16: DO(new, 16);
             case  32: DO(die, 32);
             case -32: DO(new, 32);
             default:  DO_GENERIC;   break;
             }
          } else {
             /* Unknown delta */
             DO_GENERIC;

             /* now we know the temp that points to %ESP */
             t_ESP = u->val1;
          }
          delta = 0;

 #        undef DO
 #        undef DO_GENERIC

       } else if (ADD == u->opcode && Literal == u->tag1 && t_ESP == u->val2) {
          delta += u->lit32;

       } else if (SUB == u->opcode && Literal == u->tag1 && t_ESP == u->val2) {
          delta -= u->lit32;

       } else if (MOV == u->opcode && TempReg == u->tag1 && t_ESP == u->val1 &&
                                      TempReg == u->tag2) {
          // t_ESP is transferred
          t_ESP = u->val2;

       } else {
          // Stop tracking t_ESP if it's clobbered by this instruction.
          Int  tempUse [VG_MAX_REGS_USED];
          Bool isWrites[VG_MAX_REGS_USED];
          Int  j, n = VG_(get_reg_usage)(u, TempReg, tempUse, isWrites);

          for (j = 0; j < n; j++) {
             if (tempUse[j] == t_ESP && isWrites[j])
                t_ESP = INVALID_TEMPREG;
          }
       }
       VG_(copy_UInstr) ( cb, u );
    }

    VG_(free_UCodeBlock)(cb_in);
    return cb;
 }

 /*------------------------------------------------------------*/
 /*--- The new register allocator.                          ---*/
 /*------------------------------------------------------------*/

 typedef
    struct {
       /* Becomes live for the first time after this insn ... */
       Int live_after;
       /* Becomes dead for the last time after this insn ... */
       Int dead_before;
       /* The "home" spill slot, if needed.  Never changes. */
       Int spill_no;
       /* Where is it?  VG_NOVALUE==in a spill slot; else in reg. */
       Int real_no;
    }
    TempInfo;


 /* Take a ucode block and allocate its TempRegs to RealRegs, or put
    them in spill locations, and add spill code, if there are not
    enough real regs.  The usual register allocation deal, in short.

    Important redundancy of representation:

      real_to_temp maps real reg ranks (RRRs) to TempReg nos, or
      to VG_NOVALUE if the real reg has no currently assigned TempReg.

      The .real_no field of a TempInfo gives the current RRR for
      this TempReg, or VG_NOVALUE if the TempReg is currently
      in memory, in which case it is in the SpillNo denoted by
      spillno.

    These pieces of information (a fwds-bwds mapping, really) must
    be kept consistent!

    This allocator uses the so-called Second Chance Bin Packing
    algorithm, as described in "Quality and Speed in Linear-scan
    Register Allocation" (Traub, Holloway and Smith, ACM PLDI98,
    pp142-151).  It is simple and fast and remarkably good at
    minimising the amount of spill code introduced.
 */

 static
 UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
 {
    TempInfo*    temp_info;
    Int          real_to_temp [VG_MAX_REALREGS];
    Bool         is_spill_cand[VG_MAX_REALREGS];
    Int          ss_busy_until_before[VG_MAX_SPILLSLOTS];
    Int          i, j, k, m, r, tno, max_ss_no;
    Bool         wr, defer, isRead, spill_reqd;
    UInt         realUse [VG_MAX_REGS_USED];
    Int          tempUse [VG_MAX_REGS_USED];
    Bool         isWrites[VG_MAX_REGS_USED];
    UCodeBlock*  c2;

    /* Used to denote ... well, "no value" in this fn. */
 #  define VG_NOTHING (-2)

    /* Initialise the TempReg info.  */
    if (c1->nextTemp > 0)
       temp_info = VG_(arena_malloc)(VG_AR_JITTER,
                                     c1->nextTemp * sizeof(TempInfo) );
    else
       temp_info = NULL;

    for (i = 0; i < c1->nextTemp; i++) {
       temp_info[i].live_after  = VG_NOTHING;
       temp_info[i].dead_before = VG_NOTHING;
       temp_info[i].spill_no    = VG_NOTHING;
       /* temp_info[i].real_no is not yet relevant. */
    }

    spill_reqd = False;

    /* Scan fwds to establish live ranges. */

    for (i = 0; i < c1->used; i++) {
       k = VG_(get_reg_usage)(&c1->instrs[i], TempReg, &tempUse[0],
                              &isWrites[0]);
       vg_assert(k >= 0 && k <= VG_MAX_REGS_USED);

       /* For each temp usage ... fwds in program order */
       for (j = 0; j < k; j++) {
          tno = tempUse[j];
          wr  = isWrites[j];
          if (wr) {
             /* Writes hold a reg live until after this insn. */
             if (temp_info[tno].live_after == VG_NOTHING)
                temp_info[tno].live_after = i;
             if (temp_info[tno].dead_before < i + 1)
                temp_info[tno].dead_before = i + 1;
          } else {
             /* First use of a tmp should be a write. */
             if (temp_info[tno].live_after == VG_NOTHING) {
                VG_(printf)("At instr %d...\n", i);
                VG_(core_panic)("First use of tmp not a write,"
                                " probably a skin instrumentation error");
             }
             /* Reads only hold it live until before this insn. */
             if (temp_info[tno].dead_before < i)
                temp_info[tno].dead_before = i;
          }
       }
    }

 #  if 0
    /* Sanity check on live ranges.  Expensive but correct. */
    for (i = 0; i < c1->nextTemp; i++) {
       vg_assert( (temp_info[i].live_after == VG_NOTHING
                   && temp_info[i].dead_before == VG_NOTHING)
                  || (temp_info[i].live_after != VG_NOTHING
                      && temp_info[i].dead_before != VG_NOTHING) );
    }
 #  endif

    /* Do a rank-based allocation of TempRegs to spill slot numbers.
       We put as few as possible values in spill slots, but
       nevertheless need to have an assignment to them just in case. */

    max_ss_no = -1;

    for (i = 0; i < VG_MAX_SPILLSLOTS; i++)
       ss_busy_until_before[i] = 0;

    for (i = 0; i < c1->nextTemp; i++) {

       /* True iff this temp is unused. */
       if (temp_info[i].live_after == VG_NOTHING)
          continue;

       /* Find the lowest-numbered spill slot which is available at the
          start point of this interval, and assign the interval to
          it. */
       for (j = 0; j < VG_MAX_SPILLSLOTS; j++)
          if (ss_busy_until_before[j] <= temp_info[i].live_after)
             break;
       if (j == VG_MAX_SPILLSLOTS) {
          VG_(printf)("VG_MAX_SPILLSLOTS is too low; increase and recompile.\n");
          VG_(core_panic)("register allocation failed -- out of spill slots");
       }
       ss_busy_until_before[j] = temp_info[i].dead_before;
       temp_info[i].spill_no = j;
       if (j > max_ss_no)
          max_ss_no = j;
    }

    VG_(total_reg_rank) += (max_ss_no+1);

    /* Show live ranges and assigned spill slot nos. */

    if (dis) {
       VG_(printf)("Live range assignments:\n");

       for (i = 0; i < c1->nextTemp; i++) {
          if (temp_info[i].live_after == VG_NOTHING)
             continue;
          VG_(printf)(
             "   LR %d is  after %d to before %d\tspillno %d\n",
             i,
             temp_info[i].live_after,
             temp_info[i].dead_before,
             temp_info[i].spill_no
          );
       }
       VG_(printf)("\n");
    }

    /* Now that we've established a spill slot number for each used
       temporary, we can go ahead and do the core of the "Second-chance
       binpacking" allocation algorithm. */

    if (dis) VG_(printf)("Register allocated UCode:\n");


    /* Resulting code goes here.  We generate it all in a forwards
       pass. */
    c2 = VG_(alloc_UCodeBlock)();
    c2->orig_eip = c1->orig_eip;

    /* At the start, no TempRegs are assigned to any real register.
       Correspondingly, all temps claim to be currently resident in
       their spill slots, as computed by the previous two passes. */
    for (i = 0; i < VG_MAX_REALREGS; i++)
       real_to_temp[i] = VG_NOTHING;
    for (i = 0; i < c1->nextTemp; i++)
       temp_info[i].real_no = VG_NOTHING;

    /* Process each insn in turn. */
    for (i = 0; i < c1->used; i++) {

       if (c1->instrs[i].opcode == NOP) continue;
       VG_(uinstrs_prealloc)++;

 #     if 0
       /* Check map consistency.  Expensive but correct. */
       for (r = 0; r < VG_MAX_REALREGS; r++) {
          if (real_to_temp[r] != VG_NOTHING) {
             tno = real_to_temp[r];
             vg_assert(tno >= 0 && tno < c1->nextTemp);
             vg_assert(temp_info[tno].real_no == r);
          }
       }
       for (tno = 0; tno < c1->nextTemp; tno++) {
          if (temp_info[tno].real_no != VG_NOTHING) {
             r = temp_info[tno].real_no;
             vg_assert(r >= 0 && r < VG_MAX_REALREGS);
             vg_assert(real_to_temp[r] == tno);
          }
       }
 #     endif

       if (dis)
          VG_(pp_UInstr)(i, &c1->instrs[i]);

       /* First, free up enough real regs for this insn.  This may
          generate spill stores since we may have to evict some TempRegs
          currently in real regs.  Also generates spill loads. */

       k = VG_(get_reg_usage)(&c1->instrs[i], TempReg, &tempUse[0],
                              &isWrites[0]);
       vg_assert(k >= 0 && k <= VG_MAX_REGS_USED);

       /* For each ***different*** temp mentioned in the insn .... */
       for (j = 0; j < k; j++) {

          /* First check if the temp is mentioned again later; if so,
             ignore this mention.  We only want to process each temp
             used by the insn once, even if it is mentioned more than
             once. */
          defer = False;
          tno = tempUse[j];
          for (m = j+1; m < k; m++)
             if (tempUse[m] == tno)
                defer = True;
          if (defer)
             continue;

          /* Now we're trying to find a register for tempUse[j].
             First of all, if it already has a register assigned, we
             don't need to do anything more. */
          if (temp_info[tno].real_no != VG_NOTHING)
             continue;

          /* No luck.  The next thing to do is see if there is a
             currently unassigned register available.  If so, bag it. */
          for (r = 0; r < VG_MAX_REALREGS; r++) {
             if (real_to_temp[r] == VG_NOTHING)
                break;
          }
          if (r < VG_MAX_REALREGS) {
             real_to_temp[r]        = tno;
             temp_info[tno].real_no = r;
             continue;
          }

          /* Unfortunately, that didn't pan out either.  So we'll have
             to eject some other unfortunate TempReg into a spill slot
             in order to free up a register.  Of course, we need to be
             careful not to eject some other TempReg needed by this
             insn.

             Select r in 0 .. VG_MAX_REALREGS-1 such that
             real_to_temp[r] is not mentioned in
             tempUse[0 .. k-1], since it would be just plain
             wrong to eject some other TempReg which we need to use in
             this insn.

             It is here that it is important to make a good choice of
             register to spill.  */

          /* First, mark those regs which are not spill candidates. */
          for (r = 0; r < VG_MAX_REALREGS; r++) {
             is_spill_cand[r] = True;
             for (m = 0; m < k; m++) {
                if (real_to_temp[r] == tempUse[m]) {
                   is_spill_cand[r] = False;
                   break;
                }
             }
          }

          /* We can choose any r satisfying is_spill_cand[r].  However,
             try to make a good choice.  First, try and find r such
             that the associated TempReg is already dead. */
          for (r = 0; r < VG_MAX_REALREGS; r++) {
             if (is_spill_cand[r] &&
                 temp_info[real_to_temp[r]].dead_before <= i)
                goto have_spill_cand;
          }

          /* No spill cand is mapped to a dead TempReg.  Now we really
            _do_ have to generate spill code.  Choose r so that the
            next use of its associated TempReg is as far ahead as
            possible, in the hope that this will minimise the number of
            consequent reloads required.  This is a bit expensive, but
            we don't have to do it very often. */
          {
             Int furthest_r = VG_MAX_REALREGS;
             Int furthest = 0;
             for (r = 0; r < VG_MAX_REALREGS; r++) {
                if (!is_spill_cand[r]) continue;
                for (m = i+1; m < c1->used; m++)
                   if (uInstrMentionsTempReg(&c1->instrs[m],
                                             real_to_temp[r]))
                      break;
                if (m > furthest) {
                   furthest   = m;
                   furthest_r = r;
                }
             }
             r = furthest_r;
             goto have_spill_cand;
          }

          have_spill_cand:
          if (r == VG_MAX_REALREGS)
             VG_(core_panic)("new reg alloc: out of registers ?!");

          /* Eject r.  Important refinement: don't bother if the
             associated TempReg is now dead. */
          vg_assert(real_to_temp[r] != VG_NOTHING);
          vg_assert(real_to_temp[r] != tno);
          temp_info[real_to_temp[r]].real_no = VG_NOTHING;
          if (temp_info[real_to_temp[r]].dead_before > i) {
             uInstr2(c2, PUT, 4,
                         RealReg, VG_(rank_to_realreg)(r),
                         SpillNo, temp_info[real_to_temp[r]].spill_no);
             VG_(uinstrs_spill)++;
             spill_reqd = True;
             if (dis)
                VG_(pp_UInstr)(c2->used-1, &LAST_UINSTR(c2));
          }

          /* Decide if tno is read. */
          isRead = False;
          for (m = 0; m < k; m++)
             if (tempUse[m] == tno && !isWrites[m])
                isRead = True;

          /* If so, generate a spill load. */
          if (isRead) {
             uInstr2(c2, GET, 4,
                         SpillNo, temp_info[tno].spill_no,
                         RealReg, VG_(rank_to_realreg)(r) );
             VG_(uinstrs_spill)++;
             spill_reqd = True;
             if (dis)
                VG_(pp_UInstr)(c2->used-1, &LAST_UINSTR(c2));
          }

          /* Update the forwards and backwards maps. */
          real_to_temp[r]        = tno;
          temp_info[tno].real_no = r;
       }

       /* By this point, all TempRegs mentioned by the insn have been
          bought into real regs.  We now copy the insn to the output
          and use patchUInstr to convert its rTempRegs into
          realregs. */
       for (j = 0; j < k; j++)
          realUse[j] = VG_(rank_to_realreg)(temp_info[tempUse[j]].real_no);
       VG_(copy_UInstr)(c2, &c1->instrs[i]);
       patchUInstr(&LAST_UINSTR(c2), &tempUse[0], &realUse[0], k);

       if (dis) {
          VG_(pp_UInstr)(c2->used-1, &LAST_UINSTR(c2));
          VG_(printf)("\n");
       }
    }

    if (temp_info != NULL)
       VG_(arena_free)(VG_AR_JITTER, temp_info);

    VG_(free_UCodeBlock)(c1);

    if (spill_reqd)
       VG_(translations_needing_spill)++;

    return c2;

 #  undef VG_NOTHING

 }

 /* Analysis records liveness of all general-use RealRegs in the UCode. */
 static void vg_realreg_liveness_analysis ( UCodeBlock* cb )
 {
    Int      i, j, k;
    RRegSet  rregs_live;
    Int      regUse[VG_MAX_REGS_USED];
    Bool     isWrites[VG_MAX_REGS_USED];
    UInstr*  u;

    /* All regs are dead at the end of the block */
    rregs_live = ALL_RREGS_DEAD;

    for (i = cb->used-1; i >= 0; i--) {
       u = &cb->instrs[i];

       u->regs_live_after = rregs_live;

       k = VG_(get_reg_usage)(u, RealReg, &regUse[0], &isWrites[0]);

       /* For each reg usage ... bwds in program order.  Variable is live
          before this UInstr if it is read by this UInstr.
          Note that regUse[j] holds the Intel reg number, so we must
          convert it to our rank number.  */
       for (j = k-1; j >= 0; j--) {
          SET_RREG_LIVENESS ( VG_(realreg_to_rank)(regUse[j]),
                              rregs_live,
                              !isWrites[j] );
       }
    }
 }

 /*------------------------------------------------------------*/
 /*--- Main entry point for the JITter.                     ---*/
 /*------------------------------------------------------------*/

 /* Translate the basic block beginning at orig_addr, placing the
    translation in a vg_malloc'd block, the address and size of which
    are returned in trans_addr and trans_size.  Length of the original
    block is also returned in orig_size.  If the latter three are NULL,
    this call is being done for debugging purposes, in which case (a)
    throw away the translation once it is made, and (b) produce a load
    of debugging output.

    'tst' is the identity of the thread needing this block.
 */
 void VG_(translate) ( /*IN*/  ThreadId tid,
 		      /*IN*/  Addr  orig_addr,
                       /*OUT*/ UInt* orig_size,
                       /*OUT*/ Addr* trans_addr,
                       /*OUT*/ UInt* trans_size,
 		      /*OUT*/ UShort jumps[VG_MAX_JUMPS])
 {
    Int         n_disassembled_bytes, final_code_size;
    Bool        debugging_translation;
    UChar*      final_code;
    UCodeBlock* cb;
    Bool        notrace_until_done;
    UInt        notrace_until_limit = 0;
    Segment     *seg;
    Addr		redir;

    VGP_PUSHCC(VgpTranslate);
    debugging_translation
       = orig_size == NULL || trans_addr == NULL || trans_size == NULL;

    /* Look in the code redirect table to see if we should
       translate an alternative address for orig_addr. */
    redir = VG_(code_redirect)(orig_addr);

    if (redir != orig_addr && VG_(clo_verbosity) >= 2)
       VG_(message)(Vg_UserMsg,
 		   "TRANSLATE: %p redirected to %p",
 		   orig_addr,
 		   redir );
    orig_addr = redir;

    /* If codegen tracing, don't start tracing until
       notrace_until_limit blocks have gone by.  This avoids printing
       huge amounts of useless junk when all we want to see is the last
       few blocks translated prior to a failure.  Set
       notrace_until_limit to be the number of translations to be made
       before --trace-codegen= style printing takes effect. */
    notrace_until_done
       = VG_(overall_in_count) >= notrace_until_limit;

    seg = VG_(find_segment)(orig_addr);

    if (!debugging_translation)
       VG_TRACK( pre_mem_read, Vg_CoreTranslate, tid, "", orig_addr, 1 );

    if (seg == NULL ||
        !VG_(seg_contains)(seg, orig_addr, 1) ||
        (seg->prot & (VKI_PROT_READ|VKI_PROT_EXEC)) == 0) {
       /* Code address is bad - deliver a signal instead */
       vg_assert(!VG_(is_addressable)(orig_addr, 1));

       if (seg != NULL && VG_(seg_contains)(seg, orig_addr, 1)) {
 	 vg_assert((seg->prot & VKI_PROT_EXEC) == 0);
 	 VG_(synth_fault_perms)(tid, orig_addr);
       } else
 	 VG_(synth_fault_mapping)(tid, orig_addr);

       return;
    } else
       seg->flags |= SF_CODE;	/* contains cached code */

    cb = VG_(alloc_UCodeBlock)();
    cb->orig_eip = orig_addr;

    /* If doing any code printing, print a basic block start marker */
    if (VG_(clo_trace_codegen) && notrace_until_done) {
       Char fnname[64] = "";
       VG_(get_fnname_if_entry)(orig_addr, fnname, 64);
       VG_(printf)(
               "==== BB %d %s(%p) in %dB, out %dB, BBs exec'd %llu ====\n\n",
               VG_(overall_in_count), fnname, orig_addr,
               VG_(overall_in_osize), VG_(overall_in_tsize),
               VG_(bbs_done));
    }

    /* True if a debug trans., or if bit N set in VG_(clo_trace_codegen). */
 #  define DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(n)               \
       ( debugging_translation                                   \
         || (notrace_until_done                                  \
             && (VG_(clo_trace_codegen) & (1 << (n-1))) ))

    /* Disassemble this basic block into cb. */
    VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(1);
    VGP_PUSHCC(VgpToUCode);
    n_disassembled_bytes = VG_(disBB) ( cb, orig_addr );
    VGP_POPCC(VgpToUCode);

    /* Try and improve the code a bit. */
    if (VG_(clo_optimise)) {
       VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(2);
       VGP_PUSHCC(VgpImprove);
       vg_improve ( cb );
       VGP_POPCC(VgpImprove);
    }

    /* Skin's instrumentation (Nb: must set VG_(print_codegen) in case
       SK_(instrument) looks at it. */
    VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(3);
    VGP_PUSHCC(VgpInstrument);
    cb = SK_(instrument) ( cb, orig_addr );
    if (VG_(print_codegen))
       VG_(pp_UCodeBlock) ( cb, "Instrumented UCode:" );
    VG_(saneUCodeBlock)( cb );
    VGP_POPCC(VgpInstrument);

    /* Add %ESP-update hooks if the skin requires them */
    /* Nb: We don't print out this phase, because it doesn't do much */
    if (VG_(need_to_handle_esp_assignment)()) {
       VGP_PUSHCC(VgpESPUpdate);
       cb = vg_ESP_update_pass ( cb );
       VGP_POPCC(VgpESPUpdate);
    }

    /* Allocate registers. */
    VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(4);
    VGP_PUSHCC(VgpRegAlloc);
    cb = vg_do_register_allocation ( cb );
    VGP_POPCC(VgpRegAlloc);

    /* Do post reg-alloc %e[acd]x liveness analysis (too boring to print
     * anything;  results can be seen when emitting final code). */
    VGP_PUSHCC(VgpLiveness);
    vg_realreg_liveness_analysis ( cb );
    VGP_POPCC(VgpLiveness);

    /* Emit final code */
    VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(5);

    VGP_PUSHCC(VgpFromUcode);
    final_code = VG_(emit_code)(cb, &final_code_size, jumps );
    VGP_POPCC(VgpFromUcode);
    VG_(free_UCodeBlock)(cb);

 #undef DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE

    if (debugging_translation) {
       /* Only done for debugging -- throw away final result. */
       VG_(arena_free)(VG_AR_JITTER, final_code);
    } else {
       /* Doing it for real -- return values to caller. */
       *orig_size = n_disassembled_bytes;
       *trans_addr = (Addr)final_code;
       *trans_size = final_code_size;
    }
    VGP_POPCC(VgpTranslate);
 }

 /*--------------------------------------------------------------------*/
 /*--- end                                           vg_translate.c ---*/
 /*--------------------------------------------------------------------*/