/*---------------------------------------------------------------*/
/*--- ---*/
/*--- This file (guest-x86/ghelpers.c) is ---*/
/*--- Copyright (c) 2004 OpenWorks LLP. All rights reserved. ---*/
/*--- ---*/
/*---------------------------------------------------------------*/
/*
This file is part of LibVEX, a library for dynamic binary
instrumentation and translation.
Copyright (C) 2004 OpenWorks, LLP.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; Version 2 dated June 1991 of the
license.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or liability
for damages. See the GNU General Public License for more details.
Neither the names of the U.S. Department of Energy nor the
University of California nor the names of its contributors may be
used to endorse or promote products derived from this software
without prior written permission.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA.
*/
#include "libvex_basictypes.h"
#include "libvex_guest_x86.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "main/vex_util.h"
#include "guest-x86/gdefs.h"
/* This file contains helper functions for x86 guest code.
Calls to these functions are generated by the back end.
These calls are of course in the host machine code and
this file will be compiled to host machine code, so that
all makes sense.
Only change the signatures of these helper functions very
carefully. If you change the signature here, you'll have to change
the parameters passed to it in the IR calls constructed by
guest-x86/toIR.c.
Some of this code/logic is derived from QEMU, which is copyright
Fabrice Bellard, licensed under the LGPL. It is used with
permission.
*/
/* Set to 1 to get detailed profiling info about use of the flag
machinery. */
#define PROFILE_EFLAGS 0
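/* Maps each byte value to X86G_CC_MASK_P if it contains an even
   number of 1 bits (the x86 parity condition for PF), and to 0
   otherwise. */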
static const UChar parity_table[256] = {
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
};
/* Shift left by n if n >= 0, otherwise shift right by -n.
   n must be a constant to be efficient */
inline static Int lshift ( Int x, Int n )
{
if (n >= 0)
return x << n;
else
return x >> (-n);
}
#define PREAMBLE(__data_bits) \
/* const */ UInt DATA_MASK \
= __data_bits==8 ? 0xFF \
: (__data_bits==16 ? 0xFFFF \
: 0xFFFFFFFF); \
/* const */ UInt SIGN_MASK = 1 << (__data_bits - 1); \
/* const */ UInt CC_DEP1 = cc_dep1_formal; \
/* const */ UInt CC_DEP2 = cc_dep2_formal; \
/* const */ UInt CC_NDEP = cc_ndep_formal; \
/* Four bogus assignments, which hopefully gcc can */ \
/* optimise away, and which stop it complaining about */ \
/* unused variables. */ \
SIGN_MASK = SIGN_MASK; \
DATA_MASK = DATA_MASK; \
CC_DEP2 = CC_DEP2; \
CC_NDEP = CC_NDEP;
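/* The ACTIONS_* macros below reconstruct %eflags from the thunk.  The
   magic constants correspond to the EFLAGS bit positions: C is bit 0,
   P bit 2, A bit 4, Z bit 6, S bit 7 and O bit 11 -- hence the
   "& 0x10", "<< 6", "& 0x80" and "<< 11" idioms.  Overflow for
   add/sub uses the standard trick: for an addition, signed overflow
   occurred iff the operands have the same sign and the result's sign
   differs, which is exactly (argL ^ argR ^ -1) & (argL ^ res)
   restricted to the sign bit (which lshift then moves to bit 11). */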
/*-------------------------------------------------------------*/
#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
Int argL, argR, res; \
argL = CC_DEP1; \
argR = CC_DEP2; \
res = argL + argR; \
cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
pf = parity_table[(UChar)res]; \
af = (res ^ argL ^ argR) & 0x10; \
zf = ((DATA_UTYPE)res == 0) << 6; \
sf = lshift(res, 8 - DATA_BITS) & 0x80; \
of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
12 - DATA_BITS) & X86G_CC_MASK_O; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
Int argL, argR, res; \
argL = CC_DEP1; \
argR = CC_DEP2; \
res = argL - argR; \
cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
pf = parity_table[(UChar)res]; \
af = (res ^ argL ^ argR) & 0x10; \
zf = ((DATA_UTYPE)res == 0) << 6; \
sf = lshift(res, 8 - DATA_BITS) & 0x80; \
of = lshift((argL ^ argR) & (argL ^ res), \
12 - DATA_BITS) & X86G_CC_MASK_O; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
Int argL, argR, oldC, res; \
oldC = CC_NDEP & X86G_CC_MASK_C; \
argL = CC_DEP1; \
argR = CC_DEP2 ^ oldC; \
res = (argL + argR) + oldC; \
if (oldC) \
cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
else \
cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
pf = parity_table[(UChar)res]; \
af = (res ^ argL ^ argR) & 0x10; \
zf = ((DATA_UTYPE)res == 0) << 6; \
sf = lshift(res, 8 - DATA_BITS) & 0x80; \
of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
12 - DATA_BITS) & X86G_CC_MASK_O; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
Int argL, argR, oldC, res; \
oldC = CC_NDEP & X86G_CC_MASK_C; \
argL = CC_DEP1; \
argR = CC_DEP2 ^ oldC; \
res = (argL - argR) - oldC; \
if (oldC) \
cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
else \
cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
pf = parity_table[(UChar)res]; \
af = (res ^ argL ^ argR) & 0x10; \
zf = ((DATA_UTYPE)res == 0) << 6; \
sf = lshift(res, 8 - DATA_BITS) & 0x80; \
of = lshift((argL ^ argR) & (argL ^ res), \
12 - DATA_BITS) & X86G_CC_MASK_O; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
cf = 0; \
pf = parity_table[(UChar)CC_DEP1]; \
af = 0; \
zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
of = 0; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
Int argL, argR, res; \
res = CC_DEP1; \
argL = res - 1; \
argR = 1; \
cf = CC_NDEP & X86G_CC_MASK_C; \
pf = parity_table[(UChar)res]; \
af = (res ^ argL ^ argR) & 0x10; \
zf = ((DATA_UTYPE)res == 0) << 6; \
sf = lshift(res, 8 - DATA_BITS) & 0x80; \
of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
Int argL, argR, res; \
res = CC_DEP1; \
argL = res + 1; \
argR = 1; \
cf = CC_NDEP & X86G_CC_MASK_C; \
pf = parity_table[(UChar)res]; \
af = (res ^ argL ^ argR) & 0x10; \
zf = ((DATA_UTYPE)res == 0) << 6; \
sf = lshift(res, 8 - DATA_BITS) & 0x80; \
of = ((res & DATA_MASK) \
== ((UInt)SIGN_MASK - 1)) << 11; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
cf = (CC_DEP2 >> (DATA_BITS - 1)) & X86G_CC_MASK_C; \
pf = parity_table[(UChar)CC_DEP1]; \
af = 0; /* undefined */ \
zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
/* of is defined if shift count == 1 */ \
of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
& X86G_CC_MASK_O; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
cf = CC_DEP2 & 1; \
pf = parity_table[(UChar)CC_DEP1]; \
af = 0; /* undefined */ \
zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
/* of is defined if shift count == 1 */ \
of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
& X86G_CC_MASK_O; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
/* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int fl \
= (CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C)) \
| (X86G_CC_MASK_C & CC_DEP1) \
| (X86G_CC_MASK_O & (lshift(CC_DEP1, \
11-(DATA_BITS-1)) \
^ lshift(CC_DEP1, 11))); \
return fl; \
} \
}
/*-------------------------------------------------------------*/
/* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int fl \
= (CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C)) \
| (X86G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
| (X86G_CC_MASK_O & (lshift(CC_DEP1, \
11-(DATA_BITS-1)) \
^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
return fl; \
} \
}
/*-------------------------------------------------------------*/
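/* For the multiply cases, CF and OF are both set iff the high half of
   the double-length product is significant: nonzero for an unsigned
   multiply, or not simply the sign-extension of the low half for a
   signed multiply.  The remaining flags are set from the low half. */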
#define ACTIONS_UMUL(DATA_BITS,DATA_UTYPE,DATA_U2TYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
DATA_UTYPE hi; \
DATA_UTYPE lo = ((DATA_UTYPE)CC_DEP1) \
* ((DATA_UTYPE)CC_DEP2); \
DATA_U2TYPE rr = ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
* ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)); \
hi = (DATA_UTYPE)(rr >>/*u*/ DATA_BITS); \
cf = (hi != 0); \
pf = parity_table[(UChar)lo]; \
af = 0; /* undefined */ \
zf = (lo == 0) << 6; \
sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
of = cf << 11; \
return cf | pf | af | zf | sf | of; \
} \
}
/*-------------------------------------------------------------*/
#define ACTIONS_SMUL(DATA_BITS,DATA_STYPE,DATA_S2TYPE) \
{ \
PREAMBLE(DATA_BITS); \
{ Int cf, pf, af, zf, sf, of; \
DATA_STYPE hi; \
DATA_STYPE lo = ((DATA_STYPE)CC_DEP1) \
* ((DATA_STYPE)CC_DEP2); \
DATA_S2TYPE rr = ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
* ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)); \
hi = (DATA_STYPE)(rr >>/*s*/ DATA_BITS); \
cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
pf = parity_table[(UChar)lo]; \
af = 0; /* undefined */ \
zf = (lo == 0) << 6; \
sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
of = cf << 11; \
return cf | pf | af | zf | sf | of; \
} \
}
#if PROFILE_EFLAGS
static UInt tabc[X86G_CC_OP_NUMBER];
static UInt tab[X86G_CC_OP_NUMBER][16];
static Bool initted = False;
static UInt n_calc_cond = 0;
static UInt n_calc_all = 0;
static UInt n_calc_c = 0;
static void showCounts ( void )
{
Int op, co;
Char ch;
vex_printf("\nALL=%d COND=%d C=%d\n",
n_calc_all-n_calc_cond-n_calc_c, n_calc_cond, n_calc_c);
vex_printf(" CARRY O NO B NB Z NZ BE NBE"
" S NS P NP L NL LE NLE\n");
vex_printf(" ----------------------------------------------"
"----------------------------------------\n");
for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
ch = ' ';
if (op > 0 && (op-1) % 3 == 0)
ch = 'B';
if (op > 0 && (op-1) % 3 == 1)
ch = 'W';
if (op > 0 && (op-1) % 3 == 2)
ch = 'L';
vex_printf("%2d%c: ", op, ch);
vex_printf("%6d ", tabc[op]);
for (co = 0; co < 16; co++) {
Int n = tab[op][co];
if (n >= 1000) {
vex_printf(" %3dK", n / 1000);
} else
if (n >= 0) {
vex_printf(" %3d ", n );
} else {
vex_printf(" ");
}
}
vex_printf("\n");
}
vex_printf("\n");
}
static void initCounts ( void )
{
Int op, co;
initted = True;
for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
tabc[op] = 0;
for (co = 0; co < 16; co++)
tab[op][co] = 0;
}
}
#endif /* PROFILE_EFLAGS */
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all the 6 flags from the supplied thunk parameters. */
UInt x86g_calculate_eflags_all ( UInt cc_op,
UInt cc_dep1_formal,
UInt cc_dep2_formal,
UInt cc_ndep_formal )
{
# if PROFILE_EFLAGS
n_calc_all++;
# endif
switch (cc_op) {
case X86G_CC_OP_COPY:
return cc_dep1_formal
& (X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z
| X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P);
case X86G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
case X86G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
case X86G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
case X86G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
case X86G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
case X86G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
case X86G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
case X86G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
case X86G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
case X86G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
case X86G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
case X86G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
case X86G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
case X86G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
case X86G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
case X86G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
case X86G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
case X86G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
case X86G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
case X86G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
case X86G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
case X86G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
case X86G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
case X86G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
case X86G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
case X86G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
case X86G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
case X86G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
case X86G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
case X86G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
case X86G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
case X86G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
case X86G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
case X86G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, UShort );
case X86G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, UInt );
case X86G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, ULong );
case X86G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, Short );
case X86G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, Int );
case X86G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, Long );
default:
/* shouldn't really make these calls from generated code */
vex_printf("calculate_eflags_all(X86)( %d, 0x%x, 0x%x, 0x%x )\n",
cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
vpanic("calculate_eflags_all(X86)");
}
}
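/* Worked example (for illustration only): for the 8-bit addition
   0x7F + 0x01 the thunk is (X86G_CC_OP_ADDB, 0x7F, 0x01, 0), and
   x86g_calculate_eflags_all returns 0x890, i.e. A (0x10), S (0x80)
   and O (0x800) set, with C, Z and P clear -- exactly what a real
   ADD of those operands produces. */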
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters. */
UInt x86g_calculate_eflags_c ( UInt cc_op,
UInt cc_dep1,
UInt cc_dep2,
UInt cc_ndep )
{
/* Fast-case some common ones. */
switch (cc_op) {
case X86G_CC_OP_LOGICL:
case X86G_CC_OP_LOGICW:
case X86G_CC_OP_LOGICB:
return 0;
case X86G_CC_OP_SUBL:
return ((UInt)cc_dep1) < ((UInt)cc_dep2)
? X86G_CC_MASK_C : 0;
#if 0
case X86G_CC_OP_SUBB:
return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
? X86G_CC_MASK_C : 0;
#endif
#if 0
case X86G_CC_OP_DECL:
return cc_src;
case X86G_CC_OP_ADDL:
return ( ((UInt)cc_src + (UInt)cc_dst) < ((UInt)cc_src) )
? X86G_CC_MASK_C : 0;
case X86G_CC_OP_SUBB:
return ( ((UInt)(cc_src & 0xFF)) > ((UInt)(cc_dst & 0xFF)) )
? X86G_CC_MASK_C : 0;
#endif
default:
break;
}
# if PROFILE_EFLAGS
if (!initted)
initCounts();
tabc[cc_op]++;
n_calc_c++;
# endif
return x86g_calculate_eflags_all(cc_op,cc_dep1,cc_dep2,cc_ndep)
& X86G_CC_MASK_C;
}
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* returns 1 or 0 */
UInt x86g_calculate_condition ( UInt/*X86Condcode*/ cond,
UInt cc_op,
UInt cc_dep1,
UInt cc_dep2,
UInt cc_ndep )
{
UInt eflags = x86g_calculate_eflags_all(cc_op, cc_dep1,
cc_dep2, cc_ndep);
UInt of,sf,zf,cf,pf;
UInt inv = cond & 1;
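   /* Condition codes come in complementary pairs: bit 0 of 'cond'
      selects the negated sense, so e.g. X86CondNO shares a case with
      X86CondO below and the result is simply flipped via 'inv'. */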
# if PROFILE_EFLAGS
if (!initted)
initCounts();
tab[cc_op][cond]++;
n_calc_cond++;
if (0 == ((n_calc_all+n_calc_c) & 0x7FFFF)) showCounts();
# endif
switch (cond) {
case X86CondNO:
case X86CondO: /* OF == 1 */
of = eflags >> X86G_CC_SHIFT_O;
return 1 & (inv ^ of);
case X86CondNZ:
case X86CondZ: /* ZF == 1 */
zf = eflags >> X86G_CC_SHIFT_Z;
return 1 & (inv ^ zf);
case X86CondNB:
case X86CondB: /* CF == 1 */
cf = eflags >> X86G_CC_SHIFT_C;
return 1 & (inv ^ cf);
break;
case X86CondNBE:
case X86CondBE: /* (CF or ZF) == 1 */
cf = eflags >> X86G_CC_SHIFT_C;
zf = eflags >> X86G_CC_SHIFT_Z;
return 1 & (inv ^ (cf | zf));
break;
case X86CondNS:
case X86CondS: /* SF == 1 */
sf = eflags >> X86G_CC_SHIFT_S;
return 1 & (inv ^ sf);
case X86CondNP:
case X86CondP: /* PF == 1 */
pf = eflags >> X86G_CC_SHIFT_P;
return 1 & (inv ^ pf);
case X86CondNL:
case X86CondL: /* (SF xor OF) == 1 */
sf = eflags >> X86G_CC_SHIFT_S;
of = eflags >> X86G_CC_SHIFT_O;
return 1 & (inv ^ (sf ^ of));
break;
case X86CondNLE:
case X86CondLE: /* ((SF xor OF) or ZF) == 1 */
sf = eflags >> X86G_CC_SHIFT_S;
of = eflags >> X86G_CC_SHIFT_O;
zf = eflags >> X86G_CC_SHIFT_Z;
return 1 & (inv ^ ((sf ^ of) | zf));
break;
default:
/* shouldn't really make these calls from generated code */
vex_printf("calculate_condition( %d, %d, 0x%x, 0x%x, 0x%x )\n",
cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
vpanic("calculate_condition");
}
}
/* Used by the optimiser to try specialisations. Returns an
equivalent expression, or NULL if none. */
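/* True iff expression 'e' is an IR constant of type Ico_U32 with
   value 'n'. */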
static Bool isU32 ( IRExpr* e, UInt n )
{
return e->tag == Iex_Const
&& e->Iex.Const.con->tag == Ico_U32
&& e->Iex.Const.con->Ico.U32 == n;
}
IRExpr* guest_x86_spechelper ( Char* function_name,
IRExpr** args )
{
# define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
# define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
# define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
Int i, arity = 0;
for (i = 0; args[i]; i++)
arity++;
# if 0
vex_printf("spec request:\n");
vex_printf(" %s ", function_name);
for (i = 0; i < arity; i++) {
vex_printf(" ");
ppIRExpr(args[i]);
}
vex_printf("\n");
# endif
/* --------- specialising "calculate_eflags_c" --------- */
if (vex_streq(function_name, "calculate_eflags_c")) {
/* specialise calls to above "calculate_eflags_c" function */
IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
vassert(arity == 4);
cc_op = args[0];
cc_dep1 = args[1];
cc_dep2 = args[2];
cc_ndep = args[3];
if (isU32(cc_op, X86G_CC_OP_SUBL)) {
/* C after sub denotes unsigned less than */
return unop(Iop_1Uto32,
binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
}
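      /* For example (illustrative): a 32-bit CMP followed by a read of
         the carry flag becomes a single unsigned comparison of the two
         operands, instead of a call to the helper above. */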
if (isU32(cc_op, X86G_CC_OP_LOGICL)) {
/* cflag after logic is zero */
return mkU32(0);
}
if (isU32(cc_op, X86G_CC_OP_DECL) || isU32(cc_op, X86G_CC_OP_INCL)) {
/* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
return cc_ndep;
}
# if 0
if (cc_op->tag == Iex_Const) {
vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
}
# endif
return NULL;
}
/* --------- specialising "calculate_condition" --------- */
if (vex_streq(function_name, "calculate_condition")) {
/* specialise calls to above "calculate condition" function */
IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
vassert(arity == 5);
cond = args[0];
cc_op = args[1];
cc_dep1 = args[2];
cc_dep2 = args[3];
cc_ndep = args[4];
/*---------------- ADDL ----------------*/
if (isU32(cc_op, X86G_CC_OP_ADDL) && isU32(cond, X86CondZ)) {
/* long add, then Z --> test (dst+src == 0) */
return unop(Iop_1Uto32,
binop(Iop_CmpEQ32,
binop(Iop_Add32, cc_dep1, cc_dep2),
mkU32(0)));
}
/*---------------- SUBL ----------------*/
if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondZ)) {
/* long sub/cmp, then Z --> test dst==src */
return unop(Iop_1Uto32,
binop(Iop_CmpEQ32, cc_dep1, cc_dep2));
}
if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondL)) {
/* long sub/cmp, then L (signed less than)
--> test dst <s src */
return unop(Iop_1Uto32,
binop(Iop_CmpLT32S, cc_dep1, cc_dep2));
}
if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondLE)) {
/* long sub/cmp, then LE (signed less than or equal)
--> test dst <=s src */
return unop(Iop_1Uto32,
binop(Iop_CmpLE32S, cc_dep1, cc_dep2));
}
if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondBE)) {
/* long sub/cmp, then BE (unsigned less than or equal)
--> test dst <=u src */
return unop(Iop_1Uto32,
binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
}
#if 0
if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondB)) {
/* long sub/cmp, then B (unsigned less than)
--> test dst <u src */
return unop(Iop_1Uto32,
binop(Iop_CmpLT32U, cc_dst, cc_src));
}
#endif
/*---------------- SUBW ----------------*/
if (isU32(cc_op, X86G_CC_OP_SUBW) && isU32(cond, X86CondZ)) {
      /* word sub/cmp, then Z --> test dst==src */
return unop(Iop_1Uto32,
binop(Iop_CmpEQ16,
unop(Iop_32to16,cc_dep1),
unop(Iop_32to16,cc_dep2)));
}
/*---------------- SUBB ----------------*/
if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondZ)) {
/* byte sub/cmp, then Z --> test dst==src */
return unop(Iop_1Uto32,
binop(Iop_CmpEQ8,
unop(Iop_32to8,cc_dep1),
unop(Iop_32to8,cc_dep2)));
}
if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNZ)) {
/* byte sub/cmp, then NZ --> test dst!=src */
return unop(Iop_1Uto32,
binop(Iop_CmpNE8,
unop(Iop_32to8,cc_dep1),
unop(Iop_32to8,cc_dep2)));
}
if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNBE)) {
      /* byte sub/cmp, then NBE (unsigned greater than)
         --> test src <u dst */
/* Note, args are opposite way round from the usual */
return unop(Iop_1Uto32,
binop(Iop_CmpLT32U,
binop(Iop_And32,cc_dep2,mkU32(0xFF)),
binop(Iop_And32,cc_dep1,mkU32(0xFF))));
}
/*---------------- LOGICL ----------------*/
if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondZ)) {
/* long and/or/xor, then Z --> test dst==0 */
return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
}
if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondS)) {
/* long and/or/xor, then S --> test dst <s 0 */
return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
}
if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondLE)) {
/* long and/or/xor, then LE
This is pretty subtle. LOGIC sets SF and ZF according to the
         result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
         OF is zero, so this reduces to SF | ZF -- which will be 1 iff
the result is <=signed 0. Hence ...
*/
return unop(Iop_1Uto32,binop(Iop_CmpLE32S, cc_dep1, mkU32(0)));
}
/*---------------- LOGICW ----------------*/
if (isU32(cc_op, X86G_CC_OP_LOGICW) && isU32(cond, X86CondZ)) {
      /* word and/or/xor, then Z --> test dst==0 */
return unop(Iop_1Uto32,
binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(0xFFFF)),
mkU32(0)));
}
/*---------------- LOGICB ----------------*/
if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondZ)) {
/* byte and/or/xor, then Z --> test dst==0 */
return unop(Iop_1Uto32,
binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(255)),
mkU32(0)));
}
/*---------------- DECL ----------------*/
if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondZ)) {
/* dec L, then Z --> test dst == 0 */
return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
}
if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondS)) {
/* dec L, then S --> compare DST <s 0 */
return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
}
/*---------------- SHRL ----------------*/
if (isU32(cc_op, X86G_CC_OP_SHRL) && isU32(cond, X86CondZ)) {
/* SHRL, then Z --> test dep1 == 0 */
return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
}
return NULL;
}
# undef unop
# undef binop
# undef mkU32
return NULL;
}
/*-----------------------------------------------------------*/
/*--- Utility functions for x87 FPU conversions. ---*/
/*-----------------------------------------------------------*/
/* 80 and 64-bit floating point formats:
80-bit:
S 0 0-------0 zero
S 0 0X------X denormals
S 1-7FFE 1X------X normals (all normals have leading 1)
S 7FFF 10------0 infinity
S 7FFF 10X-----X snan
S 7FFF 11X-----X qnan
S is the sign bit. For runs X----X, at least one of the Xs must be
nonzero. Exponent is 15 bits, fractional part is 63 bits, and
there is an explicitly represented leading 1, and a sign bit,
giving 80 in total.
64-bit avoids the confusion of an explicitly represented leading 1
and so is simpler:
S 0 0------0 zero
S 0 X------X denormals
S 1-7FE any normals
S 7FF 0------0 infinity
S 7FF 0X-----X snan
S 7FF 1X-----X qnan
Exponent is 11 bits, fractional part is 52 bits, and there is a
sign bit, giving 64 in total.
*/
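/* For instance (illustrative): 1.0 is 0x3FF0000000000000 as a 64-bit
   double (sign 0, exponent 0x3FF, fraction 0) and
   0x3FFF8000000000000000 as an 80-bit extended double (sign 0,
   exponent 0x3FFF, explicit leading 1, fraction 0). */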
static inline Bool host_is_little_endian ( void )
{
UInt x = 0x76543210;
UChar* p = (UChar*)(&x);
return (*p == 0x10);
}
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
UInt x86g_calculate_FXAM ( UInt tag, ULong dbl )
{
Bool mantissaIsZero;
Int bexp;
UChar sign;
UInt c1;
UChar* f64;
if (!host_is_little_endian()) {
vassert(0);
}
/* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
f64 = (UChar*)(&dbl);
   sign = (f64[7] >> 7) & 1;
   c1   = ((UInt)sign) << 9;  /* FXAM reports the operand's sign in C1 (bit 9) */
/* First off, if the tag indicates the register was empty,
return 1,0,sign,1 */
if (tag == 0) {
/* vex_printf("Empty\n"); */
      return X86G_FC_MASK_C3 | 0 | c1 | X86G_FC_MASK_C0;
}
bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
bexp &= 0x7FF;
mantissaIsZero
= (f64[6] & 0x0F) == 0
&& (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0;
/* If both exponent and mantissa are zero, the value is zero.
Return 1,0,sign,0. */
if (bexp == 0 && mantissaIsZero) {
/* vex_printf("Zero\n"); */
      return X86G_FC_MASK_C3 | 0 | c1 | 0;
}
/* If exponent is zero but mantissa isn't, it's a denormal.
Return 1,1,sign,0. */
if (bexp == 0 && !mantissaIsZero) {
/* vex_printf("Denormal\n"); */
      return X86G_FC_MASK_C3 | X86G_FC_MASK_C2 | c1 | 0;
}
/* If the exponent is 7FF and the mantissa is zero, this is an infinity.
Return 0,1,sign,1. */
if (bexp == 0x7FF && mantissaIsZero) {
/* vex_printf("Inf\n"); */
      return 0 | X86G_FC_MASK_C2 | c1 | X86G_FC_MASK_C0;
}
/* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
Return 0,0,sign,1. */
if (bexp == 0x7FF && !mantissaIsZero) {
/* vex_printf("NaN\n"); */
      return 0 | 0 | c1 | X86G_FC_MASK_C0;
}
/* Uh, ok, we give up. It must be a normal finite number.
Return 0,1,sign,0.
*/
/* vex_printf("normal\n"); */
      return 0 | X86G_FC_MASK_C2 | c1 | 0;
}
/////////////////////////////////////////////////////////////////
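/* Read/write single bits of a little-endian byte array: bit n lives
   in byte n/8, at position n%8 within that byte (LSB first). */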
static inline
UInt read_bit_array ( UChar* arr, UInt n )
{
UChar c = arr[n >> 3];
c >>= (n&7);
return c & 1;
}
static inline
void write_bit_array ( UChar* arr, UInt n, UInt b )
{
UChar c = arr[n >> 3];
c &= ~(1 << (n&7));
c |= ((b&1) << (n&7));
arr[n >> 3] = c;
}
/* Convert an IEEE754 double (64-bit) into an x87 extended double
   (80-bit), mimicking the hardware fairly closely.  Both numbers are
stored little-endian. Limitations, all of which could be fixed,
given some level of hassle:
* Identity of NaNs is not preserved.
See comments in the code for more details.
*/
static void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 )
{
Bool mantissaIsZero;
Int bexp, i, j, shift;
UChar sign;
sign = (f64[7] >> 7) & 1;
bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
bexp &= 0x7FF;
mantissaIsZero = False;
if (bexp == 0 || bexp == 0x7FF) {
/* We'll need to know whether or not the mantissa (bits 51:0) is
all zeroes in order to handle these cases. So figure it
out. */
mantissaIsZero
= (f64[6] & 0x0F) == 0
&& f64[5] == 0 && f64[4] == 0 && f64[3] == 0
&& f64[2] == 0 && f64[1] == 0 && f64[0] == 0;
}
/* If the exponent is zero, either we have a zero or a denormal.
Produce a zero. This is a hack in that it forces denormals to
zero. Could do better. */
if (bexp == 0) {
f80[9] = sign << 7;
f80[8] = f80[7] = f80[6] = f80[5] = f80[4]
= f80[3] = f80[2] = f80[1] = f80[0] = 0;
if (mantissaIsZero)
/* It really is zero, so that's all we can do. */
return;
/* There is at least one 1-bit in the mantissa. So it's a
potentially denormalised double -- but we can produce a
normalised long double. Count the leading zeroes in the
mantissa so as to decide how much to bump the exponent down
by. Note, this is SLOW. */
shift = 0;
for (i = 51; i >= 0; i--) {
if (read_bit_array(f64, i))
break;
shift++;
}
/* and copy into place as many bits as we can get our hands on. */
j = 63;
for (i = 51 - shift; i >= 0; i--) {
write_bit_array( f80, j,
read_bit_array( f64, i ) );
j--;
}
/* Set the exponent appropriately, and we're done. */
bexp -= shift;
bexp += (16383 - 1023);
f80[9] = (sign << 7) | ((bexp >> 8) & 0xFF);
f80[8] = bexp & 0xFF;
return;
}
/* If the exponent is 7FF, this is either an Infinity, a SNaN or
QNaN, as determined by examining bits 51:0, thus:
0 ... 0 Inf
0X ... X SNaN
1X ... X QNaN
where at least one of the Xs is not zero.
*/
if (bexp == 0x7FF) {
if (mantissaIsZero) {
/* Produce an appropriately signed infinity:
S 1--1 (15) 1 0--0 (63)
*/
f80[9] = (sign << 7) | 0x7F;
f80[8] = 0xFF;
f80[7] = 0x80;
f80[6] = f80[5] = f80[4] = f80[3]
= f80[2] = f80[1] = f80[0] = 0;
return;
}
/* So it's either a QNaN or SNaN. Distinguish by considering
bit 51. Note, this destroys all the trailing bits
(identity?) of the NaN. IEEE754 doesn't require preserving
these (it only requires that there be one QNaN value and one
SNaN value), but x87 does seem to have some ability to
preserve them. Anyway, here, the NaN's identity is
destroyed. Could be improved. */
if (f64[6] & 8) {
/* QNaN. Make a QNaN:
S 1--1 (15) 1 1--1 (63)
*/
f80[9] = (sign << 7) | 0x7F;
f80[8] = 0xFF;
f80[7] = 0xFF;
f80[6] = f80[5] = f80[4] = f80[3]
= f80[2] = f80[1] = f80[0] = 0xFF;
} else {
/* SNaN. Make a SNaN:
S 1--1 (15) 0 1--1 (63)
*/
f80[9] = (sign << 7) | 0x7F;
f80[8] = 0xFF;
f80[7] = 0x7F;
f80[6] = f80[5] = f80[4] = f80[3]
= f80[2] = f80[1] = f80[0] = 0xFF;
}
return;
}
/* It's not a zero, denormal, infinity or nan. So it must be a
normalised number. Rebias the exponent and build the new
number. */
bexp += (16383 - 1023);
f80[9] = (sign << 7) | ((bexp >> 8) & 0xFF);
f80[8] = bexp & 0xFF;
f80[7] = (1 << 7) | ((f64[6] << 3) & 0x78) | ((f64[5] >> 5) & 7);
f80[6] = ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7);
f80[5] = ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7);
f80[4] = ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7);
f80[3] = ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7);
f80[2] = ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7);
f80[1] = ((f64[0] << 3) & 0xF8);
f80[0] = 0;
}
/////////////////////////////////////////////////////////////////
/* Convert an x87 extended double (80-bit) into an IEEE 754 double
(64-bit), mimicking the hardware fairly closely. Both numbers are
stored little-endian. Limitations, both of which could be fixed,
given some level of hassle:
* Rounding following truncation could be a bit better.
* Identity of NaNs is not preserved.
See comments in the code for more details.
*/
static void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 )
{
Bool isInf;
Int bexp, i, j;
UChar sign;
sign = (f80[9] >> 7) & 1;
bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8];
bexp &= 0x7FFF;
/* If the exponent is zero, either we have a zero or a denormal.
But an extended precision denormal becomes a double precision
zero, so in either case, just produce the appropriately signed
zero. */
if (bexp == 0) {
f64[7] = sign << 7;
f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
return;
}
/* If the exponent is 7FFF, this is either an Infinity, a SNaN or
QNaN, as determined by examining bits 62:0, thus:
0 ... 0 Inf
0X ... X SNaN
1X ... X QNaN
where at least one of the Xs is not zero.
*/
if (bexp == 0x7FFF) {
isInf = (f80[7] & 0x7F) == 0
&& f80[6] == 0 && f80[5] == 0 && f80[4] == 0
&& f80[3] == 0 && f80[2] == 0 && f80[1] == 0 && f80[0] == 0;
if (isInf) {
if (0 == (f80[7] & 0x80))
goto wierd_NaN;
/* Produce an appropriately signed infinity:
S 1--1 (11) 0--0 (52)
*/
f64[7] = (sign << 7) | 0x7F;
f64[6] = 0xF0;
f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
return;
}
/* So it's either a QNaN or SNaN. Distinguish by considering
bit 62. Note, this destroys all the trailing bits
(identity?) of the NaN. IEEE754 doesn't require preserving
these (it only requires that there be one QNaN value and one
SNaN value), but x87 does seem to have some ability to
preserve them. Anyway, here, the NaN's identity is
destroyed. Could be improved. */
if (f80[8] & 0x40) {
/* QNaN. Make a QNaN:
S 1--1 (11) 1 1--1 (51)
*/
f64[7] = (sign << 7) | 0x7F;
f64[6] = 0xFF;
f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
} else {
/* SNaN. Make a SNaN:
S 1--1 (11) 0 1--1 (51)
*/
f64[7] = (sign << 7) | 0x7F;
f64[6] = 0xF7;
f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
}
return;
}
/* If it's not a Zero, NaN or Inf, and the integer part (bit 62) is
zero, the x87 FPU appears to consider the number denormalised
and converts it to a QNaN. */
if (0 == (f80[7] & 0x80)) {
wierd_NaN:
/* Strange hardware QNaN:
S 1--1 (11) 1 0--0 (51)
*/
/* On a PIII, these QNaNs always appear with sign==1. I have
no idea why. */
f64[7] = (1 /*sign*/ << 7) | 0x7F;
f64[6] = 0xF8;
f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
return;
}
/* It's not a zero, denormal, infinity or nan. So it must be a
normalised number. Rebias the exponent and consider. */
bexp -= (16383 - 1023);
if (bexp >= 0x7FF) {
/* It's too big for a double. Construct an infinity. */
f64[7] = (sign << 7) | 0x7F;
f64[6] = 0xF0;
f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
return;
}
if (bexp <= 0) {
/* It's too small for a normalised double. First construct a
zero and then see if it can be improved into a denormal. */
f64[7] = sign << 7;
f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
if (bexp < -52)
/* Too small even for a denormal. */
return;
/* Ok, let's make a denormal. Note, this is SLOW. */
/* Copy bits 63, 62, 61, etc of the src mantissa into the dst,
indexes 52+bexp, 51+bexp, etc, until k+bexp < 0. */
/* bexp is in range -52 .. 0 inclusive */
for (i = 63; i >= 0; i--) {
j = i - 12 + bexp;
if (j < 0) break;
/* We shouldn't really call vassert from generated code. */
vassert(j >= 0 && j < 52);
write_bit_array ( f64,
j,
read_bit_array ( f80, i ) );
}
/* and now we might have to round ... */
if (read_bit_array(f80, 10+1 - bexp) == 1)
goto do_rounding;
return;
}
/* Ok, it's a normalised number which is representable as a double.
Copy the exponent and mantissa into place. */
/*
for (i = 0; i < 52; i++)
write_bit_array ( f64,
i,
read_bit_array ( f80, i+11 ) );
*/
f64[0] = (f80[1] >> 3) | (f80[2] << 5);
f64[1] = (f80[2] >> 3) | (f80[3] << 5);
f64[2] = (f80[3] >> 3) | (f80[4] << 5);
f64[3] = (f80[4] >> 3) | (f80[5] << 5);
f64[4] = (f80[5] >> 3) | (f80[6] << 5);
f64[5] = (f80[6] >> 3) | (f80[7] << 5);
f64[6] = ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F);
f64[7] = (sign << 7) | ((bexp >> 4) & 0x7F);
/* Now consider any rounding that needs to happen as a result of
truncating the mantissa. */
if (f80[1] & 4) /* read_bit_array(f80, 10) == 1) */ {
/* If the bottom bits of f80 are "100 0000 0000", then the
infinitely precise value is deemed to be mid-way between the
two closest representable values. Since we're doing
round-to-nearest (the default mode), in that case it is the
bit immediately above which indicates whether we should round
upwards or not -- if 0, we don't. All that is encapsulated
in the following simple test. */
if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0)
return;
do_rounding:
/* Round upwards. This is a kludge. Once in every 2^24
roundings (statistically) the bottom three bytes are all 0xFF
and so we don't round at all. Could be improved. */
if (f64[0] != 0xFF) {
f64[0]++;
}
else
if (f64[0] == 0xFF && f64[1] != 0xFF) {
f64[0] = 0;
f64[1]++;
}
else
if (f64[0] == 0xFF && f64[1] == 0xFF && f64[2] != 0xFF) {
f64[0] = 0;
f64[1] = 0;
f64[2]++;
}
/* else we don't round, but we should. */
}
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest memory) */
ULong x86g_loadF80le ( UInt addrU )
{
ULong f64;
convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
return f64;
}
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest memory) */
void x86g_storeF80le ( UInt addrU, ULong f64 )
{
convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
}
/*----------------------------------------------*/
/*--- The exported fns .. ---*/
/*----------------------------------------------*/
/* Layout of the real x87 state. */
typedef
struct {
UShort env[14];
UChar reg[80];
}
Fpu_State;
/* Offsets, in 16-bit ints, into the FPU environment (env) area. */
#define FP_ENV_CTRL 0
#define FP_ENV_STAT 2
#define FP_ENV_TAG 4
#define FP_ENV_IP 6 /* and 7 */
#define FP_ENV_CS 8
#define FP_ENV_OPOFF 10 /* and 11 */
#define FP_ENV_OPSEL 12
#define FP_REG(ii) (10*(7-(ii)))
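/* Fpu_State therefore matches the 108-byte protected-mode
   FNSAVE/FRSTOR image: a 28-byte environment (14 16-bit words)
   followed by eight 10-byte register slots. */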
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestX86_put_x87 ( /*IN*/UChar* x87_state,
/*OUT*/VexGuestX86State* vex_state )
{
Int r;
UInt tag;
Double* vexRegs = (Double*)(&vex_state->guest_FPREG[0]);
UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
Fpu_State* x87 = (Fpu_State*)x87_state;
UInt ftop = (x87->env[FP_ENV_STAT] >> 11) & 7;
UInt tagw = x87->env[FP_ENV_TAG];
UInt fpucw = x87->env[FP_ENV_CTRL];
UInt c3210 = x87->env[FP_ENV_STAT] & 0x4700;
/* Copy registers and tags */
for (r = 0; r < 8; r++) {
tag = (tagw >> (2*r)) & 3;
if (tag == 3) {
/* register is empty */
vexRegs[r] = 0.0;
vexTags[r] = 0;
} else {
/* register is non-empty */
convert_f80le_to_f64le( &x87->reg[FP_REG(r)], (UChar*)&vexRegs[r] );
vexTags[r] = 1;
}
}
/* stack pointer */
vex_state->guest_FTOP = ftop;
/* control word */
vex_state->guest_FPUCW = fpucw;
/* status word */
vex_state->guest_FC3210 = c3210;
}
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestX86_get_x87 ( /*IN*/VexGuestX86State* vex_state,
/*OUT*/UChar* x87_state )
{
Int i, r;
UInt tagw;
Double* vexRegs = (Double*)(&vex_state->guest_FPREG[0]);
UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
Fpu_State* x87 = (Fpu_State*)x87_state;
UInt ftop = vex_state->guest_FTOP;
UInt c3210 = vex_state->guest_FC3210;
for (i = 0; i < 14; i++)
x87->env[i] = 0;
x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
x87->env[FP_ENV_CTRL] = (UShort)( vex_state->guest_FPUCW );
x87->env[FP_ENV_STAT] = ((ftop & 7) << 11) | (c3210 & 0x4700);
tagw = 0;
for (r = 0; r < 8; r++) {
if (vexTags[r] == 0) {
/* register is empty */
tagw |= (3 << (2*r));
convert_f64le_to_f80le( (UChar*)&vexRegs[r], &x87->reg[FP_REG(r)] );
} else {
/* register is full. */
tagw |= (0 << (2*r));
convert_f64le_to_f80le( (UChar*)&vexRegs[r], &x87->reg[FP_REG(r)] );
}
}
x87->env[FP_ENV_TAG] = tagw;
}
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestX86_put_eflags ( UInt eflags_native,
/*OUT*/VexGuestX86State* vex_state )
{
vex_state->guest_DFLAG
= (eflags_native & (1<<10)) ? 0xFFFFFFFF : 0x00000001;
vex_state->guest_IDFLAG
= (eflags_native & (1<<21)) ? 1 : 0;
/* Mask out everything except O S Z A C P. */
eflags_native
&= (X86G_CC_MASK_C | X86G_CC_MASK_P | X86G_CC_MASK_A
| X86G_CC_MASK_Z | X86G_CC_MASK_S | X86G_CC_MASK_O);
vex_state->guest_CC_OP = X86G_CC_OP_COPY;
vex_state->guest_CC_DEP1 = eflags_native;
vex_state->guest_CC_DEP2 = 0;
vex_state->guest_CC_NDEP = 0; /* unnecessary paranoia */
}
/* VISIBLE TO LIBVEX CLIENT */
UInt LibVEX_GuestX86_get_eflags ( /*IN*/VexGuestX86State* vex_state )
{
UInt eflags = x86g_calculate_eflags_all(
vex_state->guest_CC_OP,
vex_state->guest_CC_DEP1,
vex_state->guest_CC_DEP2,
vex_state->guest_CC_NDEP
);
UInt dflag = vex_state->guest_DFLAG;
vassert(dflag == 1 || dflag == 0xFFFFFFFF);
if (dflag == 0xFFFFFFFF)
eflags |= (1<<10);
if (vex_state->guest_IDFLAG == 1)
eflags |= (1<<21);
return eflags;
}
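/* Illustrative round trip: calling
      LibVEX_GuestX86_put_eflags( X86G_CC_MASK_Z, &st );
   stores the Z bit in CC_DEP1 with CC_OP set to X86G_CC_OP_COPY, and
   a subsequent
      LibVEX_GuestX86_get_eflags( &st )
   returns exactly X86G_CC_MASK_Z, since D and ID are left clear. */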
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestX86_initialise ( /*OUT*/VexGuestX86State* vex_state )
{
Int i;
vex_state->guest_EAX = 0;
vex_state->guest_ECX = 0;
vex_state->guest_EDX = 0;
vex_state->guest_EBX = 0;
vex_state->guest_ESP = 0;
vex_state->guest_EBP = 0;
vex_state->guest_ESI = 0;
vex_state->guest_EDI = 0;
vex_state->guest_CC_OP = X86G_CC_OP_COPY;
vex_state->guest_CC_DEP1 = 0;
vex_state->guest_CC_DEP2 = 0;
vex_state->guest_CC_NDEP = 0;
vex_state->guest_DFLAG = 1; /* forwards */
vex_state->guest_IDFLAG = 0;
vex_state->guest_EIP = 0;
vex_state->guest_FTOP = 0;
for (i = 0; i < 8; i++) {
vex_state->guest_FPTAG[i] = 0; /* empty */
vex_state->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
}
/* The default setting: all fp exceptions masked, rounding to
nearest, precision to 64 bits */
   vex_state->guest_FPUCW = 0x037F;
vex_state->guest_FC3210 = 0;
vex_state->guest_CS = 0;
vex_state->guest_DS = 0;
vex_state->guest_ES = 0;
vex_state->guest_FS = 0;
vex_state->guest_GS = 0;
vex_state->guest_SS = 0;
}
/*----------------------------------------------*/
/*--- Misc integer helpers ---*/
/*----------------------------------------------*/
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate both flags and value result for rotate right
through the carry bit. Result in low 32 bits,
new flags (OSZACP) in high 32 bits.
*/
ULong x86g_calculate_RCR ( UInt arg, UInt rot_amt, UInt eflags_in, UInt sz )
{
UInt tempCOUNT = rot_amt & 0x1F, cf=0, of=0, tempcf;
switch (sz) {
case 4:
cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
of = ((arg >> 31) ^ cf) & 1;
while (tempCOUNT > 0) {
tempcf = arg & 1;
arg = (arg >> 1) | (cf << 31);
cf = tempcf;
tempCOUNT--;
}
break;
case 2:
while (tempCOUNT >= 17) tempCOUNT -= 17;
cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
of = ((arg >> 15) ^ cf) & 1;
while (tempCOUNT > 0) {
tempcf = arg & 1;
arg = ((arg >> 1) & 0x7FFF) | (cf << 15);
cf = tempcf;
tempCOUNT--;
}
break;
case 1:
while (tempCOUNT >= 9) tempCOUNT -= 9;
cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
of = ((arg >> 7) ^ cf) & 1;
while (tempCOUNT > 0) {
tempcf = arg & 1;
arg = ((arg >> 1) & 0x7F) | (cf << 7);
cf = tempcf;
tempCOUNT--;
}
break;
default:
vpanic("calculate_RCR: invalid size");
}
cf &= 1;
of &= 1;
eflags_in &= ~(X86G_CC_MASK_C | X86G_CC_MASK_O);
eflags_in |= (cf << X86G_CC_SHIFT_C) | (of << X86G_CC_SHIFT_O);
return (((ULong)eflags_in) << 32) | ((ULong)arg);
}
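/* Worked example (illustrative): an 8-bit RCR of 0x01 by one place,
   with only the carry bit set in eflags_in, rotates the old CF into
   the top bit: the low 32 bits of the result hold 0x80 and the high
   32 bits hold the incoming flags with both C and O set (C from the
   bit rotated out, O because the top two bits of the result differ). */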
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (modifies guest state) */
/* Claim to be a P54C P133 (pre-MMX Pentium) */
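/* With EAX==0 this returns max leaf 1 and "GenuineIntel" in
   EBX:EDX:ECX (0x756e6547 = "Genu", 0x49656e69 = "ineI",
   0x6c65746e = "ntel").  For any other leaf it returns EAX=0x52b
   (family 5, model 2, stepping 11) and EDX=0x1bf, advertising
   FPU, VME, DE, PSE, TSC, MSR, MCE and CX8. */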
void x86g_dirtyhelper_CPUID ( VexGuestX86State* st )
{
if (st->guest_EAX == 0) {
st->guest_EAX = 0x1;
st->guest_EBX = 0x756e6547;
st->guest_ECX = 0x6c65746e;
st->guest_EDX = 0x49656e69;
} else {
st->guest_EAX = 0x52b;
st->guest_EBX = 0x0;
st->guest_ECX = 0x0;
st->guest_EDX = 0x1bf;
}
}
/*----------------------------------------------*/
/*--- Helpers for MMX ---*/
/*----------------------------------------------*/
/* Tuple/select functions for 32x2 vectors. */
static inline ULong mk32x2 ( UInt w1, UInt w0 )
{
return (((ULong)w1) << 32) | ((ULong)w0);
}
static inline UInt sel32x2_1 ( ULong w64 )
{
return 0xFFFFFFFF & (UInt)(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 )
{
return 0xFFFFFFFF & (UInt)w64;
}
/* Tuple/select functions for 16x4 vectors. gcc is pretty hopeless
with 64-bit shifts so we give it a hand. */
static inline ULong mk16x4 ( UShort w3, UShort w2,
UShort w1, UShort w0 )
{
UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
return mk32x2(hi32, lo32);
}
static inline UShort sel16x4_3 ( ULong w64 )
{
UInt hi32 = (UInt)(w64 >> 32);
return 0xFFFF & (UShort)(hi32 >> 16);
}
static inline UShort sel16x4_2 ( ULong w64 )
{
UInt hi32 = (UInt)(w64 >> 32);
return 0xFFFF & (UShort)hi32;
}
static inline UShort sel16x4_1 ( ULong w64 )
{
UInt lo32 = (UInt)w64;
return 0xFFFF & (UShort)(lo32 >> 16);
}
static inline UShort sel16x4_0 ( ULong w64 )
{
UInt lo32 = (UInt)w64;
return 0xFFFF & (UShort)lo32;
}
/* Tuple/select functions for 8x8 vectors. */
static inline ULong mk8x8 ( UChar w7, UChar w6,
UChar w5, UChar w4,
UChar w3, UChar w2,
UChar w1, UChar w0 )
{
UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
| (((UInt)w5) << 8) | (((UInt)w4) << 0);
UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
| (((UInt)w1) << 8) | (((UInt)w0) << 0);
return mk32x2(hi32, lo32);
}
static inline UChar sel8x8_7 ( ULong w64 )
{
UInt hi32 = (UInt)(w64 >> 32);
return 0xFF & (UChar)(hi32 >> 24);
}
static inline UChar sel8x8_6 ( ULong w64 )
{
UInt hi32 = (UInt)(w64 >> 32);
return 0xFF & (UChar)(hi32 >> 16);
}
static inline UChar sel8x8_5 ( ULong w64 )
{
UInt hi32 = (UInt)(w64 >> 32);
return 0xFF & (UChar)(hi32 >> 8);
}
static inline UChar sel8x8_4 ( ULong w64 )
{
UInt hi32 = (UInt)(w64 >> 32);
return 0xFF & (UChar)(hi32 >> 0);
}
static inline UChar sel8x8_3 ( ULong w64 )
{
UInt lo32 = (UInt)w64;
return 0xFF & (UChar)(lo32 >> 24);
}
static inline UChar sel8x8_2 ( ULong w64 )
{
UInt lo32 = (UInt)w64;
return 0xFF & (UChar)(lo32 >> 16);
}
static inline UChar sel8x8_1 ( ULong w64 )
{
UInt lo32 = (UInt)w64;
return 0xFF & (UChar)(lo32 >> 8);
}
static inline UChar sel8x8_0 ( ULong w64 )
{
UInt lo32 = (UInt)w64;
return 0xFF & (UChar)(lo32 >> 0);
}
/* Scalar helpers. */
static inline Short qadd16S ( Short xx, Short yy )
{
Int t = ((Int)xx) + ((Int)yy);
if (t < -32768) t = -32768;
if (t > 32767) t = 32767;
return (Short)t;
}
static inline Char qadd8S ( Char xx, Char yy )
{
Int t = ((Int)xx) + ((Int)yy);
if (t < -128) t = -128;
if (t > 127) t = 127;
return (Char)t;
}
static inline UShort qadd16U ( UShort xx, UShort yy )
{
UInt t = ((UInt)xx) + ((UInt)yy);
if (t > 0xFFFF) t = 0xFFFF;
return (UShort)t;
}
static inline UChar qadd8U ( UChar xx, UChar yy )
{
UInt t = ((UInt)xx) + ((UInt)yy);
if (t > 0xFF) t = 0xFF;
return (UChar)t;
}
static inline Short qsub16S ( Short xx, Short yy )
{
Int t = ((Int)xx) - ((Int)yy);
if (t < -32768) t = -32768;
if (t > 32767) t = 32767;
return (Short)t;
}
static inline Char qsub8S ( Char xx, Char yy )
{
Int t = ((Int)xx) - ((Int)yy);
if (t < -128) t = -128;
if (t > 127) t = 127;
return (Char)t;
}
static inline UShort qsub16U ( UShort xx, UShort yy )
{
Int t = ((Int)xx) - ((Int)yy);
if (t < 0) t = 0;
if (t > 0xFFFF) t = 0xFFFF;
return (UShort)t;
}
static inline UChar qsub8U ( UChar xx, UChar yy )
{
Int t = ((Int)xx) - ((Int)yy);
if (t < 0) t = 0;
if (t > 0xFF) t = 0xFF;
return (UChar)t;
}
static inline Short mulhi16S ( Short xx, Short yy )
{
Int t = ((Int)xx) * ((Int)yy);
t >>=/*s*/ 16;
return (Short)t;
}
static inline Short mullo16S ( Short xx, Short yy )
{
Int t = ((Int)xx) * ((Int)yy);
return (Short)t;
}
static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
return xx==yy ? 0xFFFFFFFF : 0;
}
static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
return xx==yy ? 0xFFFF : 0;
}
static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
return xx==yy ? 0xFF : 0;
}
static inline UInt cmpge32S ( Int xx, Int yy )
{
return xx>yy ? 0xFFFFFFFF : 0;
}
static inline UShort cmpge16S ( Short xx, Short yy )
{
return xx>yy ? 0xFFFF : 0;
}
static inline UChar cmpge8S ( Char xx, Char yy )
{
return xx>yy ? 0xFF : 0;
}
static inline Short qnarrow32Sto16 ( UInt xx0 )
{
Int xx = (Int)xx0;
if (xx < -32768) xx = -32768;
if (xx > 32767) xx = 32767;
return (Short)xx;
}
static inline Char qnarrow16Sto8 ( UShort xx0 )
{
Short xx = (Short)xx0;
if (xx < -128) xx = -128;
if (xx > 127) xx = 127;
return (Char)xx;
}
static inline UChar qnarrow16Uto8 ( UShort xx0 )
{
Short xx = (Short)xx0;
if (xx < 0) xx = 0;
if (xx > 255) xx = 255;
return (UChar)xx;
}
static inline UShort shl16 ( UShort v, ULong n )
{
return n > 15 ? 0 : v << n;
}
static inline UShort shr16U ( UShort v, ULong n )
{
return n > 15 ? 0 : (((UShort)v) >> n);
}
static inline UShort shr16S ( UShort v, ULong n )
{
if (n <= 15)
return ((Short)v) >> n;
return (v & 0x8000) ? 0xFFFF : 0;
}
static inline UInt shl32 ( UInt v, ULong n )
{
return n > 31 ? 0 : v << n;
}
static inline UInt shr32U ( UInt v, ULong n )
{
return n > 31 ? 0 : (((UInt)v) >> n);
}
static inline UInt shr32S ( UInt v, ULong n )
{
if (n <= 31)
return ((Int)v) >> n;
return (v & 0x80000000) ? 0xFFFFFFFF : 0;
}
/* ------------ Normal addition ------------ */
ULong x86g_calculate_add32x2 ( ULong xx, ULong yy )
{
return mk32x2(
sel32x2_1(xx) + sel32x2_1(yy),
sel32x2_0(xx) + sel32x2_0(yy)
);
}
ULong x86g_calculate_add16x4 ( ULong xx, ULong yy )
{
return mk16x4(
sel16x4_3(xx) + sel16x4_3(yy),
sel16x4_2(xx) + sel16x4_2(yy),
sel16x4_1(xx) + sel16x4_1(yy),
sel16x4_0(xx) + sel16x4_0(yy)
);
}
ULong x86g_calculate_add8x8 ( ULong xx, ULong yy )
{
return mk8x8(
sel8x8_7(xx) + sel8x8_7(yy),
sel8x8_6(xx) + sel8x8_6(yy),
sel8x8_5(xx) + sel8x8_5(yy),
sel8x8_4(xx) + sel8x8_4(yy),
sel8x8_3(xx) + sel8x8_3(yy),
sel8x8_2(xx) + sel8x8_2(yy),
sel8x8_1(xx) + sel8x8_1(yy),
sel8x8_0(xx) + sel8x8_0(yy)
);
}
/* ------------ Saturating addition ------------ */
ULong x86g_calculate_qadd16Sx4 ( ULong xx, ULong yy )
{
return mk16x4(
qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
);
}
ULong x86g_calculate_qadd8Sx8 ( ULong xx, ULong yy )
{
return mk8x8(
qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
);
}
ULong x86g_calculate_qadd16Ux4 ( ULong xx, ULong yy )
{
return mk16x4(
qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
);
}
ULong x86g_calculate_qadd8Ux8 ( ULong xx, ULong yy )
{
return mk8x8(
qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
);
}
/* ------------ Normal subtraction ------------ */
ULong x86g_calculate_sub32x2 ( ULong xx, ULong yy )
{
return mk32x2(
sel32x2_1(xx) - sel32x2_1(yy),
sel32x2_0(xx) - sel32x2_0(yy)
);
}
ULong x86g_calculate_sub16x4 ( ULong xx, ULong yy )
{
return mk16x4(
sel16x4_3(xx) - sel16x4_3(yy),
sel16x4_2(xx) - sel16x4_2(yy),
sel16x4_1(xx) - sel16x4_1(yy),
sel16x4_0(xx) - sel16x4_0(yy)
);
}
ULong x86g_calculate_sub8x8 ( ULong xx, ULong yy )
{
return mk8x8(
sel8x8_7(xx) - sel8x8_7(yy),
sel8x8_6(xx) - sel8x8_6(yy),
sel8x8_5(xx) - sel8x8_5(yy),
sel8x8_4(xx) - sel8x8_4(yy),
sel8x8_3(xx) - sel8x8_3(yy),
sel8x8_2(xx) - sel8x8_2(yy),
sel8x8_1(xx) - sel8x8_1(yy),
sel8x8_0(xx) - sel8x8_0(yy)
);
}
/* ------------ Saturating subtraction ------------ */
ULong x86g_calculate_qsub16Sx4 ( ULong xx, ULong yy )
{
return mk16x4(
qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
);
}
ULong x86g_calculate_qsub8Sx8 ( ULong xx, ULong yy )
{
return mk8x8(
qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
);
}
ULong x86g_calculate_qsub16Ux4 ( ULong xx, ULong yy )
{
return mk16x4(
qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
);
}
ULong x86g_calculate_qsub8Ux8 ( ULong xx, ULong yy )
{
return mk8x8(
qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
);
}
/* ------------ Multiplication ------------ */
ULong x86g_calculate_mulhi16x4 ( ULong xx, ULong yy )
{
return mk16x4(
mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
);
}
ULong x86g_calculate_mullo16x4 ( ULong xx, ULong yy )
{
return mk16x4(
mullo16S( sel16x4_3(xx), sel16x4_3(yy) ),
mullo16S( sel16x4_2(xx), sel16x4_2(yy) ),
mullo16S( sel16x4_1(xx), sel16x4_1(yy) ),
mullo16S( sel16x4_0(xx), sel16x4_0(yy) )
);
}
ULong x86g_calculate_pmaddwd ( ULong xx, ULong yy )
{
return
mk32x2(
(((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
+ (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
(((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
+ (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
);
}
/* ------------ Comparison ------------ */
ULong x86g_calculate_cmpeq32x2 ( ULong xx, ULong yy )
{
return mk32x2(
cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
);
}
ULong x86g_calculate_cmpeq16x4 ( ULong xx, ULong yy )
{
return mk16x4(
cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
);
}
ULong x86g_calculate_cmpeq8x8 ( ULong xx, ULong yy )
{
return mk8x8(
cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
);
}
ULong x86g_calculate_cmpge32Sx2 ( ULong xx, ULong yy )
{
return mk32x2(
cmpge32S( sel32x2_1(xx), sel32x2_1(yy) ),
cmpge32S( sel32x2_0(xx), sel32x2_0(yy) )
);
}
ULong x86g_calculate_cmpge16Sx4 ( ULong xx, ULong yy )
{
return mk16x4(
cmpge16S( sel16x4_3(xx), sel16x4_3(yy) ),
cmpge16S( sel16x4_2(xx), sel16x4_2(yy) ),
cmpge16S( sel16x4_1(xx), sel16x4_1(yy) ),
cmpge16S( sel16x4_0(xx), sel16x4_0(yy) )
);
}
ULong x86g_calculate_cmpge8Sx8 ( ULong xx, ULong yy )
{
return mk8x8(
cmpge8S( sel8x8_7(xx), sel8x8_7(yy) ),
cmpge8S( sel8x8_6(xx), sel8x8_6(yy) ),
cmpge8S( sel8x8_5(xx), sel8x8_5(yy) ),
cmpge8S( sel8x8_4(xx), sel8x8_4(yy) ),
cmpge8S( sel8x8_3(xx), sel8x8_3(yy) ),
cmpge8S( sel8x8_2(xx), sel8x8_2(yy) ),
cmpge8S( sel8x8_1(xx), sel8x8_1(yy) ),
cmpge8S( sel8x8_0(xx), sel8x8_0(yy) )
);
}
/* ------------ Pack / unpack ------------ */
ULong x86g_calculate_packssdw ( ULong dst, ULong src )
{
UInt d = sel32x2_1(dst);
UInt c = sel32x2_0(dst);
UInt b = sel32x2_1(src);
UInt a = sel32x2_0(src);
/* This just doesn't seem to match what the Intel documentation
says -- that implies that the new word should be made in the
sequence d c b a. */
return mk16x4(
qnarrow32Sto16(b),
qnarrow32Sto16(a),
qnarrow32Sto16(d),
qnarrow32Sto16(c)
);
}
ULong x86g_calculate_packsswb ( ULong dst, ULong src )
{
UShort h = sel16x4_3(dst);
UShort g = sel16x4_2(dst);
UShort f = sel16x4_1(dst);
UShort e = sel16x4_0(dst);
UShort d = sel16x4_3(src);
UShort c = sel16x4_2(src);
UShort b = sel16x4_1(src);
UShort a = sel16x4_0(src);
/* As per packssdw, this sequence also seems to contradict the
Intel docs. */
return mk8x8(
qnarrow16Sto8(d),
qnarrow16Sto8(c),
qnarrow16Sto8(b),
qnarrow16Sto8(a),
qnarrow16Sto8(h),
qnarrow16Sto8(g),
qnarrow16Sto8(f),
qnarrow16Sto8(e)
);
}
ULong x86g_calculate_packuswb ( ULong dst, ULong src )
{
UShort h = sel16x4_3(dst);
UShort g = sel16x4_2(dst);
UShort f = sel16x4_1(dst);
UShort e = sel16x4_0(dst);
UShort d = sel16x4_3(src);
UShort c = sel16x4_2(src);
UShort b = sel16x4_1(src);
UShort a = sel16x4_0(src);
/* As per packssdw, this sequence also seems to contradict the
Intel docs. */
return mk8x8(
qnarrow16Uto8(d),
qnarrow16Uto8(c),
qnarrow16Uto8(b),
qnarrow16Uto8(a),
qnarrow16Uto8(h),
qnarrow16Uto8(g),
qnarrow16Uto8(f),
qnarrow16Uto8(e)
);
}
ULong x86g_calculate_punpckhbw ( ULong dst, ULong src )
{
return mk8x8(
sel8x8_7(src),
sel8x8_7(dst),
sel8x8_6(src),
sel8x8_6(dst),
sel8x8_5(src),
sel8x8_5(dst),
sel8x8_4(src),
sel8x8_4(dst)
);
}
ULong x86g_calculate_punpcklbw ( ULong dst, ULong src )
{
return mk8x8(
sel8x8_3(src),
sel8x8_3(dst),
sel8x8_2(src),
sel8x8_2(dst),
sel8x8_1(src),
sel8x8_1(dst),
sel8x8_0(src),
sel8x8_0(dst)
);
}
ULong x86g_calculate_punpckhwd ( ULong dst, ULong src )
{
return mk16x4(
sel16x4_3(src),
sel16x4_3(dst),
sel16x4_2(src),
sel16x4_2(dst)
);
}
ULong x86g_calculate_punpcklwd ( ULong dst, ULong src )
{
return mk16x4(
sel16x4_1(src),
sel16x4_1(dst),
sel16x4_0(src),
sel16x4_0(dst)
);
}
ULong x86g_calculate_punpckhdq ( ULong dst, ULong src )
{
return mk32x2(
sel32x2_1(src),
sel32x2_1(dst)
);
}
ULong x86g_calculate_punpckldq ( ULong dst, ULong src )
{
return mk32x2(
sel32x2_0(src),
sel32x2_0(dst)
);
}
/* ------------ Shifting ------------ */
ULong x86g_calculate_shl16x4 ( ULong xx, ULong yy )
{
return mk16x4(
shl16( sel16x4_3(xx), yy ),
shl16( sel16x4_2(xx), yy ),
shl16( sel16x4_1(xx), yy ),
shl16( sel16x4_0(xx), yy )
);
}
ULong x86g_calculate_shl32x2 ( ULong xx, ULong yy )
{
return mk32x2(
shl32( sel32x2_1(xx), yy ),
shl32( sel32x2_0(xx), yy )
);
}
ULong x86g_calculate_shl64x1 ( ULong xx, ULong yy )
{
if (yy > 63) return 0;
return xx << yy;
}
ULong x86g_calculate_shr16Ux4 ( ULong xx, ULong yy )
{
return mk16x4(
shr16U( sel16x4_3(xx), yy ),
shr16U( sel16x4_2(xx), yy ),
shr16U( sel16x4_1(xx), yy ),
shr16U( sel16x4_0(xx), yy )
);
}
ULong x86g_calculate_shr32Ux2 ( ULong xx, ULong yy )
{
return mk32x2(
shr32U( sel32x2_1(xx), yy ),
shr32U( sel32x2_0(xx), yy )
);
}
ULong x86g_calculate_shr64Ux1 ( ULong xx, ULong yy )
{
if (yy > 63) return 0;
return xx >> yy;
}
ULong x86g_calculate_shr16Sx4 ( ULong xx, ULong yy )
{
return mk16x4(
shr16S( sel16x4_3(xx), yy ),
shr16S( sel16x4_2(xx), yy ),
shr16S( sel16x4_1(xx), yy ),
shr16S( sel16x4_0(xx), yy )
);
}
ULong x86g_calculate_shr32Sx2 ( ULong xx, ULong yy )
{
return mk32x2(
shr32S( sel32x2_1(xx), yy ),
shr32S( sel32x2_0(xx), yy )
);
}
/*-----------------------------------------------------------*/
/*--- Describing the x86 guest state, for the benefit ---*/
/*--- of iropt and instrumenters. ---*/
/*-----------------------------------------------------------*/
/* Figure out if any part of the guest state contained in minoff
.. maxoff requires precise memory exceptions. If in doubt return
   True (but this generates significantly slower code).
We enforce precise exns for guest %ESP and %EIP only.
*/
Bool guest_x86_state_requires_precise_mem_exns ( Int minoff,
Int maxoff)
{
Int esp_min = offsetof(VexGuestX86State, guest_ESP);
Int esp_max = esp_min + 4 - 1;
Int eip_min = offsetof(VexGuestX86State, guest_EIP);
Int eip_max = eip_min + 4 - 1;
if (maxoff < esp_min || minoff > esp_max) {
/* no overlap with esp */
} else {
return True;
}
if (maxoff < eip_min || minoff > eip_max) {
/* no overlap with eip */
} else {
return True;
}
return False;
}
#define ALWAYSDEFD(field) \
{ offsetof(VexGuestX86State, field), \
(sizeof ((VexGuestX86State*)0)->field) }
VexGuestLayout
x86guest_layout
= {
/* Total size of the guest state, in bytes. */
.total_sizeB = sizeof(VexGuestX86State),
/* Describe the stack pointer. */
.offset_SP = offsetof(VexGuestX86State,guest_ESP),
.sizeof_SP = 4,
/* Describe the instruction pointer. */
.offset_IP = offsetof(VexGuestX86State,guest_EIP),
.sizeof_IP = 4,
/* Describe any sections to be regarded by Memcheck as
'always-defined'. */
.n_alwaysDefd = 15,
/* flags thunk: OP and NDEP are always defd, whereas DEP1
and DEP2 have to be tracked. See detailed comment in
gdefs.h on meaning of thunk fields. */
.alwaysDefd
= { /* 0 */ ALWAYSDEFD(guest_CC_OP),
/* 1 */ ALWAYSDEFD(guest_CC_NDEP),
/* 2 */ ALWAYSDEFD(guest_DFLAG),
/* 3 */ ALWAYSDEFD(guest_IDFLAG),
/* 4 */ ALWAYSDEFD(guest_EIP),
/* 5 */ ALWAYSDEFD(guest_FTOP),
/* 6 */ ALWAYSDEFD(guest_FPTAG),
/* 7 */ ALWAYSDEFD(guest_FPUCW),
/* 8 */ ALWAYSDEFD(guest_FC3210),
/* 9 */ ALWAYSDEFD(guest_CS),
/* 10 */ ALWAYSDEFD(guest_DS),
/* 11 */ ALWAYSDEFD(guest_ES),
/* 12 */ ALWAYSDEFD(guest_FS),
/* 13 */ ALWAYSDEFD(guest_GS),
/* 14 */ ALWAYSDEFD(guest_SS)
}
};
/*---------------------------------------------------------------*/
/*--- end guest-x86/ghelpers.c ---*/
/*---------------------------------------------------------------*/