/* priv/guest-x86/ghelpers.c - platform/external/valgrind - Gitiles */


 /*---------------------------------------------------------------*/
 /*---                                                         ---*/
 /*--- This file (guest-x86/ghelpers.c) is                     ---*/
 /*--- Copyright (c) 2004 OpenWorks LLP.  All rights reserved. ---*/
 /*---                                                         ---*/
 /*---------------------------------------------------------------*/

 /*
    This file is part of LibVEX, a library for dynamic binary
    instrumentation and translation.

    Copyright (C) 2004 OpenWorks, LLP.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; Version 2 dated June 1991 of the
    license.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or liability
    for damages.  See the GNU General Public License for more details.

    Neither the names of the U.S. Department of Energy nor the
    University of California nor the names of its contributors may be
    used to endorse or promote products derived from this software
    without prior written permission.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
    USA.
 */

 #include "libvex_basictypes.h"
 #include "libvex_emwarn.h"
 #include "libvex_guest_x86.h"
 #include "libvex_ir.h"
 #include "libvex.h"

 #include "main/vex_util.h"
 #include "guest-x86/gdefs.h"


 /* This file contains helper functions for x86 guest code.
    Calls to these functions are generated by the back end.
    These calls are of course in the host machine code and
    this file will be compiled to host machine code, so that
    all makes sense.

    Only change the signatures of these helper functions very
    carefully.  If you change the signature here, you'll have to change
    the parameters passed to it in the IR calls constructed by
    guest-x86/toIR.c.

    Some of this code/logic is derived from QEMU, which is copyright
    Fabrice Bellard, licensed under the LGPL.  It is used with
    permission.
 */

 /* Set to 1 to get detailed profiling info about use of the flag
    machinery.  When enabled, the flag helpers in this file count
    their calls per cc_op / condition, and
    x86g_calculate_condition prints the accumulated table via
    showCounts() each time the call counters hit a multiple of
    0x80000. */
 #define PROFILE_EFLAGS 0


 /* Byte -> parity-flag lookup.  Entry i holds X86G_CC_MASK_P when i
    has an even number of 1 bits, and 0 when the count is odd.

    The 256 entries are generated two index bits at a time: within
    each group of four, the entries for indices whose two bits are
    01 or 10 have the opposite parity to those for 00 and 11.
    PTAB_8 therefore expands to the full table, starting from the
    value for index 0 (even parity, so X86G_CC_MASK_P). */
 #define PTAB_2(p)  (p), ((p) ^ (X86G_CC_MASK_P)),               \
                    ((p) ^ (X86G_CC_MASK_P)), (p)
 #define PTAB_4(p)  PTAB_2(p), PTAB_2((p) ^ (X86G_CC_MASK_P)),   \
                    PTAB_2((p) ^ (X86G_CC_MASK_P)), PTAB_2(p)
 #define PTAB_6(p)  PTAB_4(p), PTAB_4((p) ^ (X86G_CC_MASK_P)),   \
                    PTAB_4((p) ^ (X86G_CC_MASK_P)), PTAB_4(p)
 #define PTAB_8(p)  PTAB_6(p), PTAB_6((p) ^ (X86G_CC_MASK_P)),   \
                    PTAB_6((p) ^ (X86G_CC_MASK_P)), PTAB_6(p)

 static const UChar parity_table[256] = {
     PTAB_8(X86G_CC_MASK_P)
 };

 #undef PTAB_8
 #undef PTAB_6
 #undef PTAB_4
 #undef PTAB_2

 /* Bidirectional shift: shifts x left by n when n >= 0, otherwise
    right by -n.  Callers in this file use it to move a chosen bit of
    a value to a fixed flag position and then mask it.

    n must be a constant to be efficient, and its magnitude must be
    smaller than the width of Int.

    The left shift is carried out on the unsigned representation:
    left-shifting a negative signed value is undefined behaviour in
    C, whereas the unsigned shift simply discards the high bits.
    NOTE(review): this assumes Int has the same width as
    'unsigned int' -- confirm against libvex_basictypes.h.
    The right shift is left on the signed value, as before, so the
    sign-propagation behaviour for negative x is unchanged. */
 inline static Int lshift ( Int x, Int n )
 {
    if (n >= 0)
       return (Int)((unsigned int)x << n);
    else
       return x >> (-n);
 }


 /* Common prologue for the ACTIONS_* macros.  Expects the enclosing
    function to have parameters named cc_dep1_formal, cc_dep2_formal
    and cc_ndep_formal, and declares:

       DATA_MASK  - all-ones mask for an nbits-wide value
       SIGN_MASK  - the top (sign) bit of an nbits-wide value
       CC_DEP1, CC_DEP2, CC_NDEP - local copies of the formals

    nbits is expected to be 8, 16 or 32.  SIGN_MASK is built from an
    unsigned 1 so that the 32-bit case does not shift into the sign
    bit of a signed int (which would be undefined behaviour). */
 #define PREAMBLE(nbits)						\
    UInt DATA_MASK 						\
       = (nbits)==8 ? 0xFF 					\
                    : ((nbits)==16 ? 0xFFFF 			\
                                   : 0xFFFFFFFF); 		\
    UInt SIGN_MASK = 1U << ((nbits) - 1);			\
    UInt CC_DEP1 = cc_dep1_formal;				\
    UInt CC_DEP2 = cc_dep2_formal;				\
    UInt CC_NDEP = cc_ndep_formal;				\
    /* Not every user reads all of these; the void casts   */	\
    /* stop the compiler complaining about unused ones.    */	\
    (void)SIGN_MASK;						\
    (void)DATA_MASK;						\
    (void)CC_DEP2;						\
    (void)CC_NDEP;


 /*-------------------------------------------------------------*/

 /* Flag evaluation for ADD.  DEP1 and DEP2 are the two operands; the
    sum is recomputed here.  Expands to a block that returns the six
    flags OR-ed together, each already at its own bit position.

    The arithmetic is done on UInt so that wraparound is well
    defined; the same computation on signed Int would be undefined
    behaviour whenever the addition overflows. */
 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int  cf, pf, af, zf, sf, of;				\
      UInt argL, argR, res;					\
      argL = CC_DEP1;						\
      argR = CC_DEP2;						\
      res  = argL + argR;					\
      /* carry: the truncated sum wrapped below argL */		\
      cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
      pf = parity_table[(UChar)res];				\
      af = (res ^ argL ^ argR) & 0x10;				\
      zf = ((DATA_UTYPE)res == 0) << 6;				\
      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
      /* overflow: operands agree in sign, result differs */	\
      of = lshift(~(argL ^ argR) & (argL ^ res), 		\
                  12 - DATA_BITS) & X86G_CC_MASK_O;		\
      return cf | pf | af | zf | sf | of;			\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for SUB (and CMP).  DEP1 is the left operand and
    DEP2 the right; the difference is recomputed here.  Expands to a
    block that returns the six flags OR-ed together.

    The arithmetic is done on UInt so that wraparound is well
    defined; the same computation on signed Int would be undefined
    behaviour whenever the subtraction overflows. */
 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int  cf, pf, af, zf, sf, of;				\
      UInt argL, argR, res;					\
      argL = CC_DEP1;						\
      argR = CC_DEP2;						\
      res  = argL - argR;					\
      /* carry (borrow): unsigned argL is below argR */		\
      cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
      pf = parity_table[(UChar)res];				\
      af = (res ^ argL ^ argR) & 0x10;				\
      zf = ((DATA_UTYPE)res == 0) << 6;				\
      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
      /* overflow: operands differ in sign, and so do */	\
      /* argL and the result */					\
      of = lshift((argL ^ argR) & (argL ^ res),	 		\
                  12 - DATA_BITS) & X86G_CC_MASK_O; 		\
      return cf | pf | af | zf | sf | of;			\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for ADC (add with carry).  DEP1 is the left
    operand; DEP2 holds the right operand XOR-ed with the old carry,
    and NDEP holds the old flags, from which the old carry bit is
    taken.  Expands to a block that returns the six flags OR-ed
    together.

    The arithmetic is done on UInt so that wraparound is well
    defined; the same computation on signed Int would be undefined
    behaviour whenever the addition overflows. */
 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int  cf, pf, af, zf, sf, of;				\
      UInt argL, argR, oldC, res;		       		\
      oldC = CC_NDEP & X86G_CC_MASK_C;				\
      argL = CC_DEP1;						\
      argR = CC_DEP2 ^ oldC;	       				\
      res  = (argL + argR) + oldC;				\
      /* with a carry-in, res == argL also means carry-out */	\
      if (oldC)							\
         cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
      else							\
         cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
      pf = parity_table[(UChar)res];				\
      af = (res ^ argL ^ argR) & 0x10;				\
      zf = ((DATA_UTYPE)res == 0) << 6;				\
      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
      of = lshift(~(argL ^ argR) & (argL ^ res), 		\
                   12 - DATA_BITS) & X86G_CC_MASK_O;		\
      return cf | pf | af | zf | sf | of;			\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for SBB (subtract with borrow).  DEP1 is the left
    operand; DEP2 holds the right operand XOR-ed with the old carry,
    and NDEP holds the old flags, from which the old carry bit is
    taken.  Expands to a block that returns the six flags OR-ed
    together.

    The arithmetic is done on UInt so that wraparound is well
    defined; the same computation on signed Int would be undefined
    behaviour whenever the subtraction overflows. */
 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int  cf, pf, af, zf, sf, of;				\
      UInt argL, argR, oldC, res;		       		\
      oldC = CC_NDEP & X86G_CC_MASK_C;				\
      argL = CC_DEP1;						\
      argR = CC_DEP2 ^ oldC;	       				\
      res  = (argL - argR) - oldC;				\
      /* with a borrow-in, argL == argR also borrows */		\
      if (oldC)							\
         cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
      else							\
         cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
      pf = parity_table[(UChar)res];				\
      af = (res ^ argL ^ argR) & 0x10;				\
      zf = ((DATA_UTYPE)res == 0) << 6;				\
      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
      of = lshift((argL ^ argR) & (argL ^ res), 			\
                  12 - DATA_BITS) & X86G_CC_MASK_O;		\
      return cf | pf | af | zf | sf | of;			\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for the logical operations.  DEP1 is the result.
    Only the parity, zero and sign flags can be set; carry,
    auxiliary carry and overflow are always zero.  Expands to a
    block that returns the flags OR-ed together. */
 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { const Int parity = parity_table[(UChar)CC_DEP1];		\
      const Int zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;	\
      const Int sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;	\
      /* cf, af and of contribute nothing */			\
      return parity | zero | sign;				\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for INC.  DEP1 is the result; the original value
    is reconstructed as result - 1.  The carry flag is not affected
    by the operation and is copied from the old flags in NDEP.
    Expands to a block that returns the six flags OR-ed together.

    The reconstruction is done on UInt: with signed Int, "res - 1"
    overflows (undefined behaviour) when a 32-bit result is
    0x80000000, which is exactly the case that sets the overflow
    flag. */
 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int  cf, pf, af, zf, sf, of;				\
      UInt argL, argR, res;					\
      res  = CC_DEP1;						\
      argL = res - 1;						\
      argR = 1;							\
      cf = CC_NDEP & X86G_CC_MASK_C;				\
      pf = parity_table[(UChar)res];				\
      af = (res ^ argL ^ argR) & 0x10;				\
      zf = ((DATA_UTYPE)res == 0) << 6;				\
      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
      /* overflow iff the result is the sign bit alone */	\
      of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
      return cf | pf | af | zf | sf | of;			\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for DEC.  DEP1 is the result; the original value
    is reconstructed as result + 1.  The carry flag is not affected
    by the operation and is copied from the old flags in NDEP.
    Expands to a block that returns the six flags OR-ed together.

    The reconstruction is done on UInt: with signed Int, "res + 1"
    overflows (undefined behaviour) when a 32-bit result is
    0x7FFFFFFF, which is exactly the case that sets the overflow
    flag. */
 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int  cf, pf, af, zf, sf, of;				\
      UInt argL, argR, res;					\
      res  = CC_DEP1;						\
      argL = res + 1;						\
      argR = 1;							\
      cf = CC_NDEP & X86G_CC_MASK_C;				\
      pf = parity_table[(UChar)res];				\
      af = (res ^ argL ^ argR) & 0x10;				\
      zf = ((DATA_UTYPE)res == 0) << 6;				\
      sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
      /* overflow iff the result is every bit below the */	\
      /* sign bit, i.e. SIGN_MASK - 1 */			\
      of = ((res & DATA_MASK) 					\
           == ((UInt)SIGN_MASK - 1)) << 11;			\
      return cf | pf | af | zf | sf | of;			\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for SHL.  DEP1 is treated as the result (it
    drives parity, zero and sign); the carry is read from bit
    DATA_BITS-1 of DEP2.  The auxiliary carry is undefined for this
    operation and is reported as zero.  Expands to a block that
    returns the flags OR-ed together. */
 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int carry, parity, zero, sign, ovf;			\
      parity = parity_table[(UChar)CC_DEP1];			\
      zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
      sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
      carry  = (CC_DEP2 >> (DATA_BITS - 1)) & X86G_CC_MASK_C;	\
      /* overflow is only defined if shift count == 1 */	\
      ovf    = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)	\
               & X86G_CC_MASK_O;				\
      return carry | parity | zero | sign | ovf;		\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for SHR.  DEP1 is treated as the result (it
    drives parity, zero and sign); the carry is the lowest bit of
    DEP2.  The auxiliary carry is undefined for this operation and
    is reported as zero.  Expands to a block that returns the flags
    OR-ed together. */
 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);  					\
    { Int carry, parity, zero, sign, ovf;			\
      parity = parity_table[(UChar)CC_DEP1];			\
      zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
      sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
      carry  = CC_DEP2 & 1;					\
      /* overflow is only defined if shift count == 1 */	\
      ovf    = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)	\
               & X86G_CC_MASK_O;				\
      return carry | parity | zero | sign | ovf;		\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
 /* DEP1 = result, NDEP = old flags */
 /* All flags other than C and O are passed through unchanged from
    NDEP.  The two lshift calls move the result's top bit and bottom
    bit, respectively, to bit 11 so they can be XOR-ed and masked
    with X86G_CC_MASK_O. */
 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int msb_at_11 = lshift(CC_DEP1, 11-(DATA_BITS-1));	\
      Int lsb_at_11 = lshift(CC_DEP1, 11);			\
      Int fl = CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C);	\
      fl |= X86G_CC_MASK_C & CC_DEP1;				\
      fl |= X86G_CC_MASK_O & (msb_at_11 ^ lsb_at_11);		\
      return fl;							\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
 /* DEP1 = result, NDEP = old flags */
 /* All flags other than C and O are passed through unchanged from
    NDEP.  The two lshift calls move the result's top bit and the
    bit just below it, respectively, to bit 11 so they can be XOR-ed
    and masked with X86G_CC_MASK_O. */
 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
 {								\
    PREAMBLE(DATA_BITS);						\
    { Int msb_at_11  = lshift(CC_DEP1, 11-(DATA_BITS-1));	\
      Int msb1_at_11 = lshift(CC_DEP1, 11-(DATA_BITS-1)+1);	\
      Int fl = CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C);	\
      fl |= X86G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1));	\
      fl |= X86G_CC_MASK_O & (msb_at_11 ^ msb1_at_11);		\
      return fl;							\
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for unsigned multiply.  DEP1 and DEP2 are the
    operands, each truncated to DATA_UTYPE; DATA_U2TYPE must be an
    unsigned type twice as wide, able to hold the full product.
    Carry and overflow are both set iff the high half of the product
    is non-zero; parity, zero and sign come from the low half.
    Expands to a block that returns the flags OR-ed together.

    The low half is taken from the double-width product rather than
    from a separate narrow multiply: multiplying two 16-bit values
    directly promotes them to signed int, and 0xFFFF * 0xFFFF then
    overflows, which is undefined behaviour. */
 #define ACTIONS_UMUL(DATA_BITS,DATA_UTYPE,DATA_U2TYPE)          \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Int cf, pf, af, zf, sf, of;                                \
      DATA_U2TYPE rr = ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))      \
                       * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2));   \
      DATA_UTYPE  lo = (DATA_UTYPE)rr;                           \
      DATA_UTYPE  hi = (DATA_UTYPE)(rr >>/*u*/ DATA_BITS);       \
      cf = (hi != 0);                                            \
      pf = parity_table[(UChar)lo];                              \
      af = 0; /* undefined */                                    \
      zf = (lo == 0) << 6;                                       \
      sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
      of = cf << 11;                                             \
      return cf | pf | af | zf | sf | of;                        \
    }								\
 }

 /*-------------------------------------------------------------*/

 /* Flag evaluation for signed multiply.  DEP1 and DEP2 are the
    operands, each truncated to DATA_STYPE; DATA_S2TYPE must be a
    signed type twice as wide, able to hold the full product.
    Carry and overflow are both set iff the high half of the product
    is not simply the sign extension of the low half; parity, zero
    and sign come from the low half.  Expands to a block that
    returns the flags OR-ed together.

    The low half is taken from the double-width product rather than
    from a separate narrow multiply: for the 32-bit case the narrow
    multiply is a signed int multiplication that can overflow, which
    is undefined behaviour. */
 #define ACTIONS_SMUL(DATA_BITS,DATA_STYPE,DATA_S2TYPE)          \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Int cf, pf, af, zf, sf, of;                                \
      DATA_S2TYPE rr = ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))      \
                       * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2));   \
      DATA_STYPE  lo = (DATA_STYPE)rr;                           \
      DATA_STYPE  hi = (DATA_STYPE)(rr >>/*s*/ DATA_BITS);       \
      cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
      pf = parity_table[(UChar)lo];                              \
      af = 0; /* undefined */                                    \
      zf = (lo == 0) << 6;                                       \
      sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
      of = cf << 11;                                             \
      return cf | pf | af | zf | sf | of;                        \
    }								\
 }


 #if PROFILE_EFLAGS

 /* Per-cc_op count of x86g_calculate_eflags_c calls that were not
    handled by its fast cases. */
 static UInt tabc[X86G_CC_OP_NUMBER];
 /* Per-(cc_op, condition code) count of x86g_calculate_condition
    calls. */
 static UInt tab[X86G_CC_OP_NUMBER][16];
 /* Set by initCounts() once the two tables have been zeroed. */
 static Bool initted     = False;
 /* Call totals.  n_calc_all counts every entry to
    x86g_calculate_eflags_all, including those made on behalf of the
    other two helpers, which is why showCounts() subtracts them. */
 static UInt n_calc_cond = 0;
 static UInt n_calc_all  = 0;
 static UInt n_calc_c    = 0;

 /* Print the profiling counters: first the call totals, then one
    row per cc_op giving the carry-helper count (tabc) followed by
    the count for each of the 16 condition codes (tab).  Counts of
    1000 or more are shown in thousands with a 'K' suffix. */
 static void showCounts ( void )
 {
    Int op, co;

    vex_printf("\nALL=%d  COND=%d   C=%d\n",
               n_calc_all-n_calc_cond-n_calc_c, n_calc_cond, n_calc_c);
    vex_printf("      CARRY    O   NO    B   NB    Z   NZ   BE  NBE"
               "    S   NS    P   NP    L   NL   LE  NLE\n");
    vex_printf("     ----------------------------------------------"
               "----------------------------------------\n");

    for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
       /* Op 0 gets no size letter; from op 1 onwards the letters
          cycle B, W, L. */
       Char ch = op > 0 ? "BWL"[(op-1) % 3] : ' ';

       vex_printf("%2d%c: ", op, ch);
       vex_printf("%6d ", tabc[op]);

       for (co = 0; co < 16; co++) {
          Int n = tab[op][co];
          if (n < 0)
             vex_printf("     ");
          else if (n < 1000)
             vex_printf(" %3d ", n );
          else
             vex_printf(" %3dK", n / 1000);
       }
       vex_printf("\n");
    }
    vex_printf("\n");
 }

 /* Zero both profiling tables (tabc and tab) and record, via
    'initted', that this has been done. */
 static void initCounts ( void )
 {
    Int op = 0;
    while (op < X86G_CC_OP_NUMBER) {
       Int co = 0;
       tabc[op] = 0;
       while (co < 16) {
          tab[op][co] = 0;
          co++;
       }
       op++;
    }
    initted = True;
 }

 #endif /* PROFILE_EFLAGS */

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Calculate all the 6 flags from the supplied thunk parameters.

    cc_op selects which operation the thunk describes; the meaning of
    the dep1/dep2/ndep values depends on cc_op and is defined by the
    corresponding ACTIONS_* macro.  The result has the O, S, Z, A, C
    and P flags each at its own bit position.

    The three value parameters must keep the names cc_dep1_formal,
    cc_dep2_formal and cc_ndep_formal: the PREAMBLE macro, used by
    every ACTIONS_* macro, refers to them by name.

    An unrecognised cc_op is reported with vex_printf and then
    vpanic is called. */
 UInt x86g_calculate_eflags_all ( UInt cc_op,
                                  UInt cc_dep1_formal,
                                  UInt cc_dep2_formal,
                                 UInt cc_ndep_formal )
 {
 #  if PROFILE_EFLAGS
    n_calc_all++;
 #  endif
    /* Each ACTIONS_* macro expands to a block ending in 'return', so
       no case falls through into the next. */
    switch (cc_op) {
       case X86G_CC_OP_COPY:
          /* The flags are held directly in dep1; just mask them out. */
          return cc_dep1_formal
                 & (X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z
                    | X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P);

       case X86G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
       case X86G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
       case X86G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );

       case X86G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
       case X86G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
       case X86G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );

       case X86G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
       case X86G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
       case X86G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );

       case X86G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
       case X86G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
       case X86G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );

       case X86G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
       case X86G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
       case X86G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );

       case X86G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
       case X86G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
       case X86G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );

       case X86G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
       case X86G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
       case X86G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );

       case X86G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
       case X86G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
       case X86G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );

       case X86G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
       case X86G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
       case X86G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );

       case X86G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
       case X86G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
       case X86G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );

       case X86G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
       case X86G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
       case X86G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );

       /* The multiplies also need the double-width type, to hold the
          full product. */
       case X86G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  UShort );
       case X86G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, UInt   );
       case X86G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   ULong  );

       case X86G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   Short );
       case X86G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  Int   );
       case X86G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    Long  );

       default:
          /* shouldn't really make these calls from generated code */
          vex_printf("calculate_eflags_all(X86)( %d, 0x%x, 0x%x, 0x%x )\n",
                     cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
          /* NOTE(review): vpanic is presumably non-returning; nothing
             is returned after it. */
          vpanic("calculate_eflags_all(X86)");
    }
 }


 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Calculate just the carry flag from the supplied thunk parameters.

    Returns either 0 or X86G_CC_MASK_C.  A few common operations are
    answered directly; everything else is delegated to
    x86g_calculate_eflags_all and masked down to the carry bit.  Only
    delegated calls are counted when PROFILE_EFLAGS is enabled. */
 UInt x86g_calculate_eflags_c ( UInt cc_op,
                                UInt cc_dep1,
                                UInt cc_dep2,
                                UInt cc_ndep )
 {
    /* Fast-case some common ones. */
    switch (cc_op) {
       case X86G_CC_OP_LOGICL:
       case X86G_CC_OP_LOGICW:
       case X86G_CC_OP_LOGICB:
          /* Logical operations always clear the carry. */
          return 0;
       case X86G_CC_OP_SUBL:
          /* Carry after a 32-bit subtract means unsigned dep1 < dep2. */
          return cc_dep1 < cc_dep2 ? X86G_CC_MASK_C : 0;
       default:
          break;
    }

 #  if PROFILE_EFLAGS
    if (!initted)
       initCounts();
    /* cc_op has not been validated yet (that happens in the call
       below), so keep the table index in range. */
    if (cc_op < X86G_CC_OP_NUMBER)
       tabc[cc_op]++;

    n_calc_c++;
 #  endif
    return x86g_calculate_eflags_all(cc_op,cc_dep1,cc_dep2,cc_ndep)
           & X86G_CC_MASK_C;
 }


 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* returns 1 or 0 */
 /* Evaluate the condition code 'cond' against the flags described by
    the thunk (cc_op, cc_dep1, cc_dep2, cc_ndep).  All six flags are
    computed first via x86g_calculate_eflags_all, then the relevant
    ones are tested.

    Each condition and its negation share a case below; the low bit
    of 'cond' ('inv') is XOR-ed into the result to tell them apart.
    NOTE(review): this relies on the negated member of each pair
    being the one with bit 0 set -- confirm against the X86Condcode
    definition.

    An unrecognised 'cond' is reported with vex_printf and then
    vpanic is called. */
 UInt x86g_calculate_condition ( UInt/*X86Condcode*/ cond,
                                 UInt cc_op,
                                 UInt cc_dep1,
                                 UInt cc_dep2,
                                 UInt cc_ndep )
 {
    UInt eflags = x86g_calculate_eflags_all(cc_op, cc_dep1,
                                            cc_dep2, cc_ndep);
    UInt of,sf,zf,cf,pf;
    UInt inv = cond & 1;

 #  if PROFILE_EFLAGS
    if (!initted)
      initCounts();

    /* 'cond' is only validated by the switch below, so keep the
       index within the 16 columns of 'tab'.  cc_op is already known
       to be valid: the call above would have panicked otherwise. */
    if (cond < 16)
       tab[cc_op][cond]++;
    n_calc_cond++;

    if (0 == ((n_calc_all+n_calc_c) & 0x7FFFF)) showCounts();
 #  endif

    /* The shifted flag values still carry higher bits; the final
       "1 &" in each case discards them. */
    switch (cond) {
       case X86CondNO:
       case X86CondO: /* OF == 1 */
          of = eflags >> X86G_CC_SHIFT_O;
          return 1 & (inv ^ of);

       case X86CondNZ:
       case X86CondZ: /* ZF == 1 */
          zf = eflags >> X86G_CC_SHIFT_Z;
          return 1 & (inv ^ zf);

       case X86CondNB:
       case X86CondB: /* CF == 1 */
          cf = eflags >> X86G_CC_SHIFT_C;
          return 1 & (inv ^ cf);

       case X86CondNBE:
       case X86CondBE: /* (CF or ZF) == 1 */
          cf = eflags >> X86G_CC_SHIFT_C;
          zf = eflags >> X86G_CC_SHIFT_Z;
          return 1 & (inv ^ (cf | zf));

       case X86CondNS:
       case X86CondS: /* SF == 1 */
          sf = eflags >> X86G_CC_SHIFT_S;
          return 1 & (inv ^ sf);

       case X86CondNP:
       case X86CondP: /* PF == 1 */
          pf = eflags >> X86G_CC_SHIFT_P;
          return 1 & (inv ^ pf);

       case X86CondNL:
       case X86CondL: /* (SF xor OF) == 1 */
          sf = eflags >> X86G_CC_SHIFT_S;
          of = eflags >> X86G_CC_SHIFT_O;
          return 1 & (inv ^ (sf ^ of));

       case X86CondNLE:
       case X86CondLE: /* ((SF xor OF) or ZF)  == 1 */
          sf = eflags >> X86G_CC_SHIFT_S;
          of = eflags >> X86G_CC_SHIFT_O;
          zf = eflags >> X86G_CC_SHIFT_Z;
          return 1 & (inv ^ ((sf ^ of) | zf));

       default:
          /* shouldn't really make these calls from generated code */
          vex_printf("calculate_condition( %d, %d, 0x%x, 0x%x, 0x%x )\n",
                     cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
          /* NOTE(review): vpanic is presumably non-returning; nothing
             is returned after it. */
          vpanic("calculate_condition");
    }
 }


 /* Used by the optimiser to try specialisations.  Returns an
    equivalent expression, or NULL if none. */

 /* Return whether expression e is a constant of type U32 whose value
    is exactly n. */
 static Bool isU32 ( IRExpr* e, UInt n )
 {
    if (e->tag != Iex_Const)
       return False;
    if (e->Iex.Const.con->tag != Ico_U32)
       return False;
    return e->Iex.Const.con->Ico.U32 == n;
 }

 IRExpr* guest_x86_spechelper ( Char* function_name,
                                IRExpr** args )
 {
 #  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
 #  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
 #  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
 #  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

    Int i, arity = 0;
    for (i = 0; args[i]; i++)
       arity++;
 #  if 0
    vex_printf("spec request:\n");
    vex_printf("   %s  ", function_name);
    for (i = 0; i < arity; i++) {
       vex_printf("  ");
       ppIRExpr(args[i]);
    }
    vex_printf("\n");
 #  endif

    /* --------- specialising "x86g_calculate_eflags_c" --------- */

    if (vex_streq(function_name, "x86g_calculate_eflags_c")) {
       /* specialise calls to above "calculate_eflags_c" function */
       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
       vassert(arity == 4);
       cc_op   = args[0];
       cc_dep1 = args[1];
       cc_dep2 = args[2];
       cc_ndep = args[3];

       if (isU32(cc_op, X86G_CC_OP_SUBL)) {
          /* C after sub denotes unsigned less than */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
       }
       if (isU32(cc_op, X86G_CC_OP_LOGICL)) {
          /* cflag after logic is zero */
          return mkU32(0);
       }
       if (isU32(cc_op, X86G_CC_OP_DECL) || isU32(cc_op, X86G_CC_OP_INCL)) {
          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
          return cc_ndep;
       }
       if (isU32(cc_op, X86G_CC_OP_COPY)) {
          /* cflag after COPY is stored in DEP1. */
          return
             binop(
                Iop_And32,
                binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
                mkU32(1)
             );
       }
 #     if 0
       if (cc_op->tag == Iex_Const) {
          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
       }
 #     endif

       return NULL;
    }

    /* --------- specialising "x86g_calculate_condition" --------- */

    if (vex_streq(function_name, "x86g_calculate_condition")) {
       /* specialise calls to above "calculate condition" function */
       IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
       vassert(arity == 5);
       cond    = args[0];
       cc_op   = args[1];
       cc_dep1 = args[2];
       cc_dep2 = args[3];
       cc_ndep = args[4];

       /*---------------- ADDL ----------------*/

       if (isU32(cc_op, X86G_CC_OP_ADDL) && isU32(cond, X86CondZ)) {
          /* long add, then Z --> test (dst+src == 0) */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpEQ32,
                            binop(Iop_Add32, cc_dep1, cc_dep2),
                            mkU32(0)));
       }

       /*---------------- SUBL ----------------*/

       if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondZ)) {
          /* long sub/cmp, then Z --> test dst==src */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpEQ32, cc_dep1, cc_dep2));
       }

       if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondL)) {
          /* long sub/cmp, then L (signed less than)
             --> test dst <s src */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpLT32S, cc_dep1, cc_dep2));
       }

       if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondLE)) {
          /* long sub/cmp, then LE (signed less than or equal)
             --> test dst <=s src */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpLE32S, cc_dep1, cc_dep2));
       }

       if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondBE)) {
          /* long sub/cmp, then BE (unsigned less than or equal)
             --> test dst <=u src */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
       }
 #if 0
       if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondB)) {
          /* long sub/cmp, then B (unsigned less than)
             --> test dst <u src */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpLT32U, cc_dst, cc_src));
       }
 #endif
       /*---------------- SUBW ----------------*/

       if (isU32(cc_op, X86G_CC_OP_SUBW) && isU32(cond, X86CondZ)) {
          /* byte sub/cmp, then Z --> test dst==src */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpEQ16,
                            unop(Iop_32to16,cc_dep1),
                            unop(Iop_32to16,cc_dep2)));
       }

       /*---------------- SUBB ----------------*/

       if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondZ)) {
          /* byte sub/cmp, then Z --> test dst==src */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpEQ8,
                            unop(Iop_32to8,cc_dep1),
                            unop(Iop_32to8,cc_dep2)));
       }

       if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNZ)) {
          /* byte sub/cmp, then NZ --> test dst!=src */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpNE8,
                            unop(Iop_32to8,cc_dep1),
                            unop(Iop_32to8,cc_dep2)));
       }

       if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNBE)) {
          /* long sub/cmp, then NBE (unsigned greater than)
             --> test src <=u dst */
          /* Note, args are opposite way round from the usual */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpLT32U,
                            binop(Iop_And32,cc_dep2,mkU32(0xFF)),
 			   binop(Iop_And32,cc_dep1,mkU32(0xFF))));
       }

       /*---------------- LOGICL ----------------*/

       if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondZ)) {
          /* long and/or/xor, then Z --> test dst==0 */
          return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
       }

       if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondS)) {
          /* long and/or/xor, then S --> test dst <s 0 */
          return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
       }

       if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondLE)) {
          /* long and/or/xor, then LE
             This is pretty subtle.  LOGIC sets SF and ZF according to the
             result and makes OF be zero.  LE computes (SZ ^ OF) | ZF, but
             OF is zero, so this reduces to SZ | ZF -- which will be 1 iff
             the result is <=signed 0.  Hence ...
          */
          return unop(Iop_1Uto32,binop(Iop_CmpLE32S, cc_dep1, mkU32(0)));
       }

       /*---------------- LOGICW ----------------*/

       if (isU32(cc_op, X86G_CC_OP_LOGICW) && isU32(cond, X86CondZ)) {
          /* byte and/or/xor, then Z --> test dst==0 */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(0xFFFF)),
                                         mkU32(0)));
       }

       /*---------------- LOGICB ----------------*/

       if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondZ)) {
          /* byte and/or/xor, then Z --> test dst==0 */
          return unop(Iop_1Uto32,
                      binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(255)),
                                         mkU32(0)));
       }

       /*---------------- DECL ----------------*/

       if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondZ)) {
          /* dec L, then Z --> test dst == 0 */
          return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
       }

       if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondS)) {
          /* dec L, then S --> compare DST <s 0 */
          return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
       }

       /*---------------- SHRL ----------------*/

       if (isU32(cc_op, X86G_CC_OP_SHRL) && isU32(cond, X86CondZ)) {
          /* SHRL, then Z --> test dep1 == 0 */
          return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
       }

       /*---------------- COPY ----------------*/
       /* This can happen, as a result of x87 FP compares: "fcom ... ;
          fnstsw %ax ; sahf ; jbe" for example. */

       if (isU32(cc_op, X86G_CC_OP_COPY) && isU32(cond, X86CondBE)) {
          /* COPY, then BE --> extract C and Z from dep1, and test (C
             or Z == 1). */
          return
             unop(
                Iop_1Uto32,
                binop(
                   Iop_CmpNE32,
                   binop(
                      Iop_And32,
                      binop(
                         Iop_Or32,
                         binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
                         binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_Z))
                      ),
                      mkU32(1)
                   ),
                   mkU32(0)
                )
             );
       }

       if (isU32(cc_op, X86G_CC_OP_COPY) && isU32(cond, X86CondB)) {
          /* COPY, then B --> extract C dep1, and test (C == 1). */
          return
             unop(
                Iop_1Uto32,
                binop(
                   Iop_CmpNE32,
                   binop(
                      Iop_And32,
                      binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
                      mkU32(1)
                   ),
                   mkU32(0)
                )
             );
       }

       return NULL;
    }

 #  undef unop
 #  undef binop
 #  undef mkU32
 #  undef mkU8

    return NULL;
 }


 /*-----------------------------------------------------------*/
 /*--- Utility functions for x87 FPU conversions.          ---*/
 /*-----------------------------------------------------------*/


 /* 80 and 64-bit floating point formats:

    80-bit:

     S  0       0-------0      zero
     S  0       0X------X      denormals
     S  1-7FFE  1X------X      normals (all normals have leading 1)
     S  7FFF    10------0      infinity
     S  7FFF    10X-----X      snan
     S  7FFF    11X-----X      qnan

    S is the sign bit.  For runs X----X, at least one of the Xs must be
    nonzero.  Exponent is 15 bits, fractional part is 63 bits, and
    there is an explicitly represented leading 1, and a sign bit,
    giving 80 in total.

    64-bit avoids the confusion of an explicitly represented leading 1
    and so is simpler:

     S  0      0------0   zero
     S  0      X------X   denormals
     S  1-7FE  any        normals
     S  7FF    0------0   infinity
     S  7FF    0X-----X   snan
     S  7FF    1X-----X   qnan

    Exponent is 11 bits, fractional part is 52 bits, and there is a
    sign bit, giving 64 in total.
 */

 static inline Bool host_is_little_endian ( void )
 {
    UInt x = 0x76543210;
    UChar* p = (UChar*)(&x);
    return (*p == 0x10);
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Classify a value the way the x87 FXAM instruction does.

    tag  -- 0 if the register is empty, nonzero otherwise.
    dbl  -- the register contents as an IEEE754 64-bit double.

    Returns the condition codes as an OR of X86G_FC_MASK_C3,
    X86G_FC_MASK_C2 and X86G_FC_MASK_C0, with the sign of the value
    placed in the C1 position (bit 9).  The quadruples in the comments
    below are listed in the order C3,C2,C1,C0.

    The sign has to be shifted into bit 9 before being merged in.
    ORing in the raw 0/1 sign value would land it in bit 0, which is
    not one of the condition code bits, and C1 would never be set.

    Only works on a little-endian host (asserted).
 */
 UInt x86g_calculate_FXAM ( UInt tag, ULong dbl )
 {
    Bool   mantissaIsZero;
    Int    bexp;
    UChar  sign;
    UInt   c1;
    UChar* f64;

    vassert(host_is_little_endian());

    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */

    f64  = (UChar*)(&dbl);
    sign = (f64[7] >> 7) & 1;

    /* The sign, moved to where C1 lives.  Needed by every case,
       including the empty one, so compute it up front. */
    c1 = ((UInt)sign) << 9;

    /* First off, if the tag indicates the register was empty,
       return 1,0,sign,1 */
    if (tag == 0) {
       /* vex_printf("Empty\n"); */
       return X86G_FC_MASK_C3 | 0 | c1 | X86G_FC_MASK_C0;
    }

    /* Biased exponent: bits 62:52 of the double. */
    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
    bexp &= 0x7FF;

    /* Mantissa: bits 51:0 of the double. */
    mantissaIsZero
       = (f64[6] & 0x0F) == 0
         && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0;

    /* If both exponent and mantissa are zero, the value is zero.
       Return 1,0,sign,0. */
    if (bexp == 0 && mantissaIsZero) {
       /* vex_printf("Zero\n"); */
       return X86G_FC_MASK_C3 | 0 | c1 | 0;
    }

    /* If exponent is zero but mantissa isn't, it's a denormal.
       Return 1,1,sign,0. */
    if (bexp == 0 && !mantissaIsZero) {
       /* vex_printf("Denormal\n"); */
       return X86G_FC_MASK_C3 | X86G_FC_MASK_C2 | c1 | 0;
    }

    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
       Return 0,1,sign,1. */
    if (bexp == 0x7FF && mantissaIsZero) {
       /* vex_printf("Inf\n"); */
       return 0 | X86G_FC_MASK_C2 | c1 | X86G_FC_MASK_C0;
    }

    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
       Return 0,0,sign,1. */
    if (bexp == 0x7FF && !mantissaIsZero) {
       /* vex_printf("NaN\n"); */
       return 0 | 0 | c1 | X86G_FC_MASK_C0;
    }

    /* Uh, ok, we give up.  It must be a normal finite number.
       Return 0,1,sign,0.
    */
    /* vex_printf("normal\n"); */
    return 0 | X86G_FC_MASK_C2 | c1 | 0;
 }


 /////////////////////////////////////////////////////////////////

 /* Fetch bit number n from the byte array arr, where bit 0 is the
    least significant bit of arr[0], bit 8 the least significant bit
    of arr[1], and so on.  Returns 0 or 1. */
 static inline
 UInt read_bit_array ( UChar* arr, UInt n )
 {
    UInt byteIx = n / 8;
    UInt bitIx  = n % 8;
    return (arr[byteIx] >> bitIx) & 1;
 }

 /* Set bit number n of the byte array arr to the low bit of b, using
    the same bit numbering as read_bit_array.  The other bits of the
    affected byte are left alone. */
 static inline
 void write_bit_array ( UChar* arr, UInt n, UInt b )
 {
    UChar* slot  = &arr[n >> 3];
    UInt   bitIx = n & 7;

    if (b & 1)
       *slot |= (1 << bitIx);
    else
       *slot &= ~(1 << bitIx);
 }


 /* Convert a IEEE754 double (64-bit) into an x87 extended double
    (80-bit), mimicking the hardware fairly closely.  Both numbers are
    stored little-endian.

    f64 -- IN:  8 bytes holding the double.
    f80 -- OUT: 10 bytes which receive the extended double.

    Zeroes, denormals, normals and infinities are converted exactly
    (the 80-bit format has more exponent range and more mantissa bits
    than the 64-bit one).  Limitations, all of which could be fixed,
    given some level of hassle:

    * Identity of NaNs is not preserved: every QNaN becomes one fixed
      QNaN pattern and every SNaN one fixed SNaN pattern.

    See comments in the code for more details.
 */
 static void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 )
 {
    Bool  mantissaIsZero;
    Int   bexp, i, j, shift;
    UChar sign;

    sign = (f64[7] >> 7) & 1;
    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
    bexp &= 0x7FF;

    mantissaIsZero = False;
    if (bexp == 0 || bexp == 0x7FF) {
       /* We'll need to know whether or not the mantissa (bits 51:0) is
          all zeroes in order to handle these cases.  So figure it
          out. */
       mantissaIsZero
          = (f64[6] & 0x0F) == 0
            && f64[5] == 0 && f64[4] == 0 && f64[3] == 0
            && f64[2] == 0 && f64[1] == 0 && f64[0] == 0;
    }

    /* If the exponent is zero, either we have a zero or a denormal.
       Start from an appropriately signed zero; a denormal is then
       turned into a normalised extended double below. */
    if (bexp == 0) {
       f80[9] = sign << 7;
       f80[8] = f80[7] = f80[6] = f80[5] = f80[4]
              = f80[3] = f80[2] = f80[1] = f80[0] = 0;

       if (mantissaIsZero)
          /* It really is zero, so that's all we can do. */
          return;

       /* There is at least one 1-bit in the mantissa.  So it's a
          potentially denormalised double -- but we can produce a
          normalised long double.  Count the leading zeroes in the
          mantissa so as to decide how much to bump the exponent down
          by.  Note, this is SLOW. */
       shift = 0;
       for (i = 51; i >= 0; i--) {
         if (read_bit_array(f64, i))
            break;
         shift++;
       }

       /* and copy into place as many bits as we can get our hands on.
          The first bit copied is the leading 1, which lands in bit 63,
          the explicit integer bit of the 80-bit format. */
       j = 63;
       for (i = 51 - shift; i >= 0; i--) {
          write_bit_array( f80, j,
                           read_bit_array( f64, i ) );
          j--;
       }

       /* Set the exponent appropriately, and we're done. */
       bexp -= shift;
       bexp += (16383 - 1023);
       f80[9] = (sign << 7) | ((bexp >> 8) & 0xFF);
       f80[8] = bexp & 0xFF;
       return;
    }

    /* If the exponent is 7FF, this is either an Infinity, a SNaN or
       QNaN, as determined by examining bits 51:0, thus:
           0  ... 0    Inf
           0X ... X    SNaN
           1X ... X    QNaN
       where at least one of the Xs is not zero.
    */
    if (bexp == 0x7FF) {
       if (mantissaIsZero) {
          /* Produce an appropriately signed infinity:
             S 1--1 (15)  1  0--0 (63)
          */
          f80[9] = (sign << 7) | 0x7F;
          f80[8] = 0xFF;
          f80[7] = 0x80;
          f80[6] = f80[5] = f80[4] = f80[3]
                 = f80[2] = f80[1] = f80[0] = 0;
          return;
       }
       /* So it's either a QNaN or SNaN.  Distinguish by considering
          bit 51.  Note, this destroys all the trailing bits
          (identity?) of the NaN.  IEEE754 doesn't require preserving
          these (it only requires that there be one QNaN value and one
          SNaN value), but x87 does seem to have some ability to
          preserve them.  Anyway, here, the NaN's identity is
          destroyed.  Could be improved. */
       if (f64[6] & 8) {
          /* QNaN.  Make a QNaN:
             S 1--1 (15)  1  1--1 (63)
          */
          f80[9] = (sign << 7) | 0x7F;
          f80[8] = 0xFF;
          f80[7] = 0xFF;
          f80[6] = f80[5] = f80[4] = f80[3]
                 = f80[2] = f80[1] = f80[0] = 0xFF;
       } else {
          /* SNaN.  Make a SNaN:
             S 1--1 (15)  1  0  1--1 (62)
             The explicit integer bit (bit 63) must be 1 and the
             quiet bit (bit 62) 0, as per the "10X-----X" snan row of
             the format table.  Writing 0x7F here instead would clear
             the integer bit and set the quiet bit, which is not a
             SNaN encoding at all. */
          f80[9] = (sign << 7) | 0x7F;
          f80[8] = 0xFF;
          f80[7] = 0xBF;
          f80[6] = f80[5] = f80[4] = f80[3]
                 = f80[2] = f80[1] = f80[0] = 0xFF;
       }
       return;
    }

    /* It's not a zero, denormal, infinity or nan.  So it must be a
       normalised number.  Rebias the exponent and build the new
       number.  */
    bexp += (16383 - 1023);

    f80[9] = (sign << 7) | ((bexp >> 8) & 0xFF);
    f80[8] = bexp & 0xFF;
    /* Explicit leading 1 in bit 63, followed by the 52 mantissa bits
       of the double shifted up into bits 62:11; bits 10:0 are zero. */
    f80[7] = (1 << 7) | ((f64[6] << 3) & 0x78) | ((f64[5] >> 5) & 7);
    f80[6] = ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7);
    f80[5] = ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7);
    f80[4] = ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7);
    f80[3] = ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7);
    f80[2] = ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7);
    f80[1] = ((f64[0] << 3) & 0xF8);
    f80[0] = 0;
 }


 /////////////////////////////////////////////////////////////////

 /* Convert a x87 extended double (80-bit) into an IEEE 754 double
    (64-bit), mimicking the hardware fairly closely.  Both numbers are
    stored little-endian.

    f80 -- IN:  10 bytes holding the extended double.
    f64 -- OUT: 8 bytes which receive the double.

    Limitations, both of which could be fixed, given some level of
    hassle:

    * Rounding following truncation could be a bit better: values
      which become 64-bit denormals are rounded up whenever the first
      discarded bit is 1, without the round-half-to-even test that is
      applied to normalised results.

    * Identity of NaNs is not preserved.

    See comments in the code for more details.
 */
 static void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 )
 {
    Bool  isInf;
    Int   bexp, i, j;
    UChar sign;

    sign = (f80[9] >> 7) & 1;
    bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8];
    bexp &= 0x7FFF;

    /* If the exponent is zero, either we have a zero or a denormal.
       But an extended precision denormal becomes a double precision
       zero, so in either case, just produce the appropriately signed
       zero. */
    if (bexp == 0) {
       f64[7] = sign << 7;
       f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
       return;
    }

    /* If the exponent is 7FFF, this is either an Infinity, a SNaN or
       QNaN, as determined by examining bits 62:0, thus:
           0  ... 0    Inf
           0X ... X    SNaN
           1X ... X    QNaN
       where at least one of the Xs is not zero.
    */
    if (bexp == 0x7FFF) {
       isInf = (f80[7] & 0x7F) == 0
               && f80[6] == 0 && f80[5] == 0 && f80[4] == 0
               && f80[3] == 0 && f80[2] == 0 && f80[1] == 0 && f80[0] == 0;
       if (isInf) {
          if (0 == (f80[7] & 0x80))
             goto wierd_NaN;
          /* Produce an appropriately signed infinity:
             S 1--1 (11)  0--0 (52)
          */
          f64[7] = (sign << 7) | 0x7F;
          f64[6] = 0xF0;
          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
          return;
       }
       /* So it's either a QNaN or SNaN.  Distinguish by considering
          bit 62.  Note, this destroys all the trailing bits
          (identity?) of the NaN.  IEEE754 doesn't require preserving
          these (it only requires that there be one QNaN value and one
          SNaN value), but x87 does seem to have some ability to
          preserve them.  Anyway, here, the NaN's identity is
          destroyed.  Could be improved. */
       /* Bit 62 is bit 6 of f80[7].  (f80[8] is the low byte of the
          exponent, which is 0xFF for every value that gets here, so
          testing it would classify every NaN as a QNaN.) */
       if (f80[7] & 0x40) {
          /* QNaN.  Make a QNaN:
             S 1--1 (11)  1  1--1 (51)
          */
          f64[7] = (sign << 7) | 0x7F;
          f64[6] = 0xFF;
          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
       } else {
          /* SNaN.  Make a SNaN:
             S 1--1 (11)  0  1--1 (51)
          */
          f64[7] = (sign << 7) | 0x7F;
          f64[6] = 0xF7;
          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
       }
       return;
    }

    /* If it's not a Zero, NaN or Inf, and the integer part (bit 63) is
       zero, the x87 FPU appears to consider the number denormalised
       and converts it to a QNaN. */
    if (0 == (f80[7] & 0x80)) {
       wierd_NaN:
       /* Strange hardware QNaN:
          S 1--1 (11)  1  0--0 (51)
       */
       /* On a PIII, these QNaNs always appear with sign==1.  I have
          no idea why. */
       f64[7] = (1 /*sign*/ << 7) | 0x7F;
       f64[6] = 0xF8;
       f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
       return;
    }

    /* It's not a zero, denormal, infinity or nan.  So it must be a
       normalised number.  Rebias the exponent and consider. */
    bexp -= (16383 - 1023);
    if (bexp >= 0x7FF) {
       /* It's too big for a double.  Construct an infinity. */
       f64[7] = (sign << 7) | 0x7F;
       f64[6] = 0xF0;
       f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
       return;
    }

    if (bexp <= 0) {
       /* It's too small for a normalised double.  First construct a
          zero and then see if it can be improved into a denormal.  */
       f64[7] = sign << 7;
       f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;

       if (bexp < -52)
          /* Too small even for a denormal. */
          return;

       /* Ok, let's make a denormal.  Note, this is SLOW. */
       /* Copy bits 63, 62, 61, etc of the src mantissa into the dst,
          indexes 51+bexp, 50+bexp, etc, until the dst index would go
          below zero. */
       /* bexp is in range -52 .. 0 inclusive */
       for (i = 63; i >= 0; i--) {
          j = i - 12 + bexp;
          if (j < 0) break;
          /* We shouldn't really call vassert from generated code. */
          vassert(j >= 0 && j < 52);
          write_bit_array ( f64,
                            j,
                            read_bit_array ( f80, i ) );
       }
       /* and now we might have to round ...  The first source bit
          that did not fit is bit 11-bexp. */
       if (read_bit_array(f80, 10+1 - bexp) == 1)
          goto do_rounding;

       return;
    }

    /* Ok, it's a normalised number which is representable as a double.
       Copy the exponent and mantissa into place.  The mantissa is
       source bits 62:11, which become destination bits 51:0. */
    f64[0] = (f80[1] >> 3) | (f80[2] << 5);
    f64[1] = (f80[2] >> 3) | (f80[3] << 5);
    f64[2] = (f80[3] >> 3) | (f80[4] << 5);
    f64[3] = (f80[4] >> 3) | (f80[5] << 5);
    f64[4] = (f80[5] >> 3) | (f80[6] << 5);
    f64[5] = (f80[6] >> 3) | (f80[7] << 5);

    f64[6] = ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F);

    f64[7] = (sign << 7) | ((bexp >> 4) & 0x7F);

    /* Now consider any rounding that needs to happen as a result of
       truncating the mantissa. */
    if (f80[1] & 4) /* read_bit_array(f80, 10) == 1) */ {

       /* If the bottom bits of f80 are "100 0000 0000", then the
          infinitely precise value is deemed to be mid-way between the
          two closest representable values.  Since we're doing
          round-to-nearest (the default mode), in that case it is the
          bit immediately above which indicates whether we should round
          upwards or not -- if 0, we don't.  All that is encapsulated
          in the following simple test. */
       if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0)
          return;

       do_rounding:
       /* Round upwards, by adding 1 to the result viewed as a
          little-endian integer and propagating the carry for as long
          as a byte wraps round to zero.  A carry out of the mantissa
          moves into the exponent field, which is what is wanted: an
          all-ones mantissa rounds up to the next power of two, the
          largest denormal rounds up to the smallest normal, and the
          largest finite value rounds up to infinity.  The carry can
          never get as far as the sign bit, since the exponent field
          is below 0x7FF for every value that reaches this point. */
       for (i = 0; i < 8; i++) {
          f64[i]++;
          if (f64[i] != 0)
             break;
       }
    }
 }


 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (reads guest memory) */
 /* Read the 10-byte little-endian extended double at address addrU
    and hand it back converted to a 64-bit double, returned as its
    raw bit pattern. */
 ULong x86g_loadF80le ( UInt addrU )
 {
    ULong  result;
    UChar* src = (UChar*)addrU;
    UChar* dst = (UChar*)&result;

    convert_f80le_to_f64le ( src, dst );
    return result;
 }

 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (writes guest memory) */
 /* Take f64, the raw bit pattern of a 64-bit double, convert it to a
    little-endian extended double and write the resulting 10 bytes to
    address addrU. */
 void x86g_storeF80le ( UInt addrU, ULong f64 )
 {
    UChar* src = (UChar*)&f64;
    UChar* dst = (UChar*)addrU;

    convert_f64le_to_f80le( src, dst );
 }


 /*----------------------------------------------*/
 /*--- The exported fns ..                    ---*/
 /*----------------------------------------------*/

 /* Layout of the real x87 state. */

 /* The image is an environment area of 14 16-bit words followed by
    80 bytes of register data.  do_put_x87 and do_get_x87 treat the
    register data as eight 10-byte extended doubles, with the value
    for stack slot stno at reg[10*stno]. */
 typedef
    struct {
       /* Environment words; see the FP_ENV_ offsets below.
          do_get_x87 zeroes all of them and then sets env[1], env[3],
          env[5] and env[13] to 0xFFFF. */
       UShort env[14];
       /* Register contents, 10 bytes per register. */
       UChar  reg[80];
    }
    Fpu_State;

 /* Offsets, in 16-bit ints, into the FPU environment (env) area. */
 /* Control word; bits 11:10 hold the rounding mode (see
    x86g_check_fldcw). */
 #define FP_ENV_CTRL   0
 /* Status word; do_put_x87 takes the stack top from bits 13:11 and
    the condition codes from the bits selected by mask 0x4700. */
 #define FP_ENV_STAT   2
 /* Tag word: two bits per register, value 3 meaning empty. */
 #define FP_ENV_TAG    4
 #define FP_ENV_IP     6 /* and 7 */
 #define FP_ENV_CS     8
 #define FP_ENV_OPOFF  10 /* and 11 */
 #define FP_ENV_OPSEL  12
 /* NOTE(review): FP_REG(ii) gives 10*(7-ii), the reverse of the
    10*stno indexing used by do_put_x87 and do_get_x87, and no use of
    it is visible in this part of the file -- confirm whether it is
    still needed. */
 #define FP_REG(ii)    (10*(7-(ii)))


 /* CLEAN HELPER */
 /* Decode a native x87 control word, held in fpucw[15:0].

    The result packs two things: the low 32 bits hold the rounding
    mode taken from fpucw[11:10] (which uses the same encoding as
    enum IRRoundingMode), and the high 32 bits hold an emulation
    warning:
       EmWarn_X86_x87exns       if any of the six exception mask bits
                                fpucw[5:0] is clear,
       EmWarn_X86_x87precision  otherwise, if the precision field
                                fpucw[9:8] is not 3,
       EmWarn_NONE              otherwise.
 */
 ULong x86g_check_fldcw ( UInt fpucw )
 {
    UInt      roundBits = (fpucw >> 10) & 3;
    UInt      maskBits  = fpucw & 0x3F;
    UInt      precBits  = (fpucw >> 8) & 3;
    VexEmWarn warning;

    if (maskBits != 0x3F)
       warning = EmWarn_X86_x87exns;        /* unmasked exceptions! */
    else if (precBits != 3)
       warning = EmWarn_X86_x87precision;   /* unsupported precision */
    else
       warning = EmWarn_NONE;

    return (((ULong)warning) << 32) | ((ULong)roundBits);
 }

 /* CLEAN HELPER */
 /* Build a native x87 control word from fpround, an IRRoundingMode
    value.  Only the low two bits of fpround are used; they go into
    bits 11:10 on top of the fixed base value 0x037F. */
 UInt x86g_create_fpucw ( UInt fpround )
 {
    return 0x037F | ((fpround & 3) << 10);
 }


 /* CLEAN HELPER */
 /* Decode a native SSE MXCSR value, held in mxcsr[15:0].

    The result packs two things: the low 32 bits hold the rounding
    mode taken from mxcsr[14:13] (which uses the same encoding as
    enum IRRoundingMode), and the high 32 bits hold an emulation
    warning.  The checks are made in this order and the first one
    that fires wins:
       EmWarn_X86_sseExns  if any bit selected by 0x1F80 is clear,
       EmWarn_X86_fz       if bit 15 (FZ) is set,
       EmWarn_X86_daz      if bit 6 (DAZ) is set,
       EmWarn_NONE         otherwise.
 */
 ULong x86g_check_ldmxcsr ( UInt mxcsr )
 {
    UInt      roundBits = (mxcsr >> 13) & 3;
    VexEmWarn warning   = EmWarn_NONE;

    if ((mxcsr & 0x1F80) != 0x1F80)
       warning = EmWarn_X86_sseExns;   /* unmasked exceptions! */
    else if ((mxcsr & (1<<15)) != 0)
       warning = EmWarn_X86_fz;        /* FZ is set */
    else if ((mxcsr & (1<<6)) != 0)
       warning = EmWarn_X86_daz;       /* DAZ is set */

    return (((ULong)warning) << 32) | ((ULong)roundBits);
 }


 /* CLEAN HELPER */
 /* Build a native SSE MXCSR value from sseround, an IRRoundingMode
    value.  Only the low two bits of sseround are used; they go into
    bits 14:13 on top of the fixed base value 0x1F80. */
 UInt x86g_create_mxcsr ( UInt sseround )
 {
    return 0x1F80 | ((sseround & 3) << 13);
 }


 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (writes guest state) */
 /* Put the simulated x87 FPU into its 'finit' state: all eight
    registers tagged empty and holding zero, stack top 0, rounding to
    nearest, and the condition code bits cleared. */
 void x86g_dirtyhelper_FINIT ( VexGuestX86State* gst )
 {
    Int r;

    for (r = 7; r >= 0; r--) {
       gst->guest_FPREG[r] = 0; /* IEEE754 64-bit zero */
       gst->guest_FPTAG[r] = 0; /* empty */
    }

    gst->guest_FC3210  = 0;
    gst->guest_FPROUND = (UInt)Irrm_NEAREST;
    gst->guest_FTOP    = 0;
 }


 /* Load the guest's simulated FPU from a native x87 memory image.
    Serves both 'frstor' and 'fldenv'; for the latter, moveRegs is
    False and the contents of the non-empty registers are not taken
    from the image.

    moveRegs  -- whether to convert the register contents as well.
    x87_state -- IN:  the image, laid out as a Fpu_State.
    vex_state -- OUT: guest state whose FP registers, tags, stack
                 top, condition codes and rounding mode are updated.

    Returns whatever emulation warning x86g_check_fldcw produces for
    the image's control word. */
 static
 VexEmWarn do_put_x87 ( Bool moveRegs,
                        /*IN*/UChar* x87_state,
                        /*OUT*/VexGuestX86State* vex_state )
 {
    Fpu_State* image   = (Fpu_State*)x87_state;
    Double*    regFile = (Double*)(&vex_state->guest_FPREG[0]);
    UChar*     tagFile = (UChar*)(&vex_state->guest_FPTAG[0]);
    UInt       statusW = image->env[FP_ENV_STAT];
    UInt       tagW    = image->env[FP_ENV_TAG];
    UInt       top     = (statusW >> 11) & 7;
    ULong      checked;
    Int        st;

    /* Walk the stack slots ST(0) .. ST(7).  The image stores the
       registers in stack order, whereas the tag word and the guest
       state are indexed by physical register number. */
    for (st = 0; st < 8; st++) {
       Int  phys    = (st + top) & 7;
       Bool isEmpty = ((tagW >> (2*phys)) & 3) == 3;

       if (isEmpty) {
          /* An empty register is zeroed regardless of moveRegs.  If
             it were left alone, memcheck could get out of sync: it
             believes this helper defines all the FP registers. */
          regFile[phys] = 0.0;
          tagFile[phys] = 0;
          continue;
       }

       /* Non-empty register. */
       if (moveRegs)
          convert_f80le_to_f64le( &image->reg[10*st],
                                  (UChar*)&regFile[phys] );
       tagFile[phys] = 1;
    }

    /* Stack pointer, then the condition code bits of the status
       word. */
    vex_state->guest_FTOP   = top;
    vex_state->guest_FC3210 = statusW & 0x4700;

    /* The control word supplies the rounding mode (low half of the
       checker's result) and any emulation warning (high half). */
    checked = x86g_check_fldcw ( (UInt)image->env[FP_ENV_CTRL] );
    vex_state->guest_FPROUND = ((UInt)checked) & 3;

    return (VexEmWarn)(checked >> 32);
 }


 /* Produce a native x87 memory image from the guest's simulated FPU,
    as closely as it can be approximated.

    vex_state -- IN:  guest state to read.
    x87_state -- OUT: receives the image, laid out as a Fpu_State. */
 static
 void do_get_x87 ( /*IN*/VexGuestX86State* vex_state,
                   /*OUT*/UChar* x87_state )
 {
    Fpu_State* image   = (Fpu_State*)x87_state;
    Double*    regFile = (Double*)(&vex_state->guest_FPREG[0]);
    UChar*     tagFile = (UChar*)(&vex_state->guest_FPTAG[0]);
    UInt       top     = vex_state->guest_FTOP;
    UInt       ccBits  = vex_state->guest_FC3210;
    UInt       tagW    = 0;
    Int        k, st;

    /* Environment: everything zero, except that words 1, 3, 5 and 13
       are all ones, and the status and control words are built from
       the guest state. */
    for (k = 0; k < 14; k++)
       image->env[k] = 0;
    image->env[1]  = 0xFFFF;
    image->env[3]  = 0xFFFF;
    image->env[5]  = 0xFFFF;
    image->env[13] = 0xFFFF;

    image->env[FP_ENV_STAT] = ((top & 7) << 11) | (ccBits & 0x4700);
    image->env[FP_ENV_CTRL]
       = (UShort)x86g_create_fpucw( vex_state->guest_FPROUND );

    /* Dump the register stack in ST order.  The value is converted
       and written whether or not the register is empty; only the tag
       differs (3 for empty, 0 for full). */
    for (st = 0; st < 8; st++) {
       Int phys = (st + top) & 7;

       if (tagFile[phys] == 0)
          tagW |= (3 << (2*phys));

       convert_f64le_to_f80le( (UChar*)&regFile[phys],
                               &image->reg[10*st] );
    }
    image->env[FP_ENV_TAG] = tagW;
 }


 /* VISIBLE TO LIBVEX CLIENT */
 /* Assemble an %eflags value from the guest state: the flags computed
    by x86g_calculate_eflags_all from the CC_OP/DEP1/DEP2/NDEP thunk,
    plus bit 10 when guest_DFLAG is 0xFFFFFFFF and bit 21 when
    guest_IDFLAG is 1.  guest_DFLAG must be either 1 or 0xFFFFFFFF
    (asserted). */
 UInt LibVEX_GuestX86_get_eflags ( /*IN*/VexGuestX86State* vex_state )
 {
    UInt direction;
    UInt result = x86g_calculate_eflags_all(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );

    direction = vex_state->guest_DFLAG;
    vassert(direction == 1 || direction == 0xFFFFFFFF);

    result |= (direction == 0xFFFFFFFF) ? (1<<10) : 0;
    result |= (vex_state->guest_IDFLAG == 1) ? (1<<21) : 0;

    return result;
 }

 /* VISIBLE TO LIBVEX CLIENT */
 /* Give every field of the guest state a defined starting value:
    integer registers, %eip, segment registers and LDT/GDT are zero;
    the flags thunk is X86G_CC_OP_COPY with all operands zero; the
    direction flag value is 1 (forwards) and the ID flag 0; the x87
    FPU is as after 'finit'; the XMM registers are zero with SSE
    rounding to nearest; and no emulation warning is pending. */
 void LibVEX_GuestX86_initialise ( /*OUT*/VexGuestX86State* vex_state )
 {
    /* Integer registers and program counter. */
    vex_state->guest_EAX = vex_state->guest_ECX
                         = vex_state->guest_EDX
                         = vex_state->guest_EBX = 0;
    vex_state->guest_ESP = vex_state->guest_EBP
                         = vex_state->guest_ESI
                         = vex_state->guest_EDI = 0;
    vex_state->guest_EIP = 0;

    /* Flags thunk, direction flag and ID flag. */
    vex_state->guest_CC_OP   = X86G_CC_OP_COPY;
    vex_state->guest_CC_DEP1 = vex_state->guest_CC_DEP2
                             = vex_state->guest_CC_NDEP = 0;
    vex_state->guest_DFLAG   = 1; /* forwards */
    vex_state->guest_IDFLAG  = 0;

    /* Initialise the simulated FPU */
    x86g_dirtyhelper_FINIT( vex_state );

    /* Initialise the SSE state. */
 #  define CLEAR_XMM(_reg)                                  \
       do { (_reg)[0] = 0; (_reg)[1] = 0;                   \
            (_reg)[2] = 0; (_reg)[3] = 0; }                 \
       while (0)

    vex_state->guest_SSEROUND = (UInt)Irrm_NEAREST;
    CLEAR_XMM(vex_state->guest_XMM0);
    CLEAR_XMM(vex_state->guest_XMM1);
    CLEAR_XMM(vex_state->guest_XMM2);
    CLEAR_XMM(vex_state->guest_XMM3);
    CLEAR_XMM(vex_state->guest_XMM4);
    CLEAR_XMM(vex_state->guest_XMM5);
    CLEAR_XMM(vex_state->guest_XMM6);
    CLEAR_XMM(vex_state->guest_XMM7);

 #  undef CLEAR_XMM

    /* Segment registers and descriptor tables. */
    vex_state->guest_CS  = 0;
    vex_state->guest_DS  = 0;
    vex_state->guest_ES  = 0;
    vex_state->guest_FS  = 0;
    vex_state->guest_GS  = 0;
    vex_state->guest_SS  = 0;
    vex_state->guest_LDT = 0;
    vex_state->guest_GDT = 0;

    vex_state->guest_EMWARN = EmWarn_NONE;
 }


 /*----------------------------------------------*/
 /*--- Misc integer/fp helpers                ---*/
 /*----------------------------------------------*/

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Rotate right through the carry flag.

    arg       -- the value to rotate.
    rot_amt   -- rotate count; only the low 5 bits are used, and for
                 16- and 8-bit operands the count is further reduced
                 modulo 17 and 9 respectively.
    eflags_in -- incoming flags, from which the carry is taken.
    sz        -- operand size in bytes: 4, 2 or 1.  Anything else
                 panics.

    Returns the rotated value in the low 32 bits and the new flags in
    the high 32 bits.  The new flags are eflags_in with only the carry
    and overflow bits replaced; the overflow bit is the top bit of the
    unrotated operand XORed with the incoming carry.
 */
 ULong x86g_calculate_RCR ( UInt arg, UInt rot_amt, UInt eflags_in, UInt sz )
 {
    UInt width = 0;                /* operand width in bits */
    UInt steps = rot_amt & 0x1F;
    UInt carry = 0, ovf = 0;

    switch (sz) {
       case 4:
          width = 32;
          break;
       case 2:
          width = 16;
          steps %= 17;
          break;
       case 1:
          width = 8;
          steps %= 9;
          break;
       default:
          vpanic("calculate_RCR: invalid size");
    }

    if (width != 0) {
       UInt topBit  = width - 1;
       UInt lowMask = (((UInt)1) << topBit) - 1;
       UInt fallen;

       carry = (eflags_in >> X86G_CC_SHIFT_C) & 1;
       ovf   = ((arg >> topBit) ^ carry) & 1;

       /* One bit at a time: the bit leaving at the bottom becomes
          the new carry, and the old carry enters at the top. */
       for (; steps > 0; steps--) {
          fallen = arg & 1;
          arg    = ((arg >> 1) & lowMask) | (carry << topBit);
          carry  = fallen;
       }
    }

    eflags_in = (eflags_in & ~(X86G_CC_MASK_C | X86G_CC_MASK_O))
                | ((carry & 1) << X86G_CC_SHIFT_C)
                | ((ovf & 1) << X86G_CC_SHIFT_O);

    return (((ULong)eflags_in) << 32) | ((ULong)arg);
 }


 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (modifies guest state) */
 /* CPUID for a guest without SSE: claim to be a P55C (Intel
    Pentium/MMX).  The request is read from guest %eax, and the reply
    is written to guest %eax, %ebx, %ecx and %edx.  Request 0 gets
    one reply; every other request gets the same second reply. */
 void x86g_dirtyhelper_CPUID_sse0 ( VexGuestX86State* st )
 {
    UInt a, b, c, d;

    if (st->guest_EAX == 0) {
       a = 0x1;
       b = 0x756e6547;
       c = 0x6c65746e;
       d = 0x49656e69;
    } else {
       a = 0x543;
       b = 0x0;
       c = 0x0;
       d = 0x8001bf;
    }

    st->guest_EAX = a;
    st->guest_EBX = b;
    st->guest_ECX = c;
    st->guest_EDX = d;
 }

 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (modifies guest state) */
 /* CPUID for an SSE1-capable guest.  Claims to be the following CPU:
    vendor_id       : GenuineIntel
    cpu family      : 6
    model           : 11
    model name      : Intel(R) Pentium(R) III CPU family      1133MHz
    stepping        : 1
    cpu MHz         : 1131.013
    cache size      : 512 KB

    The request is read from guest %eax, and the reply is written to
    guest %eax, %ebx, %ecx and %edx.  Requests 0 and 1 have their own
    replies; every other request gets the same third reply.
 */
 void x86g_dirtyhelper_CPUID_sse1 ( VexGuestX86State* st )
 {
    UInt a, b, c, d;
    UInt request = st->guest_EAX;

    if (request == 0) {
       a = 0x00000002;
       b = 0x756e6547;
       c = 0x6c65746e;
       d = 0x49656e69;
    } else if (request == 1) {
       a = 0x000006b1;
       b = 0x00000004;
       c = 0x00000000;
       d = 0x0383fbff;
    } else {
       a = 0x03020101;
       b = 0x00000000;
       c = 0x00000000;
       d = 0x0c040883;
    }

    st->guest_EAX = a;
    st->guest_EBX = b;
    st->guest_ECX = c;
    st->guest_EDX = d;
 }

 /* CPUID for an SSE2-capable guest.  Claims to be the following CPU:
    vendor_id       : GenuineIntel
    cpu family      : 15
    model           : 2
    model name      : Intel(R) Pentium(R) 4 CPU 2.40GHz
    stepping        : 7
    cpu MHz         : 2394.234
    cache size      : 512 KB

    The request is read from guest %eax, and the reply is written to
    guest %eax, %ebx, %ecx and %edx.  Requests 0 and 1 have their own
    replies; every other request gets the same third reply.
 */
 void x86g_dirtyhelper_CPUID_sse2 ( VexGuestX86State* st )
 {
    UInt a, b, c, d;
    UInt request = st->guest_EAX;

    if (request == 0) {
       a = 0x00000002;
       b = 0x756e6547;
       c = 0x6c65746e;
       d = 0x49656e69;
    } else if (request == 1) {
       a = 0x00000f27;
       b = 0x00010809;
       c = 0x00004400;
       d = 0xbfebfbff;
    } else {
       a = 0x665b5101;
       b = 0x00000000;
       c = 0x00000000;
       d = 0x007b7040;
    }

    st->guest_EAX = a;
    st->guest_EBX = b;
    st->guest_ECX = c;
    st->guest_EDX = d;
 }


 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (reads guest state, writes guest mem) */
 /* Write an fxsave-style image of the guest's x87 and SSE state to
    memory at addr.  Bytes 0 .. 159 hold the control information and
    the eight FP registers; %xmm0 .. %xmm7 follow from byte 160.  The
    x87 part is derived from a temporary native x87 image made by
    do_get_x87.  Only works on a little-endian host (asserted before
    the XMM registers are copied). */
 void x86g_dirtyhelper_FXSAVE ( VexGuestX86State* gst, HWord addr )
 {
    Fpu_State x87img;
    UShort*   img16  = (UShort*)addr;
    UChar*    img8   = (UChar*)addr;
    U128*     xmmOut = (U128*)(addr + 160);
    UInt      mxcsr;
    UShort    tagWord;
    UChar     abridged;
    Int       k, st;

    /* Somewhat roundabout, but at least it's simple: first make a
       plain x87 image, then rearrange it. */
    do_get_x87( gst, (UChar*)&x87img );
    mxcsr = x86g_create_mxcsr( gst->guest_SSEROUND );

    img16[0] = x87img.env[FP_ENV_CTRL]; /* FCW: fpu control word */
    img16[1] = x87img.env[FP_ENV_STAT]; /* FSW: fpu status word */

    /* FTW: tag summary byte, one bit per register, set when the
       register's two-bit tag is anything other than 3 (empty).
       Written bytewise so as to be endian-independent. */
    tagWord  = x87img.env[FP_ENV_TAG];
    abridged = 0;
    for (k = 0; k < 8; k++) {
       if ( ((tagWord >> (2*k)) & 3) != 3 )
          abridged |= (1 << k);
    }
    img8[4] = abridged;
    img8[5] = 0; /* pad */

    /* Halfwords 3 .. 11 are all written as zero.  They cover the FPU
       opcode (3), FPU IP (4,5) and its segment selector (6), a
       reserved halfword (7), the FPU operand pointer (8,9) and its
       segment selector (10), and another reserved halfword (11).
       None of these is tracked, so the values are bogus. */
    for (k = 3; k <= 11; k++)
       img16[k] = 0;

    img16[12] = (UShort)mxcsr;          /* MXCSR, low half */
    img16[13] = (UShort)(mxcsr >> 16);  /* MXCSR, high half */

    img16[14] = 0xFFFF; /* MXCSR mask (lo16) */
    img16[15] = 0xFFFF; /* MXCSR mask (hi16) */

    /* The FP registers, in ST order.  Each occupies a 16-byte slot:
       the 10 bytes of value followed by 6 bytes of zero padding. */
    for (st = 0; st < 8; st++) {
       UShort* from = (UShort*)(&x87img.reg[10*st]);
       UShort* to   = (UShort*)(&img16[16 + 8*st]);
       for (k = 0; k < 8; k++)
          to[k] = (k < 5) ? from[k] : 0;
    }

    /* That's the first 160 bytes of the image done.  Now only %xmm0
       .. %xmm7 remain to be copied.  If the host is big-endian, these
       need to be byte-swapped. */
    vassert(host_is_little_endian());

 #  define PUT_XMM(_ix,_reg)                              \
       do { for (k = 0; k < 4; k++)                       \
               xmmOut[_ix][k] = (_reg)[k]; }              \
       while (0)

    PUT_XMM( 0, gst->guest_XMM0 );
    PUT_XMM( 1, gst->guest_XMM1 );
    PUT_XMM( 2, gst->guest_XMM2 );
    PUT_XMM( 3, gst->guest_XMM3 );
    PUT_XMM( 4, gst->guest_XMM4 );
    PUT_XMM( 5, gst->guest_XMM5 );
    PUT_XMM( 6, gst->guest_XMM6 );
    PUT_XMM( 7, gst->guest_XMM7 );

 #  undef PUT_XMM
 }


 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (reads guest state, writes guest mem) */
 void x86g_dirtyhelper_FSAVE ( VexGuestX86State* gst, HWord addr )
 {
    /* Hand the destination address to do_get_x87 as a byte pointer;
       all of the work happens there. */
    UChar* image = (UChar*)addr;
    do_get_x87( gst, image );
 }

 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (writes guest state, reads guest mem) */
 VexEmWarn x86g_dirtyhelper_FRSTOR ( VexGuestX86State* gst, HWord addr )
 {
    /* Pass the image at 'addr' to do_put_x87, asking it to move the
       registers as well, and forward whatever warning it reports. */
    UChar*    image = (UChar*)addr;
    VexEmWarn ew    = do_put_x87( True/*regs too*/, image, gst );
    return ew;
 }

 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (reads guest state, writes guest mem) */
 void x86g_dirtyhelper_FSTENV ( VexGuestX86State* gst, HWord addr )
 {
    /* Let do_get_x87 fill a local Fpu_State, then copy out only its
       14 env[] halfwords to the destination; the rest of the local
       image is discarded. */
    Fpu_State image;
    UShort*   dst = (UShort*)addr;
    Int       n   = 0;

    do_get_x87( gst, (UChar*)&image );

    while (n < 14) {
       dst[n] = image.env[n];
       n++;
    }
 }

 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (writes guest state, reads guest mem) */
 VexEmWarn x86g_dirtyhelper_FLDENV ( VexGuestX86State* gst, HWord addr )
 {
    /* Same call as for FRSTOR, except that do_put_x87 is told not to
       move the registers.  Its warning code is returned unchanged. */
    UChar*    image = (UChar*)addr;
    VexEmWarn ew    = do_put_x87( False/*don't move regs*/, image, gst );
    return ew;
 }


 /*----------------------------------------------*/
 /*--- Helpers for MMX/SSE                    ---*/
 /*----------------------------------------------*/

 /* Absolute difference of two unsigned bytes: the smaller is always
    subtracted from the larger, so the result is in 0 .. 255. */
 static inline UChar abdU8 ( UChar xx, UChar yy ) {
    if (xx > yy)
       return xx - yy;
    return yy - xx;
 }

 /* Glue two 32-bit words into one 64-bit value: w1 becomes the upper
    half, w0 the lower half. */
 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
    ULong upper = (ULong)w1;
    ULong lower = (ULong)w0;
    return (upper << 32) | lower;
 }

 /* 16-bit lane 3 (bits 63:48) of a 64-bit word. */
 static inline UShort sel16x4_3 ( ULong w64 ) {
    return (UShort)((w64 >> 48) & 0xFFFF);
 }
 /* 16-bit lane 2 (bits 47:32) of a 64-bit word. */
 static inline UShort sel16x4_2 ( ULong w64 ) {
    return (UShort)((w64 >> 32) & 0xFFFF);
 }
 /* 16-bit lane 1 (bits 31:16) of a 64-bit word. */
 static inline UShort sel16x4_1 ( ULong w64 ) {
    return (UShort)((w64 >> 16) & 0xFFFF);
 }
 /* 16-bit lane 0 (bits 15:0) of a 64-bit word. */
 static inline UShort sel16x4_0 ( ULong w64 ) {
    return (UShort)(w64 & 0xFFFF);
 }

 /* Byte lane 7 (bits 63:56) of a 64-bit word. */
 static inline UChar sel8x8_7 ( ULong w64 ) {
    return (UChar)((w64 >> 56) & 0xFF);
 }
 /* Byte lane 6 (bits 55:48) of a 64-bit word. */
 static inline UChar sel8x8_6 ( ULong w64 ) {
    return (UChar)((w64 >> 48) & 0xFF);
 }
 /* Byte lane 5 (bits 47:40) of a 64-bit word. */
 static inline UChar sel8x8_5 ( ULong w64 ) {
    return (UChar)((w64 >> 40) & 0xFF);
 }
 /* Byte lane 4 (bits 39:32) of a 64-bit word. */
 static inline UChar sel8x8_4 ( ULong w64 ) {
    return (UChar)((w64 >> 32) & 0xFF);
 }
 /* Byte lane 3 (bits 31:24) of a 64-bit word. */
 static inline UChar sel8x8_3 ( ULong w64 ) {
    return (UChar)((w64 >> 24) & 0xFF);
 }
 /* Byte lane 2 (bits 23:16) of a 64-bit word. */
 static inline UChar sel8x8_2 ( ULong w64 ) {
    return (UChar)((w64 >> 16) & 0xFF);
 }
 /* Byte lane 1 (bits 15:8) of a 64-bit word. */
 static inline UChar sel8x8_1 ( ULong w64 ) {
    return (UChar)((w64 >> 8) & 0xFF);
 }
 /* Byte lane 0 (bits 7:0) of a 64-bit word. */
 static inline UChar sel8x8_0 ( ULong w64 ) {
    return (UChar)(w64 & 0xFF);
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 ULong x86g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
 {
    /* Multiply corresponding 16-bit lanes of xx and yy as signed
       values, then add adjacent pairs of products: lanes 3 and 2
       form the upper 32 bits of the result, lanes 1 and 0 the lower
       32 bits.

       Each product fits in an Int (its magnitude is at most 2^30),
       but the sum of two products does not when all four lanes
       involved are 0x8000: 2^30 + 2^30 overflows a signed 32-bit
       int, which is undefined behaviour in C.  The products are
       therefore converted to UInt before being added, so the sum
       wraps modulo 2^32 and yields 0x80000000 in that case. */
    UInt p3 = (UInt)( ((Int)(Short)sel16x4_3(xx))
                      * ((Int)(Short)sel16x4_3(yy)) );
    UInt p2 = (UInt)( ((Int)(Short)sel16x4_2(xx))
                      * ((Int)(Short)sel16x4_2(yy)) );
    UInt p1 = (UInt)( ((Int)(Short)sel16x4_1(xx))
                      * ((Int)(Short)sel16x4_1(yy)) );
    UInt p0 = (UInt)( ((Int)(Short)sel16x4_0(xx))
                      * ((Int)(Short)sel16x4_0(yy)) );

    return mk32x2( p3 + p2, p1 + p0 );
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 UInt x86g_calculate_mmx_pmovmskb ( ULong xx )
 {
    /* Collect the top bit of each of the 8 bytes of xx: bit 'lane' of
       the result is bit 7 of byte 'lane', i.e. bit 8*lane+7 of xx. */
    UInt mask = 0;
    UInt lane;
    for (lane = 0; lane < 8; lane++) {
       if ((xx >> (8*lane + 7)) & 1ULL)
          mask |= ((UInt)1) << lane;
    }
    return mask;
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 ULong x86g_calculate_mmx_psadbw ( ULong xx, ULong yy )
 {
    /* Sum of the absolute differences of the 8 corresponding byte
       lanes of xx and yy.  Only the low 16 bits of the sum are kept;
       the rest of the 64-bit result is zero. */
    UInt sad
       =   (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) )
         + (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) )
         + (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) )
         + (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) )
         + (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) )
         + (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) )
         + (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) )
         + (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );

    return (ULong)(sad & 0xFFFF);
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
 {
    /* 16-bit mask for a 128-bit value given as two 64-bit halves:
       the MMX helper yields 8 bits per half; the upper half's bits
       go in positions 15:8, the lower half's in positions 7:0. */
    UInt mask;
    mask   = x86g_calculate_mmx_pmovmskb ( w64hi ) & 0xFF;
    mask <<= 8;
    mask  |= x86g_calculate_mmx_pmovmskb ( w64lo ) & 0xFF;
    return mask;
 }


 /*----------------------------------------------*/
 /*--- Helpers for segment overrides          ---*/
 /*----------------------------------------------*/

 /* Reassemble the 32-bit base from the three fields of a descriptor:
    BaseHi supplies bits 31:24, BaseMid bits 23:16 and BaseLow bits
    15:0. */
 static inline
 UInt get_segdescr_base ( VexGuestX86SegDescr* ent )
 {
    UInt base;
    base  = (0xFF & (UInt)ent->LdtEnt.Bits.BaseHi) << 24;
    base |= (0xFF & (UInt)ent->LdtEnt.Bits.BaseMid) << 16;
    base |= 0xFFFF & (UInt)ent->LdtEnt.Bits.BaseLow;
    return base;
 }

 /* Reassemble the limit from a descriptor: LimitHi supplies bits
    19:16 and LimitLow bits 15:0.  When the Granularity field is
    nonzero the 20-bit value is shifted left by 12 with the low 12
    bits filled with ones. */
 static inline
 UInt get_segdescr_limit ( VexGuestX86SegDescr* ent )
 {
    UInt raw = ((0xF & (UInt)ent->LdtEnt.Bits.LimitHi) << 16)
               | (0xFFFF & (UInt)ent->LdtEnt.Bits.LimitLow);

    return ent->LdtEnt.Bits.Granularity ? ((raw << 12) | 0xFFF)
                                        : raw;
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
                               UInt seg_selector, UInt virtual_addr )
 {
    /* Translate (seg_selector, virtual_addr) into an address using
       the descriptor tables at 'ldt' and 'gdt' (either may be zero,
       meaning "no such table").

       On success the result is base + virtual_addr (computed in 32
       bits) with the upper 32 bits zero.  On any failure the result
       is 1ULL << 32, i.e. nonzero upper 32 bits. */
    const ULong          failure = 1ULL << 32;
    UInt                 useLdt, segBase, segLimit;
    VexGuestX86SegDescr* table;
    VexGuestX86SegDescr* descr;
    Bool                 verboze = False;

    /* If this isn't true, we're in Big Trouble. */
    vassert(8 == sizeof(VexGuestX86SegDescr));

    if (verboze)
       vex_printf("x86h_use_seg_selector: "
                  "seg_selector = 0x%x, vaddr = 0x%x\n",
                  seg_selector, virtual_addr);

    /* A selector with anything set above bit 15 is wildly invalid.
       Past this point the upper 16 bits are known to be zero. */
    if ((seg_selector & ~0xFFFF) != 0)
       return failure;

    /* The bottom 2 bits are the RPL; insist on RPL=11b (least
       privilege). */
    if ((seg_selector & 3) != 3)
       return failure;

    /* Bit 2 is the TI bit: 0 selects the GDT, 1 selects the LDT. */
    useLdt = (seg_selector >> 2) & 1;

    /* What remains above bit 2 is the index into the chosen table. */
    seg_selector >>= 3;
    vassert(seg_selector >= 0 && seg_selector < 8192);

    /* Pick the table.  It must exist and the index must be within
       the number of entries it holds. */
    if (useLdt != 0) {
       if (ldt == 0)
          return failure;
       if (seg_selector >= VEX_GUEST_X86_LDT_NENT)
          return failure;
       table = (VexGuestX86SegDescr*)ldt;
    } else {
       if (gdt == 0)
          return failure;
       if (seg_selector >= VEX_GUEST_X86_GDT_NENT)
          return failure;
       table = (VexGuestX86SegDescr*)gdt;
    }

    descr    = &table[seg_selector];
    segBase  = get_segdescr_base (descr);
    segLimit = get_segdescr_limit(descr);

    /* Limit check.  This is slightly too slack: strictly it should
       be "virtual_addr + size - 1 >= limit", but the access size is
       not available here and getting hold of it could be
       significantly complex. */
    if (virtual_addr >= segLimit)
       return failure;

    if (verboze)
       vex_printf("x86h_use_seg_selector: "
                  "base = 0x%x, addr = 0x%x\n",
                  segBase, segBase + virtual_addr);

    /* High 32 bits are zero, indicating success. */
    return (ULong)( ((UInt)virtual_addr) + segBase );
 }


 /*-----------------------------------------------------------*/
 /*--- Describing the x86 guest state, for the benefit     ---*/
 /*--- of iropt and instrumenters.                         ---*/
 /*-----------------------------------------------------------*/

 /* Figure out if any part of the guest state contained in minoff
    .. maxoff requires precise memory exceptions.  If in doubt return
    True (but this generates significantly slower code).

    We enforce precise exns for guest %ESP and %EIP only.
 */
 Bool guest_x86_state_requires_precise_mem_exns ( Int minoff,
                                                  Int maxoff)
 {
    /* Byte ranges (inclusive) occupied by the two 4-byte registers
       of interest within VexGuestX86State. */
    Int spLo = offsetof(VexGuestX86State, guest_ESP);
    Int spHi = spLo + 4 - 1;
    Int ipLo = offsetof(VexGuestX86State, guest_EIP);
    Int ipHi = ipLo + 4 - 1;

    /* Two inclusive ranges overlap exactly when each one starts no
       later than the other one ends. */
    if (minoff <= spHi && maxoff >= spLo)
       return True;   /* touches %ESP */

    if (minoff <= ipHi && maxoff >= ipLo)
       return True;   /* touches %EIP */

    return False;
 }


 /* Expands to an { offset, size } initializer pair describing 'field'
    of VexGuestX86State; used to fill in x86guest_layout.alwaysDefd.
    The null-pointer expression appears only as the operand of sizeof
    and so is never evaluated. */
 #define ALWAYSDEFD(field)                           \
     { offsetof(VexGuestX86State, field),            \
       (sizeof ((VexGuestX86State*)0)->field) }

 /* Layout description of the x86 guest state: its total size, where
    the stack and instruction pointers live within it, and the list
    of fields to be treated as always defined. */
 VexGuestLayout
    x86guest_layout
       = {
           /* Total size of the guest state, in bytes. */
           .total_sizeB = sizeof(VexGuestX86State),

           /* Describe the stack pointer. */
           .offset_SP = offsetof(VexGuestX86State,guest_ESP),
           .sizeof_SP = 4,

           /* Describe the instruction pointer. */
           .offset_IP = offsetof(VexGuestX86State,guest_EIP),
           .sizeof_IP = 4,

           /* Describe any sections to be regarded by Memcheck as
              'always-defined'.  This count must match the number of
              entries in .alwaysDefd below (numbered 0 .. 17). */
           .n_alwaysDefd = 18,

           /* flags thunk: OP and NDEP are always defd, whereas DEP1
              and DEP2 have to be tracked.  See detailed comment in
              gdefs.h on meaning of thunk fields. */
           .alwaysDefd
              = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                  /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                  /*  2 */ ALWAYSDEFD(guest_DFLAG),
                  /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                  /*  4 */ ALWAYSDEFD(guest_EIP),
                  /*  5 */ ALWAYSDEFD(guest_FTOP),
                  /*  6 */ ALWAYSDEFD(guest_FPTAG),
                  /*  7 */ ALWAYSDEFD(guest_FPROUND),
                  /*  8 */ ALWAYSDEFD(guest_FC3210),
                  /*  9 */ ALWAYSDEFD(guest_CS),
                  /* 10 */ ALWAYSDEFD(guest_DS),
                  /* 11 */ ALWAYSDEFD(guest_ES),
                  /* 12 */ ALWAYSDEFD(guest_FS),
                  /* 13 */ ALWAYSDEFD(guest_GS),
                  /* 14 */ ALWAYSDEFD(guest_SS),
                  /* 15 */ ALWAYSDEFD(guest_LDT),
                  /* 16 */ ALWAYSDEFD(guest_GDT),
                  /* 17 */ ALWAYSDEFD(guest_EMWARN)
                }
         };


 /*---------------------------------------------------------------*/
 /*--- end                                guest-x86/ghelpers.c ---*/
 /*---------------------------------------------------------------*/