/* priv/guest-amd64/ghelpers.c - platform/external/valgrind - Gitiles */


 /*---------------------------------------------------------------*/
 /*---                                                         ---*/
 /*--- This file (guest-amd64/ghelpers.c) is                   ---*/
 /*--- Copyright (C) OpenWorks LLP.  All rights reserved.      ---*/
 /*---                                                         ---*/
 /*---------------------------------------------------------------*/

 /*
    This file is part of LibVEX, a library for dynamic binary
    instrumentation and translation.

    Copyright (C) 2004-2006 OpenWorks LLP.  All rights reserved.

    This library is made available under a dual licensing scheme.

    If you link LibVEX against other code all of which is itself
    licensed under the GNU General Public License, version 2 dated June
    1991 ("GPL v2"), then you may use LibVEX under the terms of the GPL
    v2, as appearing in the file LICENSE.GPL.  If the file LICENSE.GPL
    is missing, you can obtain a copy of the GPL v2 from the Free
    Software Foundation Inc., 51 Franklin St, Fifth Floor, Boston, MA
    02110-1301, USA.

    For any other uses of LibVEX, you must first obtain a commercial
    license from OpenWorks LLP.  Please contact info@open-works.co.uk
    for information about commercial licensing.

    This software is provided by OpenWorks LLP "as is" and any express
    or implied warranties, including, but not limited to, the implied
    warranties of merchantability and fitness for a particular purpose
    are disclaimed.  In no event shall OpenWorks LLP be liable for any
    direct, indirect, incidental, special, exemplary, or consequential
    damages (including, but not limited to, procurement of substitute
    goods or services; loss of use, data, or profits; or business
    interruption) however caused and on any theory of liability,
    whether in contract, strict liability, or tort (including
    negligence or otherwise) arising in any way out of the use of this
    software, even if advised of the possibility of such damage.

    Neither the names of the U.S. Department of Energy nor the
    University of California nor the names of its contributors may be
    used to endorse or promote products derived from this software
    without prior written permission.
 */

 #include "libvex_basictypes.h"
 #include "libvex_emwarn.h"
 #include "libvex_guest_amd64.h"
 #include "libvex_ir.h"
 #include "libvex.h"

 #include "main/vex_util.h"
 #include "guest-generic/bb_to_IR.h"
 #include "guest-amd64/gdefs.h"
 #include "guest-generic/g_generic_x87.h"


 /* This file contains helper functions for amd64 guest code.
    Calls to these functions are generated by the back end.
    These calls are of course in the host machine code and
    this file will be compiled to host machine code, so that
    all makes sense.

    Only change the signatures of these helper functions very
    carefully.  If you change the signature here, you'll have to change
    the parameters passed to it in the IR calls constructed by
    guest-amd64/toIR.c.

    The convention used is that all functions called from generated
    code are named amd64g_<something>, and any function whose name lacks
    that prefix is not called from generated code.  Note that some
    LibVEX_* functions can however be called by VEX's client, but that
    is not the same as calling them from VEX-generated code.
 */


 /* Set to 1 to get detailed profiling info about use of the flag
    machinery. */
 #define PROFILE_RFLAGS 0


 /*---------------------------------------------------------------*/
 /*--- %rflags run-time helpers.                               ---*/
 /*---------------------------------------------------------------*/

 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
    after imulq/mulq. */

 /* Signed 64 x 64 -> 128 bit multiply.  Each operand is split into a
    signed upper half and an unsigned lower half; the partial products
    are combined so that the upper 64 bits of the product land in *rHi
    and the lower 64 bits in *rLo.  Relies on >> applied to a negative
    Long being an arithmetic (sign-propagating) shift. */
 static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
 {
    ULong u0, v0, w0;
     Long u1, v1, w1, w2, t;
    u0   = u & 0xFFFFFFFFULL;
    u1   = u >> 32;
    v0   = v & 0xFFFFFFFFULL;
    v1   = v >> 32;
    w0   = u0 * v0;
    t    = u1 * v0 + (w0 >> 32);
    w1   = t & 0xFFFFFFFFULL;
    w2   = t >> 32;
    w1   = u0 * v1 + w1;
    *rHi = u1 * v1 + w2 + (w1 >> 32);
    /* The low half is the product modulo 2^64.  Form it with unsigned
       operands: a signed u * v overflows for most inputs, which is
       undefined behaviour in C, whereas the unsigned product wraps
       and yields the same bit pattern. */
    *rLo = (Long)((ULong)u * (ULong)v);
 }

 /* Unsigned 64 x 64 -> 128 bit multiply built from 32-bit halves.
    Writes the upper 64 bits of the product to *rHi and the lower
    64 bits to *rLo. */
 static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
 {
    const ULong lo32 = 0xFFFFFFFFULL;
    /* Split both operands into 32-bit halves. */
    ULong uLo   = u & lo32;
    ULong uHi   = u >> 32;
    ULong vLo   = v & lo32;
    ULong vHi   = v >> 32;
    /* Partial products, carrying the upper half of each into the next. */
    ULong loLo  = uLo * vLo;
    ULong cross = uHi * vLo + (loLo >> 32);
    ULong mid   = uLo * vHi + (cross & lo32);
    *rHi = uHi * vHi + (cross >> 32) + (mid >> 32);
    *rLo = u * v;
 }


 /* Parity lookup, indexed by a byte value.  An entry is
    AMD64G_CC_MASK_P when the index has an even number of 1 bits and 0
    otherwise.  The flag helpers below index it with the low 8 bits of
    a result to obtain the P flag already in its final bit position.
    Each source row covers 8 consecutive indices. */
 static const UChar parity_table[256] = {
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
 };

 /* Generalised left-shifter: shifts x left by n bits when n >= 0 and
    right by -n bits when n is negative.  The flag macros in this file
    use it with small constant counts such as 8 - DATA_BITS to move a
    chosen bit of a value into a flag's bit position.  The right shift
    relies on >> of a negative Long being an arithmetic shift. */
 static inline Long lshift ( Long x, Int n )
 {
    /* Left-shifting a negative signed value is undefined behaviour in
       C, so perform the left shift on the unsigned representation and
       convert back. */
    if (n >= 0)
       return (Long)((ULong)x << n);
    else
       return x >> (-n);
 }

 /* Identity on ULong.  Passed as the "narrow to double width" argument
    of ACTIONS_UMUL / ACTIONS_SMUL for the 32-bit case, where the
    double-width type is already 64 bits and no narrowing is needed. */
 static inline ULong idULong ( ULong x )
 {
    return x;
 }


 /* Common prologue for the ACTIONS_* macros.  Given the operand width
    in bits it declares, in the enclosing scope:
       DATA_MASK  - all-ones mask of that width (64-bit mask for any
                    width other than 8, 16 or 32)
       SIGN_MASK  - the top bit of that width
       CC_DEP1, CC_DEP2, CC_NDEP - copies of the enclosing function's
                    cc_dep1_formal / cc_dep2_formal / cc_ndep_formal
    The macro parameter is parenthesised at every use and avoids the
    reserved double-underscore prefix. */
 #define PREAMBLE(nbits)                                         \
    /* const */ ULong DATA_MASK                                  \
       = (nbits)==8                                              \
            ? 0xFFULL                                            \
            : ((nbits)==16                                       \
                 ? 0xFFFFULL                                     \
                 : ((nbits)==32                                  \
                      ? 0xFFFFFFFFULL                            \
                      : 0xFFFFFFFFFFFFFFFFULL));                 \
    /* const */ ULong SIGN_MASK = 1ULL << ((nbits) - 1);         \
    /* const */ ULong CC_DEP1 = cc_dep1_formal;                  \
    /* const */ ULong CC_DEP2 = cc_dep2_formal;                  \
    /* const */ ULong CC_NDEP = cc_ndep_formal;                  \
    /* Not every user needs all of these.  The void casts   */   \
    /* count as a use, which stops the compiler complaining */   \
    /* about unused variables without self-assignment.      */   \
    (void)SIGN_MASK;                                             \
    (void)DATA_MASK;                                             \
    (void)CC_DEP2;                                               \
    (void)CC_NDEP;


 /*-------------------------------------------------------------*/

 /* Flags for an add of width DATA_BITS: DEP1 is the left operand,
    DEP2 the right operand.  Expands to a block that returns the six
    flags, each already in its final bit position (0x10, << 6, 0x80
    and << 11 are presumably the A, Z, S and O positions - confirm
    against the AMD64G_CC_SHIFT_* definitions). */
 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long cf, pf, af, zf, sf, of;                               \
      /* Unsigned, so that a 64-bit sum wraps instead of     */  \
      /* being a signed overflow (undefined behaviour).      */  \
      ULong argL, argR, res;                                     \
      argL = CC_DEP1;                                            \
      argR = CC_DEP2;                                            \
      res  = argL + argR;                                        \
      /* carry out: the truncated sum is below an operand */     \
      cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                   \
      pf = parity_table[(UChar)res];                             \
      af = (res ^ argL ^ argR) & 0x10;                           \
      zf = ((DATA_UTYPE)res == 0) << 6;                          \
      sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
      /* overflow: operands agree in sign, result differs */     \
      of = lshift(~(argL ^ argR) & (argL ^ res),                 \
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
      return cf | pf | af | zf | sf | of;                        \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for a subtract of width DATA_BITS: DEP1 is the left operand,
    DEP2 the right operand.  Expands to a block that returns the six
    flags, each already in its final bit position. */
 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long cf, pf, af, zf, sf, of;                               \
      /* Unsigned, so that a 64-bit difference wraps instead */  \
      /* of being a signed overflow (undefined behaviour).   */  \
      ULong argL, argR, res;                                     \
      argL = CC_DEP1;                                            \
      argR = CC_DEP2;                                            \
      res  = argL - argR;                                        \
      /* borrow: left operand is below right, unsigned */        \
      cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;                  \
      pf = parity_table[(UChar)res];                             \
      af = (res ^ argL ^ argR) & 0x10;                           \
      zf = ((DATA_UTYPE)res == 0) << 6;                          \
      sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
      /* overflow: operands differ in sign and the result's */   \
      /* sign differs from the left operand's               */   \
      of = lshift((argL ^ argR) & (argL ^ res),                  \
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
      return cf | pf | af | zf | sf | of;                        \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for an add-with-carry of width DATA_BITS.  The incoming
    carry is the C bit of NDEP.  DEP1 is the left operand; the right
    operand is obtained as DEP2 XOR old-carry (presumably because the
    thunk stores it in that form - confirm against toIR.c). */
 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long cf, pf, af, zf, sf, of;                               \
      /* Unsigned, so that a 64-bit sum wraps instead of     */  \
      /* being a signed overflow (undefined behaviour).      */  \
      ULong argL, argR, oldC, res;                               \
      oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
      argL = CC_DEP1;                                            \
      argR = CC_DEP2 ^ oldC;                                     \
      res  = (argL + argR) + oldC;                               \
      /* with a carry-in, an unchanged result also means a */    \
      /* carry out, hence <= rather than <                 */    \
      if (oldC)                                                  \
         cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;               \
      else                                                       \
         cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                \
      pf = parity_table[(UChar)res];                             \
      af = (res ^ argL ^ argR) & 0x10;                           \
      zf = ((DATA_UTYPE)res == 0) << 6;                          \
      sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
      of = lshift(~(argL ^ argR) & (argL ^ res),                 \
                   12 - DATA_BITS) & AMD64G_CC_MASK_O;           \
      return cf | pf | af | zf | sf | of;                        \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for a subtract-with-borrow of width DATA_BITS.  The incoming
    borrow is the C bit of NDEP.  DEP1 is the left operand; the right
    operand is obtained as DEP2 XOR old-carry (presumably because the
    thunk stores it in that form - confirm against toIR.c). */
 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long cf, pf, af, zf, sf, of;                               \
      /* Unsigned, so that a 64-bit difference wraps instead */  \
      /* of being a signed overflow (undefined behaviour).   */  \
      ULong argL, argR, oldC, res;                               \
      oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
      argL = CC_DEP1;                                            \
      argR = CC_DEP2 ^ oldC;                                     \
      res  = (argL - argR) - oldC;                               \
      /* with a borrow-in, equal operands also borrow, */        \
      /* hence <= rather than <                        */        \
      if (oldC)                                                  \
         cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;              \
      else                                                       \
         cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;               \
      pf = parity_table[(UChar)res];                             \
      af = (res ^ argL ^ argR) & 0x10;                           \
      zf = ((DATA_UTYPE)res == 0) << 6;                          \
      sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
      of = lshift((argL ^ argR) & (argL ^ res),                  \
                  12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
      return cf | pf | af | zf | sf | of;                        \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for a logical operation of width DATA_BITS.  DEP1 is the
    value the flags are derived from.  Only P, Z and S can be set;
    C, A and O contribute zero to the returned value. */
 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)                     \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long parity = parity_table[(UChar)CC_DEP1];                \
      Long zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;             \
      Long sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;       \
      return parity | zero | sign;                               \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for an increment of width DATA_BITS.  DEP1 is the result;
    the operand before the increment is reconstructed as result - 1.
    The C flag is not affected and is copied from NDEP. */
 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long cf, pf, af, zf, sf, of;                               \
      /* Unsigned, so that res - 1 wraps instead of being a */   \
      /* signed overflow when res is the most negative Long */   \
      ULong argL, argR, res;                                     \
      res  = CC_DEP1;                                            \
      argL = res - 1;                                            \
      argR = 1;                                                  \
      cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
      pf = parity_table[(UChar)res];                             \
      af = (res ^ argL ^ argR) & 0x10;                           \
      zf = ((DATA_UTYPE)res == 0) << 6;                          \
      sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
      /* overflow only when the result is exactly the sign bit */ \
      of = ((res & DATA_MASK) == SIGN_MASK) << 11;               \
      return cf | pf | af | zf | sf | of;                        \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for a decrement of width DATA_BITS.  DEP1 is the result;
    the operand before the decrement is reconstructed as result + 1.
    The C flag is not affected and is copied from NDEP. */
 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long cf, pf, af, zf, sf, of;                               \
      /* Unsigned, so that res + 1 wraps instead of being a */   \
      /* signed overflow when res is the most positive Long */   \
      ULong argL, argR, res;                                     \
      res  = CC_DEP1;                                            \
      argL = res + 1;                                            \
      argR = 1;                                                  \
      cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
      pf = parity_table[(UChar)res];                             \
      af = (res ^ argL ^ argR) & 0x10;                           \
      zf = ((DATA_UTYPE)res == 0) << 6;                          \
      sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
      /* overflow only when the result is sign bit minus one */  \
      of = ((res & DATA_MASK)                                    \
           == ((ULong)SIGN_MASK - 1)) << 11;                     \
      return cf | pf | af | zf | sf | of;                        \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for a left shift of width DATA_BITS.  P, Z and S come from
    DEP1; C is the top bit (at the operand width) of DEP2.  Presumably
    DEP1 is the shifted result and DEP2 the value shifted by one place
    less - confirm against the thunk description.  The A flag is
    architecturally undefined here and is returned as zero. */
 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long carry  = (CC_DEP2 >> (DATA_BITS - 1))                 \
                    & AMD64G_CC_MASK_C;                          \
      Long parity = parity_table[(UChar)CC_DEP1];                \
      Long zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;             \
      Long sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;       \
      /* only meaningful when the shift count is 1 */            \
      Long oflow  = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)    \
                    & AMD64G_CC_MASK_O;                          \
      return carry | parity | zero | sign | oflow;               \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for a right shift of width DATA_BITS.  P, Z and S come from
    DEP1; C is the bottom bit of DEP2.  Presumably DEP1 is the shifted
    result and DEP2 the value shifted by one place less - confirm
    against the thunk description.  The A flag is architecturally
    undefined here and is returned as zero. */
 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long carry  = CC_DEP2 & 1;                                 \
      Long parity = parity_table[(UChar)CC_DEP1];                \
      Long zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;             \
      Long sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;       \
      /* only meaningful when the shift count is 1 */            \
      Long oflow  = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)    \
                    & AMD64G_CC_MASK_O;                          \
      return carry | parity | zero | sign | oflow;               \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Rotate left.  DEP1 = result, NDEP = old flags.
    New C = lsb(result); new O = msb(result) ^ lsb(result).
    Every other flag is carried over from NDEP unchanged. */
 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { /* old flags with the C and O bits knocked out */          \
      Long kept    = CC_NDEP                                     \
                     & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C);   \
      Long carry   = AMD64G_CC_MASK_C & CC_DEP1;                 \
      /* msb and lsb of the result, each moved to bit 11 */      \
      Long msbAt11 = lshift(CC_DEP1, 11-(DATA_BITS-1));          \
      Long lsbAt11 = lshift(CC_DEP1, 11);                        \
      Long oflow   = AMD64G_CC_MASK_O & (msbAt11 ^ lsbAt11);     \
      return kept | carry | oflow;                               \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Rotate right.  DEP1 = result, NDEP = old flags.
    New C = msb(result); new O = msb(result) ^ msb-1(result).
    Every other flag is carried over from NDEP unchanged. */
 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)                       \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { /* old flags with the C and O bits knocked out */          \
      Long kept     = CC_NDEP                                    \
                      & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C);  \
      Long carry    = AMD64G_CC_MASK_C                           \
                      & (CC_DEP1 >> (DATA_BITS-1));              \
      /* msb and msb-1 of the result, each moved to bit 11 */    \
      Long msbAt11  = lshift(CC_DEP1, 11-(DATA_BITS-1));         \
      Long nextAt11 = lshift(CC_DEP1, 11-(DATA_BITS-1)+1);       \
      Long oflow    = AMD64G_CC_MASK_O & (msbAt11 ^ nextAt11);   \
      return kept | carry | oflow;                               \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for an unsigned widening multiply of DATA_BITS-wide operands
    DEP1 and DEP2 (8, 16 or 32 bits; see ACTIONS_UMULQ for 64).
    DATA_UTYPE is the operand type and DATA_U2TYPE the double-width
    type; NARROWtoU / NARROWto2U truncate to those types.  C and O are
    both set when the upper half of the product is nonzero; P, Z and S
    come from the lower half.  The A flag is architecturally undefined
    and is returned as zero. */
 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                 DATA_U2TYPE, NARROWto2U)        \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long cf, pf, af, zf, sf, of;                               \
      /* Full product, formed in the double-width type. */       \
      DATA_U2TYPE rr                                             \
         = NARROWto2U(                                           \
              ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
              * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
      /* Take both halves from rr.  Multiplying at operand  */   \
      /* width is unsafe: two UShorts are promoted to int   */   \
      /* and their product can overflow it, which is        */   \
      /* undefined behaviour.                               */   \
      DATA_UTYPE  lo = NARROWtoU(rr);                            \
      DATA_UTYPE  hi = NARROWtoU(rr >>/*u*/ DATA_BITS);          \
      cf = (hi != 0);                                            \
      pf = parity_table[(UChar)lo];                              \
      af = 0; /* undefined */                                    \
      zf = (lo == 0) << 6;                                       \
      sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
      of = cf << 11;                                             \
      return cf | pf | af | zf | sf | of;                        \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for a signed widening multiply of DATA_BITS-wide operands
    DEP1 and DEP2 (8, 16 or 32 bits; see ACTIONS_SMULQ for 64).
    DATA_STYPE is the signed operand type and DATA_S2TYPE the
    double-width signed type; NARROWtoS / NARROWto2S truncate to those
    widths.  C and O are both set when the upper half of the product
    is not just the sign extension of the lower half; P, Z and S come
    from the lower half.  The A flag is architecturally undefined and
    is returned as zero. */
 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                 DATA_S2TYPE, NARROWto2S)        \
 {                                                               \
    PREAMBLE(DATA_BITS);                                         \
    { Long cf, pf, af, zf, sf, of;                               \
      /* Full product, formed in the double-width type. */       \
      DATA_S2TYPE rr                                             \
         = NARROWto2S(                                           \
              ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
              * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
      /* Take both halves from rr.  Multiplying at operand  */   \
      /* width is unsafe: for the 32-bit case Int * Int can */   \
      /* overflow, which is undefined behaviour.            */   \
      DATA_STYPE  lo = NARROWtoS(rr);                            \
      DATA_STYPE  hi = NARROWtoS(rr >>/*s*/ DATA_BITS);          \
      cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
      pf = parity_table[(UChar)lo];                              \
      af = 0; /* undefined */                                    \
      zf = (lo == 0) << 6;                                       \
      sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
      of = cf << 11;                                             \
      return cf | pf | af | zf | sf | of;                        \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for an unsigned 64 x 64 multiply of DEP1 and DEP2, using
    mullU64 for the 128-bit product.  C and O are both set when the
    upper 64 bits are nonzero; P, Z and S come from the lower 64 bits.
    The A flag is architecturally undefined and is returned as zero. */
 #define ACTIONS_UMULQ                                           \
 {                                                               \
    PREAMBLE(64);                                                \
    { ULong prodHi, prodLo;                                      \
      Long  carry, parity, zero, sign, oflow;                    \
      mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2,                   \
               &prodHi, &prodLo );                               \
      carry  = (prodHi != 0);                                    \
      oflow  = carry << 11;                                      \
      parity = parity_table[(UChar)prodLo];                      \
      zero   = (prodLo == 0) << 6;                               \
      sign   = lshift(prodLo, 8 - 64) & 0x80;                    \
      return carry | parity | zero | sign | oflow;               \
    }                                                            \
 }

 /*-------------------------------------------------------------*/

 /* Flags for a signed 64 x 64 multiply of DEP1 and DEP2, using
    mullS64 for the 128-bit product.  C and O are both set when the
    upper 64 bits are not just the sign extension of the lower 64
    bits; P, Z and S come from the lower 64 bits.  The A flag is
    architecturally undefined and is returned as zero. */
 #define ACTIONS_SMULQ                                           \
 {                                                               \
    PREAMBLE(64);                                                \
    { Long prodHi, prodLo;                                       \
      Long carry, parity, zero, sign, oflow;                     \
      mullS64( (Long)CC_DEP1, (Long)CC_DEP2,                     \
               &prodHi, &prodLo );                               \
      carry  = (prodHi != (prodLo >>/*s*/ (64-1)));              \
      oflow  = carry << 11;                                      \
      parity = parity_table[(UChar)prodLo];                      \
      zero   = (prodLo == 0) << 6;                               \
      sign   = lshift(prodLo, 8 - 64) & 0x80;                    \
      return carry | parity | zero | sign | oflow;               \
    }                                                            \
 }


 #if PROFILE_RFLAGS

 /* Profiling state, only compiled when PROFILE_RFLAGS is nonzero. */

 /* Set by initCounts(); the helpers call initCounts() on first use
    while this is still False. */
 static Bool initted     = False;

 /* C flag, fast route */
 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
 /* C flag, slow route */
 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
 /* table for calculate_cond */
 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
 /* total entry counts for calc_all, calc_c, calc_cond. */
 static UInt n_calc_all  = 0;
 static UInt n_calc_c    = 0;
 static UInt n_calc_cond = 0;

 /* True whenever the low 22 bits of the combined call count are all
    zero, ie. once every 0x400000 calls across the three helpers. */
 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


 /* Print the profiling counters: a totals line, a column header, and
    then one row per cc_op giving the slow/fast carry-flag counts
    followed by the 16 per-condition counts. */
 static void showCounts ( void )
 {
    Int op, co;

    vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
               n_calc_all, n_calc_cond, n_calc_c);

    vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
               "    S   NS    P   NP    L   NL   LE  NLE\n");
    vex_printf("     -----------------------------------------------------"
               "----------------------------------------\n");

    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
       /* Op 0 carries no size suffix.  After it the ops are labelled
          in repeating groups of four: B, W, L, Q. */
       Char ch = ' ';
       if (op > 0) {
          switch ((op-1) % 4) {
             case 0:  ch = 'B'; break;
             case 1:  ch = 'W'; break;
             case 2:  ch = 'L'; break;
             default: ch = 'Q'; break;
          }
       }

       vex_printf("%2d%c: ", op, ch);
       vex_printf("%6u ", tabc_slow[op]);
       vex_printf("%6u ", tabc_fast[op]);

       for (co = 0; co < 16; co++) {
          Int n = tab_cond[op][co];
          if (n < 0) {
             /* counter no longer fits in an Int: leave the cell blank */
             vex_printf("     ");
          } else if (n < 1000) {
             vex_printf(" %3d ", n );
          } else {
             /* large counts are shown in thousands */
             vex_printf(" %3dK", n / 1000);
          }
       }
       vex_printf("\n");
    }
    vex_printf("\n");
 }

 /* Zero all per-op profiling tables and record that this has been
    done, so the helpers do not reinitialise on later calls. */
 static void initCounts ( void )
 {
    Int op, co;
    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
       tabc_slow[op] = 0;
       tabc_fast[op] = 0;
       for (co = 0; co < 16; co++) {
          tab_cond[op][co] = 0;
       }
    }
    initted = True;
 }

 #endif /* PROFILE_RFLAGS */


 /* NOT CALLED DIRECTLY FROM GENERATED CODE: worker for the clean
    helpers below. */
 /* Calculate all the 6 flags from the supplied thunk parameters.
    Dispatches on cc_op to the matching ACTIONS_* macro.  Each macro
    expands to a block that ends in a return, so no case falls through
    to the next.  The parameter names cc_dep1_formal, cc_dep2_formal
    and cc_ndep_formal are referenced by name from PREAMBLE and must
    not be changed.  An unrecognised cc_op is reported and then
    vpanic is called. */
 static
 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
                                         ULong cc_dep1_formal,
                                         ULong cc_dep2_formal,
                                         ULong cc_ndep_formal )
 {
    switch (cc_op) {
       /* COPY: DEP1 already holds the flags; keep only the six
          flag bits computed by this helper. */
       case AMD64G_CC_OP_COPY:
          return cc_dep1_formal
                 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
                    | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);

       case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
       case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
       case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
       case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );

       case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
       case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
       case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
       case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );

       case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
       case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
       case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
       case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );

       case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
       case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
       case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
       case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );

       case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
       case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
       case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
       case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );

       case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
       case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
       case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
       case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );

       case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
       case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
       case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
       case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );

       case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
       case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
       case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
       case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );

       case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
       case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
       case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
       case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );

       case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
       case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
       case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
       case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );

       case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
       case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
       case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
       case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );

       /* Widening multiplies: the extra arguments are the
          double-width type and the narrowing functions for the
          single and double widths. */
       case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                   UShort, toUShort );
       case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                   UInt,   toUInt );
       case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                   ULong,  idULong );

       /* No 128-bit type is used, so the 64-bit multiplies have
          their own macros built on mullU64 / mullS64. */
       case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;

       case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                   Short,  toUShort );
       case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
                                                   Int,    toUInt   );
       case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                   Long,   idULong );

       case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;

       default:
          /* shouldn't really make these calls from generated code */
          vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
                     "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                     cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
          vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
    }
 }


 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Calculate all the 6 flags from the supplied thunk parameters.
    Thin wrapper round the worker; when PROFILE_RFLAGS is enabled it
    also bumps the call counter and periodically prints the counts. */
 ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                     ULong cc_dep1,
                                     ULong cc_dep2,
                                     ULong cc_ndep )
 {
    ULong all_flags;
 #  if PROFILE_RFLAGS
    if (!initted)
       initCounts();
    n_calc_all++;
    if (SHOW_COUNTS_NOW)
       showCounts();
 #  endif
    all_flags
       = amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
    return all_flags;
 }


 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Calculate just the carry flag from the supplied thunk parameters.
    A few ops are answered directly; everything else goes through the
    full worker and has the C bit masked out of the result.  Under
    PROFILE_RFLAGS a call is first counted as "fast" and moved to the
    "slow" count if the worker turns out to be needed. */
 ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
 {
    ULong all_flags;

 #  if PROFILE_RFLAGS
    if (!initted)
       initCounts();
    n_calc_c++;
    tabc_fast[cc_op]++;
    if (SHOW_COUNTS_NOW)
       showCounts();
 #  endif

    /* Fast cases.  COPY: the carry is read straight out of DEP1. */
    if (cc_op == AMD64G_CC_OP_COPY)
       return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;

    /* Logical ops never produce a carry. */
    if (cc_op == AMD64G_CC_OP_LOGICQ
        || cc_op == AMD64G_CC_OP_LOGICL
        || cc_op == AMD64G_CC_OP_LOGICW
        || cc_op == AMD64G_CC_OP_LOGICB)
       return 0;

    /* Further fast cases, currently disabled:
       //      case AMD64G_CC_OP_SUBL:
       //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
       //                   ? AMD64G_CC_MASK_C : 0;
       //      case AMD64G_CC_OP_SUBW:
       //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
       //                   ? AMD64G_CC_MASK_C : 0;
       //      case AMD64G_CC_OP_SUBB:
       //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
       //                   ? AMD64G_CC_MASK_C : 0;
       //      case AMD64G_CC_OP_INCL:
       //      case AMD64G_CC_OP_DECL:
       //         return cc_ndep & AMD64G_CC_MASK_C;
    */

 #  if PROFILE_RFLAGS
    tabc_fast[cc_op]--;
    tabc_slow[cc_op]++;
 #  endif

    /* Slow route: compute every flag, keep only C. */
    all_flags = amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep);
    return all_flags & AMD64G_CC_MASK_C;
 }


 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Evaluate condition code 'cond' against the flags described by the
    thunk; returns 1 or 0.  Each pair of cases tests the same flag
    expression, and bit 0 of 'cond' selects the inverted sense.  All
    flags are computed by the worker first, so an invalid cc_op is
    reported there before 'cond' is examined.  An unrecognised 'cond'
    is reported and then vpanic is called. */
 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                    ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
 {
    ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                   cc_dep2, cc_ndep);
    ULong invert = cond & 1;
    /* Bit 0 holds the un-inverted truth value; higher bits are junk
       and are masked off at the final return. */
    ULong truth  = 0;

 #  if PROFILE_RFLAGS
    if (!initted)
       initCounts();
    tab_cond[cc_op][cond]++;
    n_calc_cond++;
    if (SHOW_COUNTS_NOW)
       showCounts();
 #  endif

    switch (cond) {
       case AMD64CondO:   /* OF == 1 */
       case AMD64CondNO:
          truth = rflags >> AMD64G_CC_SHIFT_O;
          break;

       case AMD64CondZ:   /* ZF == 1 */
       case AMD64CondNZ:
          truth = rflags >> AMD64G_CC_SHIFT_Z;
          break;

       case AMD64CondB:   /* CF == 1 */
       case AMD64CondNB:
          truth = rflags >> AMD64G_CC_SHIFT_C;
          break;

       case AMD64CondBE:  /* (CF or ZF) == 1 */
       case AMD64CondNBE:
          truth = (rflags >> AMD64G_CC_SHIFT_C)
                  | (rflags >> AMD64G_CC_SHIFT_Z);
          break;

       case AMD64CondS:   /* SF == 1 */
       case AMD64CondNS:
          truth = rflags >> AMD64G_CC_SHIFT_S;
          break;

       case AMD64CondP:   /* PF == 1 */
       case AMD64CondNP:
          truth = rflags >> AMD64G_CC_SHIFT_P;
          break;

       case AMD64CondL:   /* (SF xor OF) == 1 */
       case AMD64CondNL:
          truth = (rflags >> AMD64G_CC_SHIFT_S)
                  ^ (rflags >> AMD64G_CC_SHIFT_O);
          break;

       case AMD64CondLE:  /* ((SF xor OF) or ZF) == 1 */
       case AMD64CondNLE:
          truth = ((rflags >> AMD64G_CC_SHIFT_S)
                   ^ (rflags >> AMD64G_CC_SHIFT_O))
                  | (rflags >> AMD64G_CC_SHIFT_Z);
          break;

       default:
          /* shouldn't really make these calls from generated code */
          vex_printf("amd64g_calculate_condition"
                     "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                     cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
          vpanic("amd64g_calculate_condition");
    }

    return 1 & (invert ^ truth);
 }


 /* VISIBLE TO LIBVEX CLIENT */
 /* Assemble an %rflags value from the guest state: the flags computed
    from the condition-code thunk, with bit 10 set when guest_DFLAG is
    -1 and bit 21 set when guest_IDFLAG is 1.  guest_DFLAG must be
    either 1 or -1 (asserted). */
 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/VexGuestAMD64State* vex_state )
 {
    ULong flags = amd64g_calculate_rflags_all_WRK(
                     vex_state->guest_CC_OP,
                     vex_state->guest_CC_DEP1,
                     vex_state->guest_CC_DEP2,
                     vex_state->guest_CC_NDEP
                  );
    Long  direction = vex_state->guest_DFLAG;

    vassert(direction == 1 || direction == -1);

    flags |= (direction == -1)             ? (1<<10) : 0;
    flags |= (vex_state->guest_IDFLAG == 1) ? (1<<21) : 0;

    return flags;
 }


 /*---------------------------------------------------------------*/
 /*--- %rflags translation-time function specialisers.         ---*/
 /*--- These help iropt specialise calls to the above run-time ---*/
 /*--- %rflags functions.                                      ---*/
 /*---------------------------------------------------------------*/

 /* guest_amd64_spechelper is used by the optimiser to try
    specialisations.  It returns an equivalent expression, or NULL if
    none. */

 /* Tests whether 'e' is a constant expression whose constant is tagged
    Ico_U64 and has the value 'n'. */
 static Bool isU64 ( IRExpr* e, ULong n )
 {
    if (e->tag != Iex_Const)
       return toBool(0);
    if (e->Iex.Const.con->tag != Ico_U64)
       return toBool(0);
    return toBool( e->Iex.Const.con->Ico.U64 == n );
 }

 /* Translation-time specialiser for calls to the %rflags helpers.

    function_name  name of the helper being called
    args           NULL-terminated vector of the call's argument
                   expressions

    Two helpers are handled: "amd64g_calculate_condition" (5 args:
    cond, cc_op, cc_dep1, cc_dep2, cc_ndep) and
    "amd64g_calculate_rflags_c" (4 args: cc_op, cc_dep1, cc_dep2,
    cc_ndep); the arity is asserted.  When cc_op (and, for the
    condition helper, cond) are constants matching one of the rules
    below, an equivalent IR expression that avoids the call is
    returned.  Otherwise the result is NULL. */
 IRExpr* guest_amd64_spechelper ( HChar* function_name,
                                  IRExpr** args )
 {
 #  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
 #  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
 #  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
 #  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

    /* Count the arguments; the vector is NULL-terminated. */
    Int i, arity = 0;
    for (i = 0; args[i]; i++)
       arity++;
 #  if 0
    vex_printf("spec request:\n");
    vex_printf("   %s  ", function_name);
    for (i = 0; i < arity; i++) {
       vex_printf("  ");
       ppIRExpr(args[i]);
    }
    vex_printf("\n");
 #  endif

    /* --------- specialising "amd64g_calculate_condition" --------- */

    if (vex_streq(function_name, "amd64g_calculate_condition")) {
       /* specialise calls to above "calculate condition" function */
       IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
       vassert(arity == 5);
       cond    = args[0];
       cc_op   = args[1];
       cc_dep1 = args[2];
       cc_dep2 = args[3];

       /*---------------- ADDQ ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
          /* long long add, then Z --> test (dst+src == 0) */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ64,
                            binop(Iop_Add64, cc_dep1, cc_dep2),
                            mkU64(0)));
       }

       /*---------------- SUBQ ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
          /* long long sub/cmp, then Z --> test dst==src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
          /* long long sub/cmp, then NZ --> test dst!=src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpNE64,cc_dep1,cc_dep2));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
          /* long long sub/cmp, then L (signed less than)
             --> test dst <s src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
          /* long long sub/cmp, then B (unsigned less than)
             --> test dst <u src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
          /* long long sub/cmp, then NB (unsigned greater than or equal)
             --> test src <=u dst */
          /* Note, args are opposite way round from the usual */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
          /* long long sub/cmp, then BE (unsigned less than or equal)
             --> test dst <=u src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
       }

       /*---------------- SUBL ----------------*/
       /* The 32-bit rules shift both operands left by 32 so that only
          the low 32 bits take part in the 64-bit comparison. */

       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
          /* long sub/cmp, then Z --> test dst==src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ64,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            binop(Iop_Shl64,cc_dep2,mkU8(32))));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
          /* long sub/cmp, then NZ --> test dst!=src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpNE64,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            binop(Iop_Shl64,cc_dep2,mkU8(32))));
       }

 //..       if (isU32(cc_op, AMD64G_CC_OP_SUBL) && isU32(cond, X86CondNZ)) {
 //..          /* long sub/cmp, then NZ --> test dst!=src */
 //..          return unop(Iop_1Uto32,
 //..                      binop(Iop_CmpNE32, cc_dep1, cc_dep2));
 //..       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
          /* long sub/cmp, then L (signed less than)
             --> test dst <s src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLT64S,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            binop(Iop_Shl64,cc_dep2,mkU8(32))));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
          /* long sub/cmp, then LE (signed less than or equal)
             --> test dst <=s src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLE64S,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            binop(Iop_Shl64,cc_dep2,mkU8(32))));

       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
          /* long sub/cmp, then BE (unsigned less than or equal)
             --> test dst <=u src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLE64U,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            binop(Iop_Shl64,cc_dep2,mkU8(32))));
       }

 //..       if (isU32(cc_op, AMD64G_CC_OP_SUBL) && isU32(cond, X86CondB)) {
 //..          /* long sub/cmp, then B (unsigned less than)
 //..             --> test dst <u src */
 //..          return unop(Iop_1Uto32,
 //..                      binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
 //..       }

       /*---------------- SUBW ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
          /* word sub/cmp, then Z --> test dst==src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ16,
                            unop(Iop_64to16,cc_dep1),
                            unop(Iop_64to16,cc_dep2)));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
          /* word sub/cmp, then LE (signed less than or equal)
             --> test dst <=s src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLE64S,
                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
                            binop(Iop_Shl64,cc_dep2,mkU8(48))));

       }

       /*---------------- SUBB ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
          /* byte sub/cmp, then Z --> test dst==src */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ8,
                            unop(Iop_64to8,cc_dep1),
                            unop(Iop_64to8,cc_dep2)));
       }

       if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
                                           && isU64(cc_dep2, 0)) {
          /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                          --> test dst <s 0
                                          --> (ULong)dst[7]
             This is yet another scheme by which gcc figures out if the
             top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
          /* Note: isU64(cc_dep2, 0) is correct, even though this is
             for an 8-bit comparison, since the args to the helper
             function are always U64s. */
          return binop(Iop_And64,
                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
                       mkU64(1));
       }

 //      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
 //         /* byte sub/cmp, then NZ --> test dst!=src */
 //         return unop(Iop_32Uto64,
 //                unop(Iop_1Uto32,
 //                     binop(Iop_CmpNE8,
 //                           unop(Iop_32to8,unop(Iop_64to32,cc_dep1)),
 //                           unop(Iop_32to8,unop(Iop_64to32,cc_dep2)))));
 //      }

 //..       if (isU32(cc_op, AMD64G_CC_OP_SUBB) && isU32(cond, X86CondNBE)) {
 //..          /* long sub/cmp, then NBE (unsigned greater than)
 //..             --> test src <u dst */
 //..          /* Note, args are opposite way round from the usual */
 //..          return unop(Iop_1Uto32,
 //..                      binop(Iop_CmpLT32U,
 //..                            binop(Iop_And32,cc_dep2,mkU32(0xFF)),
 //..                            binop(Iop_And32,cc_dep1,mkU32(0xFF))));
 //..       }

       /*---------------- LOGICQ ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
          /* long long and/or/xor, then Z --> test dst==0 */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
       }

       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
          /* long long and/or/xor, then L
             LOGIC sets SF and ZF according to the
             result and makes OF be zero.  L computes SF ^ OF, but
             OF is zero, so this reduces to SF -- which will be 1 iff
             the result is < signed 0.  Hence ...
          */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLT64S,
                            cc_dep1,
                            mkU64(0)));
       }

       /*---------------- LOGICL ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
          /* long and/or/xor, then Z --> test dst==0 */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ64,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            mkU64(0)));
       }

       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
          /* long and/or/xor, then NZ --> test dst!=0 */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpNE64,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            mkU64(0)));
       }

 //..       if (isU32(cc_op, AMD64G_CC_OP_LOGICL) && isU32(cond, X86CondS)) {
 //..          /* long and/or/xor, then S --> test dst <s 0 */
 //..          return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
 //..       }

       if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
          /* long and/or/xor, then LE
             This is pretty subtle.  LOGIC sets SF and ZF according to the
             result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
             OF is zero, so this reduces to SF | ZF -- which will be 1 iff
             the result is <=signed 0.  Hence ...
          */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLE64S,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            mkU64(0)));
       }

 //..       if (isU32(cc_op, AMD64G_CC_OP_LOGICL) && isU32(cond, X86CondBE)) {
 //..          /* long and/or/xor, then BE
 //..             LOGIC sets ZF according to the result and makes CF be zero.
 //..             BE computes (CF | ZF), but CF is zero, so this reduces ZF
 //..             -- which will be 1 iff the result is zero.  Hence ...
 //..          */
 //..          return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
 //..       }
 //..
 //..       /*---------------- LOGICW ----------------*/
 //..
 //..       if (isU32(cc_op, AMD64G_CC_OP_LOGICW) && isU32(cond, X86CondZ)) {
 //..          /* byte and/or/xor, then Z --> test dst==0 */
 //..          return unop(Iop_1Uto32,
 //..                      binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(0xFFFF)),
 //..                                         mkU32(0)));
 //..       }

       /*---------------- LOGICB ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
          /* byte and/or/xor, then Z --> test dst==0 */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                         mkU64(0)));
       }

       if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
          /* this is an idiom gcc sometimes uses to find out if the top
             bit of a byte register is set: eg testb %al,%al; js ..
             Since it just depends on the top bit of the byte, extract
             that bit and explicitly get rid of all the rest.  This
             helps memcheck avoid false positives in the case where any
             of the other bits in the byte are undefined. */
          /* byte and/or/xor, then S --> (UInt)result[7] */
          return binop(Iop_And64,
                       binop(Iop_Shr64,cc_dep1,mkU8(7)),
                       mkU64(1));
       }

       /*---------------- INCB ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
          /* 8-bit inc, then LE --> sign bit of the arg.
             cc_dep1 is the result of the increment, so the arg is
             cc_dep1 - 1.  LE is (SF ^ OF) | ZF, which says that the
             exact (unwrapped) value arg+1 is <= 0, i.e. arg <s 0.
             Testing "result <=s 0" instead gives the wrong answer when
             the increment overflows (0x7F -> 0x80): there SF == OF == 1
             and ZF == 0, so LE is false, yet 0x80 is negative as a
             signed byte.  Only bit 7 of (cc_dep1 - 1) is used, and it
             depends solely on the low 8 bits of cc_dep1. */
          return binop(Iop_And64,
                       binop(Iop_Shr64,
                             binop(Iop_Sub64, cc_dep1, mkU64(1)),
                             mkU8(7)),
                       mkU64(1));
       }

       /*---------------- DECL ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
          /* dec L, then Z --> test dst == 0 */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpEQ64,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            mkU64(0)));
       }

       /*---------------- DECW ----------------*/

       if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
          /* 16-bit dec, then NZ --> test dst != 0 */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpNE64,
                            binop(Iop_Shl64,cc_dep1,mkU8(48)),
                            mkU64(0)));
       }

 //..       /*---------------- DECL ----------------*/
 //..
 //..       if (isU32(cc_op, AMD64G_CC_OP_DECL) && isU32(cond, X86CondZ)) {
 //..          /* dec L, then Z --> test dst == 0 */
 //..          return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
 //..       }
 //..
 //..       if (isU32(cc_op, AMD64G_CC_OP_DECL) && isU32(cond, X86CondS)) {
 //..          /* dec L, then S --> compare DST <s 0 */
 //..          return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
 //..       }
 //..
 //..       /*---------------- SHRL ----------------*/
 //..
 //..       if (isU32(cc_op, AMD64G_CC_OP_SHRL) && isU32(cond, X86CondZ)) {
 //..          /* SHRL, then Z --> test dep1 == 0 */
 //..          return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
 //..       }

       /*---------------- COPY ----------------*/
       /* This can happen, as a result of amd64 FP compares: "comisd ... ;
          jbe" for example. */

       if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
           (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
          /* COPY, then BE --> extract C and Z from dep1, and test (C
             or Z == 1). */
          /* COPY, then NBE --> extract C and Z from dep1, and test (C
             or Z == 0). */
          ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
          return
             unop(
                Iop_1Uto64,
                binop(
                   Iop_CmpEQ64,
                   binop(
                      Iop_And64,
                      binop(
                         Iop_Or64,
                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                         binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
                      ),
                      mkU64(1)
                   ),
                   mkU64(nnn)
                )
             );
       }

       if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
          /* COPY, then B --> extract C dep1, and test (C == 1). */
          return
             unop(
                Iop_1Uto64,
                binop(
                   Iop_CmpNE64,
                   binop(
                      Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                      mkU64(1)
                   ),
                   mkU64(0)
                )
             );
       }

       /* No rule matched: leave the call alone. */
       return NULL;
    }

    /* --------- specialising "amd64g_calculate_rflags_c" --------- */

    if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
       /* specialise calls to above "calculate_rflags_c" function */
       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
       vassert(arity == 4);
       cc_op   = args[0];
       cc_dep1 = args[1];
       cc_dep2 = args[2];
       cc_ndep = args[3];

       if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
          /* C after sub denotes unsigned less than */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLT64U,
                            cc_dep1,
                            cc_dep2));
       }
       if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
          /* C after sub denotes unsigned less than */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLT64U,
                            binop(Iop_Shl64,cc_dep1,mkU8(32)),
                            binop(Iop_Shl64,cc_dep2,mkU8(32))));
       }
       if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
          /* C after sub denotes unsigned less than */
          return unop(Iop_1Uto64,
                      binop(Iop_CmpLT64U,
                            binop(Iop_And64,cc_dep1,mkU64(0xFF)),
                            binop(Iop_And64,cc_dep2,mkU64(0xFF))));
       }
       if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
           || isU64(cc_op, AMD64G_CC_OP_LOGICL)
           || isU64(cc_op, AMD64G_CC_OP_LOGICW)
           || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
          /* cflag after logic is zero */
          return mkU64(0);
       }
       if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
           || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
          return cc_ndep;
       }
 //..       if (isU64(cc_op, AMD64G_CC_OP_COPY)) {
 //..          /* cflag after COPY is stored in DEP1. */
 //..          return
 //..             binop(
 //..                Iop_And64,
 //..                binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
 //..                mkU64(1)
 //..             );
 //..       }
 //.. #     if 0
 //..       if (cc_op->tag == Iex_Const) {
 //..          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
 //..       }
 //.. #     endif

       /* No rule matched: leave the call alone. */
       return NULL;
    }

 //..    /* --------- specialising "x86g_calculate_rflags_all" --------- */
 //..
 //..    if (vex_streq(function_name, "x86g_calculate_rflags_all")) {
 //..       /* specialise calls to above "calculate_rflags_all" function */
 //..       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
 //..       vassert(arity == 4);
 //..       cc_op   = args[0];
 //..       cc_dep1 = args[1];
 //..       cc_dep2 = args[2];
 //..       cc_ndep = args[3];
 //..
 //..       if (isU32(cc_op, AMD64G_CC_OP_COPY)) {
 //..          /* eflags after COPY are stored in DEP1. */
 //..          return
 //..             binop(
 //..                Iop_And32,
 //..                cc_dep1,
 //..                mkU32(AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
 //..                      | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P)
 //..             );
 //..       }
 //..       return NULL;
 //..   }

 #  undef unop
 #  undef binop
 #  undef mkU64
 #  undef mkU8

    /* Not a helper this function knows how to specialise. */
    return NULL;
 }


 /*---------------------------------------------------------------*/
 /*--- Supporting functions for x87 FPU activities.            ---*/
 /*---------------------------------------------------------------*/

 static inline Bool host_is_little_endian ( void )
 {
    UInt x = 0x76543210;
    UChar* p = (UChar*)(&x);
    return toBool(*p == 0x10);
 }

 /* Inspect a value and its tag, as per the x87 'FXAM' instruction.

    tag  zero means the register is empty
    dbl  the register contents as a 64-bit IEEE754 bit pattern

    Returns the classification as C3,C2,C1,C0 bits, with C1 always
    holding the sign bit of dbl:

       empty     1,0,sign,1
       zero      1,0,sign,0
       denormal  1,1,sign,0
       infinity  0,1,sign,1
       NaN       0,0,sign,1
       normal    0,1,sign,0
 */
 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
 {
    const ULong fractionMask = 0x000FFFFFFFFFFFFFULL;  /* bits 51:0 */
    ULong       result;
    Int         expo;
    Bool        fractionIsZero;

    /* The fields are picked out of dbl by value here, but the
       little-endian host requirement is still enforced as before. */
    vassert(host_is_little_endian());

    /* C1 is the sign in every case, so start from that. */
    result = ((dbl >> 63) & 1) << AMD64G_FC_SHIFT_C1;

    /* An empty register is classified from the tag alone. */
    if (tag == 0)
       return result | AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C0;

    expo           = (Int)((dbl >> 52) & 0x7FF);      /* bits 62:52 */
    fractionIsZero = toBool((dbl & fractionMask) == 0);

    if (expo == 0) {
       /* Zero when the fraction is zero too, otherwise a denormal. */
       result |= AMD64G_FC_MASK_C3;
       if (!fractionIsZero)
          result |= AMD64G_FC_MASK_C2;
       return result;
    }

    if (expo == 0x7FF) {
       /* Infinity when the fraction is zero, otherwise a NaN. */
       result |= AMD64G_FC_MASK_C0;
       if (fractionIsZero)
          result |= AMD64G_FC_MASK_C2;
       return result;
    }

    /* Anything else is a normal finite number. */
    return result | AMD64G_FC_MASK_C2;
 }


 /* DIRTY HELPER (writes guest state) */
 /* Initialise the x87 FPU state as per 'finit': FTOP and FC3210 are
    zeroed, rounding is set to nearest, and all eight registers are
    tagged empty and zeroed. */
 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
 {
    Int reg;

    gst->guest_FTOP    = 0;
    gst->guest_FC3210  = 0;
    gst->guest_FPROUND = (ULong)Irrm_NEAREST;

    for (reg = 7; reg >= 0; reg--) {
       gst->guest_FPREG[reg] = 0; /* IEEE754 64-bit zero */
       gst->guest_FPTAG[reg] = 0; /* empty */
    }
 }


 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (reads guest memory) */
 /* Convert the little-endian F80 value at address addrU to a
    little-endian F64 and return its bit pattern. */
 ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
 {
    ULong  result;
    UChar* src = (UChar*)ULong_to_Ptr(addrU);
    UChar* dst = (UChar*)&result;

    convert_f80le_to_f64le ( src, dst );
    return result;
 }

 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (writes guest memory) */
 /* Convert the little-endian F64 bit pattern f64 to a little-endian
    F80 and write it at address addrU. */
 void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
 {
    UChar* src = (UChar*)&f64;
    UChar* dst = (UChar*)ULong_to_Ptr(addrU);

    convert_f64le_to_f80le( src, dst );
 }


 /* CALLED FROM GENERATED CODE */
 /* CLEAN HELPER */
 /* mxcsr[15:0] contains a SSE native format MXCSR value.
    Extract from it the required SSEROUND value and any resulting
    emulation warning, and return (warn << 32) | sseround value.

    At most one warning is reported, checked in this order: any of
    bits 12:7 clear (unmasked exceptions), bit 15 set (FZ), bit 6 set
    (DAZ).
 */
 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
 {
    /* mxcsr[14:13] holds the rounding mode. */
    /* NOTE, encoded exactly as per enum IRRoundingMode. */
    ULong     rounding = (mxcsr >> 13) & 3;
    VexEmWarn warning;

    if ((mxcsr & 0x1F80) != 0x1F80)
       warning = EmWarn_X86_sseExns;   /* unmasked exceptions! */
    else if (mxcsr & (1<<15))
       warning = EmWarn_X86_fz;        /* FZ is set */
    else if (mxcsr & (1<<6))
       warning = EmWarn_X86_daz;       /* DAZ is set */
    else
       warning = EmWarn_NONE;

    return (((ULong)warning) << 32) | ((ULong)rounding);
 }


 /* CALLED FROM GENERATED CODE */
 /* CLEAN HELPER */
 /* Given sseround as an IRRoundingMode value, create a suitable SSE
    native format MXCSR value: 0x1F80 with the low two bits of
    sseround placed at bits 14:13. */
 ULong amd64g_create_mxcsr ( ULong sseround )
 {
    ULong roundingField = (sseround & 3) << 13;
    return 0x1F80 | roundingField;
 }


 /* CLEAN HELPER */
 /* fpucw[15:0] contains a x87 native format FPU control word.
    Extract from it the required FPROUND value and any resulting
    emulation warning, and return (warn << 32) | fpround value.

    At most one warning is reported: unmasked exceptions (any of bits
    5:0 clear) take priority over an unsupported precision setting
    (bits 9:8 not both set).
 */
 ULong amd64g_check_fldcw ( ULong fpucw )
 {
    /* fpucw[11:10] holds the rounding mode. */
    /* NOTE, encoded exactly as per enum IRRoundingMode. */
    ULong     rounding = (fpucw >> 10) & 3;
    VexEmWarn warning;

    if ((fpucw & 0x3F) != 0x3F)
       warning = EmWarn_X86_x87exns;        /* unmasked exceptions! */
    else if (((fpucw >> 8) & 3) != 3)
       warning = EmWarn_X86_x87precision;   /* unsupported precision */
    else
       warning = EmWarn_NONE;

    return (((ULong)warning) << 32) | ((ULong)rounding);
 }


 /* CLEAN HELPER */
 /* Given fpround as an IRRoundingMode value, create a suitable x87
    native format FPU control word: 0x037F with the low two bits of
    fpround placed at bits 11:10. */
 ULong amd64g_create_fpucw ( ULong fpround )
 {
    ULong roundingField = (fpround & 3) << 10;
    return 0x037F | roundingField;
 }


 /* This is used to implement 'fldenv'.
    Reads 28 bytes at x87_state[0 .. 27].

    From the x87 environment image it sets the guest's register tags,
    FTOP, FC3210 and FPROUND.  The return value is the emulation
    warning (if any) that amd64g_check_fldcw produces for the control
    word in the image. */
 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER */
 VexEmWarn amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
                                       /*IN*/HWord x87_state)
 {
    Fpu_State* image   = (Fpu_State*)x87_state;
    UChar*     tags    = (UChar*)(&vex_state->guest_FPTAG[0]);
    UInt       top     = (image->env[FP_ENV_STAT] >> 11) & 7;
    UInt       tagword = image->env[FP_ENV_TAG];
    UInt       ctrl    = image->env[FP_ENV_CTRL];
    ULong      cond    = image->env[FP_ENV_STAT] & 0x4700;
    ULong      packed;
    Int        k;

    /* Copy tags.  The image has a 2-bit field per physical register,
       where 3 means empty; the guest state keeps 0 (empty) or 1
       (non-empty). */
    for (k = 0; k < 8; k++) {
       Int  phys  = (k + top) & 7;
       UInt field = (tagword >> (2*phys)) & 3;
       tags[phys] = (field == 3) ? 0 : 1;
    }

    /* stack pointer */
    vex_state->guest_FTOP = top;

    /* status word */
    vex_state->guest_FC3210 = cond;

    /* handle the control word, setting FPROUND and detecting any
       emulation warnings.  The helper returns the rounding mode in
       the low 32 bits and the warning in the high 32 bits. */
    packed = amd64g_check_fldcw ( (ULong)ctrl );
    vex_state->guest_FPROUND = (packed & 0xFFFFFFFFULL) & 3;

    /* emulation warnings --> caller */
    return (VexEmWarn)(packed >> 32);
 }


 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER */
 /* Create an x87 FPU env from the guest state, as close as we can
    approximate it.  Writes 28 bytes at x87_state[0..27].

    All 14 env words are cleared, words 1, 3, 5 and 13 are set to
    0xFFFF, and then the status, control and tag words are filled in
    from FTOP/FC3210, FPROUND and the register tags respectively. */
 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
                                  /*OUT*/HWord x87_state )
 {
    Fpu_State* image   = (Fpu_State*)x87_state;
    UChar*     tags    = (UChar*)(&vex_state->guest_FPTAG[0]);
    UInt       top     = vex_state->guest_FTOP;
    ULong      cond    = vex_state->guest_FC3210;
    UInt       tagword = 0;
    Int        k;

    for (k = 0; k < 14; k++)
       image->env[k] = 0;

    image->env[13] = 0xFFFF;
    image->env[5]  = 0xFFFF;
    image->env[3]  = 0xFFFF;
    image->env[1]  = 0xFFFF;

    image->env[FP_ENV_STAT]
       = toUShort(toUInt( ((top & 7) << 11) | (cond & 0x4700) ));
    image->env[FP_ENV_CTRL]
       = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));

    /* Compute the x87 tag word: an empty register gets the 2-bit
       field 3, a full one keeps the field at 0. */
    for (k = 0; k < 8; k++) {
       Int phys = (k + top) & 7;
       if (tags[phys] == 0)
          tagword |= (3 << (2*phys));
    }
    image->env[FP_ENV_TAG] = toUShort(tagword);

    /* We don't dump the x87 registers, tho. */
 }


 /*---------------------------------------------------------------*/
 /*--- Misc integer helpers, including rotates and CPUID.      ---*/
 /*---------------------------------------------------------------*/

 /* Claim to be the following CPU:
    vendor_id       : AuthenticAMD
    cpu family      : 15
    model           : 12
    model name      : AMD Athlon(tm) 64 Processor 3200+
    stepping        : 0
    cpu MHz         : 2202.917
    cache size      : 512 KB
    fpu             : yes
    fpu_exception   : yes
    cpuid level     : 1
    wp              : yes
    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
                      pge mca cmov pat pse36 clflush mmx fxsr sse sse2
                      pni syscall nx mmxext lm 3dnowext 3dnow
    bogomips        : 4308.99
    TLB size        : 1088 4K pages
    clflush size    : 64
    cache_alignment : 64
    address sizes   : 40 bits physical, 48 bits virtual
    power management: ts fid vid ttp

    The leaf is taken from the low 32 bits of guest_RAX.  The canned
    values for that leaf are written to guest_RAX, guest_RBX, guest_RCX
    and guest_RDX; a leaf not listed below yields all zeroes.
 */
 void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* st )
 {
    /* Results for RAX, RBX, RCX, RDX; the zero defaults are what an
       unlisted leaf returns. */
    ULong a = 0, b = 0, c = 0, d = 0;

    switch (0xFFFFFFFF & st->guest_RAX) {
       case 0x0:
          a = 0x00000001; b = 0x68747541; c = 0x444d4163; d = 0x69746e65;
          break;
       case 0x1:
          a = 0x00000fc0; b = 0x00000800; c = 0x00000000; d = 0x078bfbff;
          break;
       case 0x80000000:
          a = 0x80000018; b = 0x68747541; c = 0x444d4163; d = 0x69746e65;
          break;
       case 0x80000001:
          a = 0x00000fc0; b = 0x0000010a; c = 0x00000000; d = 0xe1d3fbff;
          break;
       case 0x80000002:
          a = 0x20444d41; b = 0x6c687441; c = 0x74286e6f; d = 0x3620296d;
          break;
       case 0x80000003:
          a = 0x72502034; b = 0x7365636f; c = 0x20726f73; d = 0x30303233;
          break;
       case 0x80000004:
          a = 0x0000002b; b = 0x00000000; c = 0x00000000; d = 0x00000000;
          break;
       case 0x80000005:
          a = 0xff08ff08; b = 0xff20ff20; c = 0x40020140; d = 0x40020140;
          break;
       case 0x80000006:
          a = 0x00000000; b = 0x42004200; c = 0x02008140; d = 0x00000000;
          break;
       case 0x80000007:
          a = 0x00000000; b = 0x00000000; c = 0x00000000; d = 0x0000000f;
          break;
       case 0x80000008:
          a = 0x00003028; b = 0x00000000; c = 0x00000000; d = 0x00000000;
          break;
       default:
          break;
    }

    st->guest_RAX = a;
    st->guest_RBX = b;
    st->guest_RCX = c;
    st->guest_RDX = d;
 }


 /* Rotate-right-through-carry of 'arg' by 'rot_amt'.

    |szIN| is the operand size in bytes (1, 2, 4 or 8); any other
    size panics.  The sign of szIN selects what is returned: a
    negative szIN returns the updated rflags, a non-negative szIN
    returns the rotated value.

    The rotate count is rot_amt masked to 6 bits for 8-byte operands
    and to 5 bits otherwise, then reduced modulo (width in bits + 1).
    The carry-in is the C bit of rflags_in.  O is computed from the
    top bit of the incoming operand XORed with the incoming carry,
    whatever the rotate count is.  Only the C and O bits of
    rflags_in are replaced. */
 ULong amd64g_calculate_RCR ( ULong arg,
                              ULong rot_amt,
                              ULong rflags_in,
                              Long  szIN )
 {
    Bool  wantRflags = toBool(szIN < 0);
    ULong sz         = wantRflags ? (-szIN) : szIN;
    ULong count      = rot_amt & (sz == 8 ? 0x3F : 0x1F);
    ULong nbits      = 0;  /* operand width in bits; 0 == invalid size */
    ULong cf = 0, of = 0;

    switch (sz) {
       case 8: nbits = 64; break;
       case 4: nbits = 32; break;
       case 2: nbits = 16; break;
       case 1: nbits = 8;  break;
       default:
          vpanic("calculate_RCR(amd64g): invalid size");
    }

    if (nbits != 0) {
       const ULong top     = nbits - 1;          /* index of the top bit */
       const ULong lowMask = (1ULL << top) - 1;  /* bits below the top bit */

       /* Rotating through carry has period nbits+1.  For the 8- and
          4-byte cases the masked count is already below that. */
       count %= (nbits + 1);

       cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
       of = ((arg >> top) ^ cf) & 1;

       for (; count > 0; count--) {
          ULong shiftedOut = arg & 1;
          /* Old carry enters at the top; anything above the operand
             width is discarded. */
          arg = ((arg >> 1) & lowMask) | (cf << top);
          cf  = shiftedOut;
       }
    }

    cf &= 1;
    of &= 1;
    rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
    rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);

    /* The caller gets either the new flags or the new value, never
       both. */
    if (wantRflags)
       return rflags_in;
    return arg;
 }


 /* CALLED FROM GENERATED CODE */
 /* DIRTY HELPER (non-referentially-transparent) */
 /* Horrible hack.  On non-amd64 platforms, return 1. */
 ULong amd64g_dirtyhelper_RDTSC ( void )
 {
 #  if defined(__x86_64__)
    UInt  eax, edx;
    __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
    return (((ULong)edx) << 32) | ((ULong)eax);
 #  else
    return 1ULL;
 #  endif
 }


 /*---------------------------------------------------------------*/
 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
 /*---------------------------------------------------------------*/

 /* Absolute difference of two unsigned bytes. */
 static inline UChar abdU8 ( UChar xx, UChar yy ) {
    if (xx > yy)
       return toUChar(xx - yy);
    return toUChar(yy - xx);
 }

 /* Build a 64-bit value with w1 in the upper 32 bits and w0 in the
    lower 32 bits. */
 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
    ULong upper = (ULong)w1;
    ULong lower = (ULong)w0;
    return (upper << 32) | lower;
 }

 /* 16-bit lane 3 of w64: the upper half of its high 32 bits
    (relies on toUInt/toUShort narrowing to the named width). */
 static inline UShort sel16x4_3 ( ULong w64 ) {
    return toUShort(toUInt(w64 >> 32) >> 16);
 }
 /* 16-bit lane 2 of w64: the lower half of its high 32 bits
    (relies on toUInt/toUShort narrowing to the named width). */
 static inline UShort sel16x4_2 ( ULong w64 ) {
    return toUShort(toUInt(w64 >> 32));
 }
 /* 16-bit lane 1 of w64: the upper half of its low 32 bits
    (relies on toUInt/toUShort narrowing to the named width). */
 static inline UShort sel16x4_1 ( ULong w64 ) {
    return toUShort(toUInt(w64) >> 16);
 }
 /* 16-bit lane 0 of w64: the lower half of its low 32 bits
    (relies on toUInt/toUShort narrowing to the named width). */
 static inline UShort sel16x4_0 ( ULong w64 ) {
    return toUShort(toUInt(w64));
 }

 /* Byte lane 7 of w64: high 32 bits, shifted right by 24
    (relies on toUInt/toUChar narrowing to the named width). */
 static inline UChar sel8x8_7 ( ULong w64 ) {
    return toUChar(toUInt(w64 >> 32) >> 24);
 }
 /* Byte lane 6 of w64: high 32 bits, shifted right by 16
    (relies on toUInt/toUChar narrowing to the named width). */
 static inline UChar sel8x8_6 ( ULong w64 ) {
    return toUChar(toUInt(w64 >> 32) >> 16);
 }
 /* Byte lane 5 of w64: high 32 bits, shifted right by 8
    (relies on toUInt/toUChar narrowing to the named width). */
 static inline UChar sel8x8_5 ( ULong w64 ) {
    return toUChar(toUInt(w64 >> 32) >> 8);
 }
 /* Byte lane 4 of w64: high 32 bits, unshifted
    (relies on toUInt/toUChar narrowing to the named width). */
 static inline UChar sel8x8_4 ( ULong w64 ) {
    return toUChar(toUInt(w64 >> 32));
 }
 /* Byte lane 3 of w64: low 32 bits, shifted right by 24
    (relies on toUInt/toUChar narrowing to the named width). */
 static inline UChar sel8x8_3 ( ULong w64 ) {
    return toUChar(toUInt(w64) >> 24);
 }
 /* Byte lane 2 of w64: low 32 bits, shifted right by 16
    (relies on toUInt/toUChar narrowing to the named width). */
 static inline UChar sel8x8_2 ( ULong w64 ) {
    return toUChar(toUInt(w64) >> 16);
 }
 /* Byte lane 1 of w64: low 32 bits, shifted right by 8
    (relies on toUInt/toUChar narrowing to the named width). */
 static inline UChar sel8x8_1 ( ULong w64 ) {
    return toUChar(toUInt(w64) >> 8);
 }
 /* Byte lane 0 of w64: low 32 bits, unshifted
    (relies on toUInt/toUChar narrowing to the named width). */
 static inline UChar sel8x8_0 ( ULong w64 ) {
    return toUChar(toUInt(w64));
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* MMX PMADDWD.  Treats xx and yy as four signed 16-bit lanes each,
    multiplies corresponding lanes, and adds the products pairwise:
    lanes 3 and 2 give the upper 32 bits of the result, lanes 1 and 0
    give the lower 32 bits. */
 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
 {
    /* Each signed 16x16 product has magnitude at most 2^30, so it
       always fits in an Int. */
    Int p3 = ((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy));
    Int p2 = ((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy));
    Int p1 = ((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy));
    Int p0 = ((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy));

    /* Add each pair as UInt.  When all four inputs of a pair are
       0x8000 the sum is 2^31, which overflows a signed Int (undefined
       behaviour in C).  Unsigned addition wraps to 0x80000000, which
       is the wraparound result documented for PMADDWD.  For every
       other input the bit pattern is the same as the signed sum. */
    return mk32x2( (UInt)p3 + (UInt)p2,
                   (UInt)p1 + (UInt)p0 );
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Gather the top bit of each of the eight bytes of xx into an 8-bit
    mask: bit i of the result is bit 7 of byte i of xx.  The other
    result bits are zero. */
 ULong amd64g_calculate_mmx_pmovmskb ( ULong xx )
 {
    ULong mask = 0;
    ULong lane;
    for (lane = 0; lane < 8; lane++) {
       if ((xx >> (8 * lane + 7)) & 1)
          mask |= (1ULL << lane);
    }
    return mask;
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* Sum of absolute differences: adds up abdU8 of the eight
    corresponding byte lanes of xx and yy and returns the low 16 bits
    of that sum, zero-extended to 64 bits. */
 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
 {
    UInt sum
       = (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) )
         + (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) )
         + (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) )
         + (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) )
         + (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) )
         + (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) )
         + (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) )
         + (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
    return (ULong)(sum & 0xFFFF);
 }

 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 /* 128-bit byte mask built from two 64-bit halves: the 8-bit MMX
    mask of w64hi goes into result bits 15..8 and the 8-bit MMX mask
    of w64lo into bits 7..0. */
 ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
 {
    ULong mask = amd64g_calculate_mmx_pmovmskb ( w64hi ) & 0xFF;
    mask <<= 8;
    mask |= amd64g_calculate_mmx_pmovmskb ( w64lo ) & 0xFF;
    return mask;
 }


 /*---------------------------------------------------------------*/
 /*--- Helpers for dealing with, and describing,               ---*/
 /*--- guest state as a whole.                                 ---*/
 /*---------------------------------------------------------------*/

 /* Initialise the entire amd64 guest state. */
 /* VISIBLE TO LIBVEX CLIENT */
 /* Zeroes the integer registers, RIP, the flags thunk operands and
    the XMM registers; sets the flags thunk op to AMD64G_CC_OP_COPY,
    the direction flag to 1, the SSE rounding mode to Irrm_NEAREST and
    the emulation warning to EmWarn_NONE; and hands the FPU state to
    amd64g_dirtyhelper_FINIT. */
 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
 {
    VexGuestAMD64State* gst = vex_state;

    /* Clear all four elements of one XMM register. */
 #  define ZERO_XMM(_reg)                                   \
       do { (_reg)[0] = 0; (_reg)[1] = 0;                   \
            (_reg)[2] = 0; (_reg)[3] = 0;                   \
       } while (0)

    /* Integer registers. */
    gst->guest_RAX = 0;
    gst->guest_RCX = 0;
    gst->guest_RDX = 0;
    gst->guest_RBX = 0;
    gst->guest_RSP = 0;
    gst->guest_RBP = 0;
    gst->guest_RSI = 0;
    gst->guest_RDI = 0;
    gst->guest_R8  = 0;
    gst->guest_R9  = 0;
    gst->guest_R10 = 0;
    gst->guest_R11 = 0;
    gst->guest_R12 = 0;
    gst->guest_R13 = 0;
    gst->guest_R14 = 0;
    gst->guest_R15 = 0;

    /* Flags thunk. */
    gst->guest_CC_OP   = AMD64G_CC_OP_COPY;
    gst->guest_CC_DEP1 = 0;
    gst->guest_CC_DEP2 = 0;
    gst->guest_CC_NDEP = 0;

    gst->guest_DFLAG   = 1; /* forwards */
    gst->guest_IDFLAG  = 0;

    /* HACK: represent the offset associated with %fs==0. This
       assumes that %fs is only ever zero. */
    gst->guest_FS_ZERO = 0;

    gst->guest_RIP = 0;

    /* Simulated FPU. */
    amd64g_dirtyhelper_FINIT( gst );

    /* SSE state. */
    gst->guest_SSEROUND = (ULong)Irrm_NEAREST;
    ZERO_XMM(gst->guest_XMM0);
    ZERO_XMM(gst->guest_XMM1);
    ZERO_XMM(gst->guest_XMM2);
    ZERO_XMM(gst->guest_XMM3);
    ZERO_XMM(gst->guest_XMM4);
    ZERO_XMM(gst->guest_XMM5);
    ZERO_XMM(gst->guest_XMM6);
    ZERO_XMM(gst->guest_XMM7);
    ZERO_XMM(gst->guest_XMM8);
    ZERO_XMM(gst->guest_XMM9);
    ZERO_XMM(gst->guest_XMM10);
    ZERO_XMM(gst->guest_XMM11);
    ZERO_XMM(gst->guest_XMM12);
    ZERO_XMM(gst->guest_XMM13);
    ZERO_XMM(gst->guest_XMM14);
    ZERO_XMM(gst->guest_XMM15);

 #  undef ZERO_XMM

    gst->guest_EMWARN = EmWarn_NONE;

    /* These should not ever be either read or written, but we
       initialise them anyway. */
    gst->guest_TISTART = 0;
    gst->guest_TILEN   = 0;

    gst->guest_NRADDR = 0;
 }


 /* Figure out if any part of the guest state contained in minoff
    .. maxoff requires precise memory exceptions.  If in doubt return
    True (but this generates significantly slower code).

    By default we enforce precise exns for guest %RSP, %RBP and %RIP
    only.  These are the minimum needed to extract correct stack
    backtraces from amd64 code.

    minoff and maxoff are inclusive byte offsets into
    VexGuestAMD64State.  Returns True iff that range overlaps the
    8 bytes of any of the three registers.
 */
 Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
                                                    Int maxoff)
 {
    /* Offset of the first byte of each register; each is treated as
       occupying bytes lo .. lo+7. */
    Int rbp_lo = offsetof(VexGuestAMD64State, guest_RBP);
    Int rsp_lo = offsetof(VexGuestAMD64State, guest_RSP);
    Int rip_lo = offsetof(VexGuestAMD64State, guest_RIP);

    /* Two inclusive ranges overlap iff each one starts no later than
       the other one ends. */
    if (minoff <= rbp_lo + 7 && maxoff >= rbp_lo)
       return True;   /* touches rbp */

    if (minoff <= rsp_lo + 7 && maxoff >= rsp_lo)
       return True;   /* touches rsp */

    if (minoff <= rip_lo + 7 && maxoff >= rip_lo)
       return True;   /* touches rip */

    return False;
 }


 /* Expands to one alwaysDefd entry for a guest-state field:
    { byte offset of the field, size of the field in bytes }. */
 #define ALWAYSDEFD(_fld)                              \
     { offsetof(VexGuestAMD64State, _fld),             \
       sizeof(((VexGuestAMD64State*)0)->_fld) }

 /* Layout description of the amd64 guest state: its total size,
    where the stack and instruction pointers live, and which fields
    are listed as always defined. */
 VexGuestLayout amd64guest_layout = {
    /* Total size of the guest state, in bytes. */
    .total_sizeB = sizeof(VexGuestAMD64State),

    /* Stack pointer. */
    .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
    .sizeof_SP = 8,

    /* Instruction pointer. */
    .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
    .sizeof_IP = 8,

    /* Number of entries in .alwaysDefd below: the sections to be
       regarded by Memcheck as 'always-defined'. */
    .n_alwaysDefd = 14,

    /* flags thunk: OP and NDEP are always defd, whereas DEP1
       and DEP2 have to be tracked.  See detailed comment in
       gdefs.h on meaning of thunk fields. */
    .alwaysDefd = {
       /*  0 */ ALWAYSDEFD(guest_CC_OP),
       /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
       /*  2 */ ALWAYSDEFD(guest_DFLAG),
       /*  3 */ ALWAYSDEFD(guest_IDFLAG),
       /*  4 */ ALWAYSDEFD(guest_RIP),
       /*  5 */ ALWAYSDEFD(guest_FS_ZERO),
       /*  6 */ ALWAYSDEFD(guest_FTOP),
       /*  7 */ ALWAYSDEFD(guest_FPTAG),
       /*  8 */ ALWAYSDEFD(guest_FPROUND),
       /*  9 */ ALWAYSDEFD(guest_FC3210),
       // /* */ ALWAYSDEFD(guest_CS),
       // /* */ ALWAYSDEFD(guest_DS),
       // /* */ ALWAYSDEFD(guest_ES),
       // /* */ ALWAYSDEFD(guest_FS),
       // /* */ ALWAYSDEFD(guest_GS),
       // /* */ ALWAYSDEFD(guest_SS),
       // /* */ ALWAYSDEFD(guest_LDT),
       // /* */ ALWAYSDEFD(guest_GDT),
       /* 10 */ ALWAYSDEFD(guest_EMWARN),
       /* 11 */ ALWAYSDEFD(guest_SSEROUND),
       /* 12 */ ALWAYSDEFD(guest_TISTART),
       /* 13 */ ALWAYSDEFD(guest_TILEN)
    }
 };


 /*---------------------------------------------------------------*/
 /*--- end                              guest-amd64/ghelpers.c ---*/
 /*---------------------------------------------------------------*/